贝壳找房 带你用爬虫分析租房信息

数据爬取
1.1 爬取房源数据,生成 DataFrame

from urllib import request
import re

import pandas as pd
import numpy as np

# The plotting libraries are only used by the analysis/visualisation section
# of the article (not part of this chunk); import them lazily so the scraper
# itself still runs when they are not installed.
try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:  # pragma: no cover - optional, used later in the article
    plt = None
    sns = None


class Spider():
    """Scrape rental listings (贝壳找房 / Beike) and refine them into records.

    The class attributes are the regular expressions used to cut the raw
    page HTML into per-listing fields.
    """

    # One "<div>...</div>" chunk per listing.
    # NOTE(review): this only matches bare <div> tags with no attributes --
    # confirm it still matches the live page markup.
    root_pattern = r'<div>([\s\S]*?)</div>'
    way_pattern = r'html">([\s\S]*?)·'        # rental type, e.g. 整租 / 合租
    num_pattern = r'zufang/([\s\S]*?).html'   # listing id taken from the URL
    name_pattern = r'·([\s\S]*?) \d'          # community / estate name
    price_pattern = r'<em>([\s\S]*?)元/月'    # monthly price (capture keeps a stray </em>)
    # district_pattern = r'/heping/">([\s\S]*?)</a>'
    area_pattern = r'<i>/</i>\s([\s\S]*?)"'   # floor area

    def fetch_content(self, url=None):
        """Download the listing page and return it decoded as UTF-8 text.

        ``url`` defaults to the module-level ``web`` variable (defined
        outside this chunk) to stay compatible with the original script.
        """
        if url is None:
            url = web  # NOTE(review): ``web`` must be defined at module level
        response = request.urlopen(url)
        return str(response.read(), encoding='utf-8')

    def analysis(self, htmls):
        """Split ``htmls`` into listings; return a list of field dicts.

        Each dict maps a field name to the list of regex captures for one
        listing.  Divs in which any field regex found nothing are skipped:
        the original code kept them and then crashed with IndexError in
        ``refine()`` on ``x[field][0]``.
        """
        anchors = []
        for html in re.findall(self.root_pattern, htmls):
            anchor = {
                'num': re.findall(self.num_pattern, html),
                'way': re.findall(self.way_pattern, html),
                'name': re.findall(self.name_pattern, html),
                'price': re.findall(self.price_pattern, html),
                # 'district': re.findall(self.district_pattern, html),
                'area': re.findall(self.area_pattern, html),
            }
            if all(anchor.values()):  # keep only fully-matched listings
                anchors.append(anchor)
        return anchors

    def refine(self, anchors):
        """Strip markup/whitespace; yield one clean str-valued dict per listing."""
        def clean(x):
            return {
                'num': x['num'][0].strip(),
                'way': x['way'][0].strip(),
                'name': x['name'][0].strip(),
                'price': x['price'][0].replace('</em>', ''),
                # 'district': x['district'][0].strip(),
                'area': (x['area'][0].strip()
                         .replace('<i>/</i>', '')
                         .replace(' ', '')
                         .replace('<spanclass=', '')),
            }
        return map(clean, anchors)

    def go(self):
        """Entry point: fetch the page, parse it, and build a DataFrame."""
        htmls = self.fetch_content()
        anchors = self.analysis(htmls)
        anchors = list(self.refine(anchors))
        # NOTE(review): the source text was truncated mid-statement right
        # here; the surrounding article says this step "generates a
        # DataFrame", so that is what we return -- confirm against the
        # complete original.
        return pd.DataFrame(anchors)

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/zzgysz.html