Data scraping
1.1 Fetch rental-listing data and build a DataFrame
from urllib import request
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Define a class for scraping rental listings
class Spider():
    """Regex patterns for the fields scraped from each rental-listing snippet.

    Raw strings are used so that sequences such as ``\\s`` and ``\\d`` are not
    treated as (invalid) string escapes — plain strings here trigger a
    SyntaxWarning on Python 3.12+. The pattern *values* are unchanged.
    """
    root_pattern = r'<div>([\s\S]*?)</div>'        # one <div> per listing snippet
    way_pattern = r'html">([\s\S]*?)·'             # rental type, text between URL and '·'
    num_pattern = r'zufang/([\s\S]*?).html'        # listing id taken from the URL path
    name_pattern = r'·([\s\S]*?) \d'               # name: after '·', up to space+digit
    price_pattern = r'<em>([\s\S]*?)元/月'          # monthly price (yuan/month)
    #district_pattern = r'/heping/">([\s\S]*?)</a>'
    area_pattern = r'<i>/</i>\s([\s\S]*?)"'        # floor area, after the '/' separator
# Send the HTTP request and pull the raw HTML from the site
def fetch_content(self, url=None):
    """Fetch a listing page and return its HTML as text.

    Args:
        url: Page URL to fetch. Defaults to the module-level ``web``
            global (defined elsewhere in the file) for backward
            compatibility with existing callers.

    Returns:
        The response body decoded as UTF-8.
    """
    target = web if url is None else url
    # Use a context manager so the connection is always closed —
    # the original left the response object open (resource leak).
    with request.urlopen(target) as response:
        return response.read().decode('utf-8')
def analysis(self,htmls): ##将抓取内容转换为列表,列表中每一个元素为字典,目的为方便转换为dataframe
root_html = re.findall(Spider.root_pattern,htmls)
anchors=[]
for html in root_html:
num = re.findall(Spider.num_pattern,html)
way = re.findall(Spider.way_pattern,html)
name = re.findall(Spider.name_pattern,html)
price = re.findall(Spider.price_pattern,html)
#district = re.findall(Spider.district_pattern,html)
area = re.findall(Spider.area_pattern,html)
anchor = {'num':num,'way':way,'name':name,'price':price,'area':area}
anchors.append(anchor)
return anchors
def refine(self, anchors):
    """Flatten each raw-match dict into a dict of cleaned strings.

    Takes the first match for every field, strips surrounding whitespace,
    and removes leftover markup fragments. Returns a lazy ``map`` over the
    input, exactly like the original implementation.
    """
    def clean(raw):
        area = raw['area'][0].strip()
        # Scrub residual markup/whitespace out of the area field,
        # in the same order the original chained .replace() calls.
        for junk in ('<i>/</i>', ' ', '<spanclass='):
            area = area.replace(junk, '')
        return {
            'num': raw['num'][0].strip(),
            'way': raw['way'][0].strip(),
            'name': raw['name'][0].strip(),
            'price': raw['price'][0].replace('</em>', ''),
            #'district': raw['district'][0].strip(),
            'area': area,
        }
    return map(clean, anchors)
def go(self):#出口
htmls = self.fetch_content()
anchors = self.analysis(htmls)
anchors = list(self.refine(anchors