因为有在北京租房的打算,于是上网浏览了一下链家网站的房价信息,想将它们爬取下来,并保存到本地。
先看链家网的源码:房价信息都保存在 ul 下的 li 里面。
爬虫结构:
其中封装了一个数据库处理模块,还有一个 user-agent 池。
先看mylianjia.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import LianjiaItem
from scrapy.http import Request
from parsel import Selector
import requests
import os


class MylianjiaSpider(scrapy.Spider):
    """Spider that walks paginated Lianjia listing pages and yields one item per listing.

    Each entry of ``start_urls`` is treated as a URL prefix; the page number
    is appended directly to it (``.../pg1``, ``.../pg2``, ...).
    """

    name = 'mylianjia'
    start_urls = ['https://bj.lianjia.com/ershoufang/chaoyang/pg']

    # Number of listing pages requested per URL prefix (pages 1..page_count).
    page_count = 100

    def start_requests(self):
        """Yield one ``Request`` per listing page, handled by :meth:`parse`.

        Pages are numbered from 1 to ``page_count`` inclusive for every
        prefix in ``start_urls``.
        """
        # int() so that a value supplied as a string (e.g. a spider argument)
        # still works as a range bound.
        last_page = int(self.page_count)
        for base_url in self.start_urls:
            for page in range(1, last_page + 1):
                yield Request(base_url + str(page), self.parse)

    def parse(self, response):
        """Extract every listing on a result page into a ``LianjiaItem``.

        The listings are the ``<li>`` elements selected by a positional
        XPath, so the selectors are tied to the page layout at the time this
        was written.  Fields whose node is missing are stored as ``None``
        (``extract_first()``) or as an empty string (``housexiangxi``).

        Yields:
            LianjiaItem: with keys ``houseinfo``, ``houseurl``,
            ``housedizhi``, ``housexiangxi``, ``houseprice`` and
            ``houseperprice``.
        """
        print(response.url)

        for listing in response.xpath("//div[4]/div[1]/ul/li"):
            item = LianjiaItem()

            # Text and link of the anchor in the first inner div
            # (presumably the listing title and its detail-page URL).
            item['houseinfo'] = listing.xpath('div/div[1]/a/text()').extract_first()
            item['houseurl'] = listing.xpath('div/div[1]/a/@href').extract_first()

            # Anchor text in the second inner div (presumably the address /
            # community name -- "dizhi"), followed by all loose text nodes
            # next to it, concatenated into a single string ("xiangxi").
            item['housedizhi'] = listing.xpath('div/div[2]/div/a/text()').extract_first()
            detail_parts = listing.xpath('div/div[2]/div/text()').extract()
            item['housexiangxi'] = ''.join(detail_parts)

            # First <span> under each of the two price containers
            # (presumably total price and unit price).
            item['houseprice'] = listing.xpath(
                'div/div[4]/div[2]/div/span/text()').extract_first()
            item['houseperprice'] = listing.xpath(
                'div/div[4]/div[2]/div[2]/span/text()').extract_first()

            yield item