Scheduler configuration
# Add a dedupe container class: it uses a Redis set to store request fingerprints,
# which makes request deduplication persistent
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# Use the scheduler provided by the scrapy-redis component
SCHEDULER = "scrapy_redis.scheduler.Scheduler"

# Whether the scheduler persists its state: when the spider finishes, should the request
# queue and the dedupe set in Redis be kept? True means persistent storage (do not clear)
SCHEDULER_PERSIST = True
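The snippet above only configures the dedupe filter and the scheduler. scrapy-redis also ships a RedisPipeline that writes scraped items into Redis; if the items should be shared the same way as the request queue, a typical optional addition to the settings is the following sketch (the priority value 400 is just a conventional choice, not something this project requires):

# Optional: persist scraped items into Redis via the pipeline bundled with scrapy-redis
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}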
Configure the Redis connection

REDIS_HOST = 'IP address of the Redis server'
REDIS_PORT = 6379
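If the Redis server requires a password (the incremental example at the end of this section connects with one), extra connection parameters can be supplied as well; a minimal sketch, assuming scrapy-redis's REDIS_PARAMS setting:

# Only needed when Redis has authentication enabled (the password here is a placeholder)
REDIS_PARAMS = {'password': 'your-redis-password'}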
Modify the Redis configuration file (redis.windows.conf)

Line 56: # bind 127.0.0.1  (comment out the bind directive so remote clients can connect)
Line 75: protected-mode no  (turn off protected mode so non-local connections are accepted)
Start the Redis server and client
Start Redis with its configuration file: open a cmd window in the Redis installation directory and run
redis-server.exe redis.windows.conf
Start the client
redis-cli
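When the Redis server runs on another machine (the usual case for a shared queue), the client can target it with redis-cli's standard host and port options; the IP below is only a placeholder:

redis-cli -h 192.168.1.100 -p 6379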
Start the program
In a terminal, change into the directory that contains the spider file
scrapy runspider spiderName.py
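For instance, with the example spider shown below (name = 'fbs'), and assuming its file is named fbs.py, the command would be:

scrapy runspider fbs.py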
Push a start URL into the scheduler's queue
redis-cli
lpush <value of the redis_key attribute (the name of the shared scheduler queue)> <start URL>
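With the example spider below, whose redis_key is 'sunQueue', and reusing the portal URL from the incremental example as the start page, this could look like:

lpush sunQueue http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1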
Example code

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from fbsPro.items import FbsproItem


class FbsSpider(RedisCrawlSpider):
    name = 'fbs'
    # allowed_domains = ['www.xxx.com']
    # start_urls = ['http://www.xxx.com/']
    redis_key = 'sunQueue'  # name of the shared scheduler queue
    rules = (
        Rule(LinkExtractor(allow=r'id=1&page=\d+'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            title = li.xpath('./span[3]/a/text()').extract_first()
            item = FbsproItem()
            item['title'] = title
            yield item
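The spider imports FbsproItem from fbsPro.items; that file is not shown here, but since only item['title'] is filled in, a minimal sketch of it could be:

# fbsPro/items.py (sketch; only the 'title' field is implied by the spider above)
import scrapy

class FbsproItem(scrapy.Item):
    title = scrapy.Field()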
Incremental crawling

Monitor a website for data updates so that only the newly published data is crawled.
Core idea: deduplication.
Record table:
Property: permanent storage (a set in Redis)
Stores the URLs of the data that has already been crawled
A URL can be stored in plaintext (when the URL is fairly short)
Alternatively, a data fingerprint can be generated for the recorded data (when the URL/data is fairly long); see the sketch after this list
The data fingerprint is the unique identifier of that piece of data
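A minimal sketch of generating such a fingerprint with Python's standard hashlib (the helper name and the choice of SHA-1 are illustrative, not taken from the original):

import hashlib

def make_fingerprint(data: str) -> str:
    # Hash the record (e.g. a long URL, or several fields joined together) into a
    # fixed-length hex digest; the digest is what gets stored in the Redis set
    # instead of the raw data, and it uniquely identifies that record
    return hashlib.sha1(data.encode('utf-8')).hexdigest()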
Example code

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from redis import Redis
from zlsPro.items import ZlsproItem


class ZlsSpider(CrawlSpider):
    name = 'zls'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']
    # Create the Redis connection object
    conn = Redis(host="127.0.0.1", port=6379, password='Redis password; omit this argument if none is set')
    rules = (
        Rule(LinkExtractor(allow=r'id=1&page=\d+'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Parse the title and the detail-page URL (the detail-page URL is what gets stored in the record table)
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            title = li.xpath('./span[3]/a/text()').extract_first()
            detail_url = "http://wz.sun0769.com/" + li.xpath('./span[3]/a/@href').extract_first()
            item = ZlsproItem()
            item['title'] = title
            # Check the detail-page URL against the record table (a Redis set):
            # sadd returns 1 if the URL was not yet recorded, 0 if it already was
            ex = self.conn.sadd('urls', detail_url)
            if ex == 1:
                print('New data, crawling it')
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
            else:
                print('Data not updated, nothing to crawl')

    def parse_detail(self, response):
        item = response.meta['item']
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        item['content'] = content
        yield item
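As with the first example, the items.py for this project is not shown; since the spider fills in item['title'] and item['content'], a plausible sketch is:

# zlsPro/items.py (sketch; fields inferred from the spider above)
import scrapy

class ZlsproItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()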