Web Scraping with the Scrapy Framework (5)

spiderName.py

# -*- coding: utf-8 -*-
import scrapy
from selenium import webdriver
from wangyiPro.items import WangyiproItem


class WangyiSpider(scrapy.Spider):
    name = 'wangyi'
    # allowed_domains = ['www.xx.com']
    start_urls = ['https://news.163.com/']
    # URLs of the 5 news-section pages
    model_urls = []
    # Instantiate a browser object shared by the whole spider
    bro = webdriver.Chrome(executable_path=r'D:\Reptile\jupyter\onceagain\爬虫\Scrap框架\chromedriver.exe')

    # Data parsing: extract the URLs of the 5 news sections
    def parse(self, response):
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[1]/div[2]/div[2]/div[2]/div[2]/div/ul/li')
        index = [3, 4, 6, 7, 8]
        for i in index:
            model_url = li_list[i].xpath('./a/@href').extract_first()
            self.model_urls.append(model_url)
        # Request each section URL to capture its page source
        for url in self.model_urls:
            yield scrapy.Request(url=url, callback=self.parse_model)

    # Parse the title and the URL of each news detail page
    def parse_model(self, response):
        div_list = response.xpath('/html/body/div[1]/div[3]/div[4]/div[1]/div/div/ul/li/div/div')
        for div in div_list:
            title = div.xpath('./a/img/@alt').extract_first()
            new_detail_url = div.xpath('./a/@href').extract_first()
            if new_detail_url:
                item = WangyiproItem()
                item['title'] = title
                # Request the news detail page and parse out the article body
                yield scrapy.Request(url=new_detail_url, callback=self.parse_new_detail, meta={'item': item})

    # Parse the news content
    def parse_new_detail(self, response):
        item = response.meta['item']
        content = response.xpath('//*[@id="endText"]//text()').extract()
        content = ''.join(content)
        # print(content)
        item['content'] = content
        yield item

    # Called once when the whole crawl finishes (overrides the parent-class hook)
    def closed(self, spider):
        self.bro.quit()
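The spider above yields WangyiproItem objects, but this part of the series does not show the pipeline that receives them. Below is a minimal pipelines.py sketch, not the original author's code: the class name follows Scrapy's default project template, and writing title and content to a news.txt file is an illustrative assumption.

# pipelines.py -- a minimal sketch; the output file name is an assumption
class WangyiproPipeline(object):
    fp = None

    def open_spider(self, spider):
        # Open the output file once when the spider starts
        self.fp = open('./news.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Persist each news record as "title: content"
        self.fp.write('%s: %s\n' % (item['title'], item['content']))
        return item

    def close_spider(self, spider):
        # Close the file once when the spider ends
        self.fp.close()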

middlewares.py

from time import sleep
from scrapy.http import HtmlResponse


class WangyiproDownloaderMiddleware(object):
    # Intercept responses and tamper with the data of selected response objects
    def process_response(self, request, response, spider):
        # Get the URLs of the 5 news sections
        model_urls = spider.model_urls
        bro = spider.bro
        if request.url in model_urls:
            # If this condition holds, the response located here belongs to one of the
            # section pages, whose data must be tampered with
            # (the body parameter is the response data)
            bro.get(request.url)
            sleep(1)
            page_text = bro.page_source
            # Return the rendered page source as the new response data,
            # which now contains the dynamically loaded content
            return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
        else:
            return response
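For the middleware (and the pipeline sketched above) to take effect, they must be enabled in the project settings. A minimal settings.py sketch; the priorities follow Scrapy's template defaults, while the user-agent string and the pipeline class name are illustrative assumptions:

# settings.py -- a minimal sketch; values are illustrative
BOT_NAME = 'wangyiPro'

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'  # assumed UA
ROBOTSTXT_OBEY = False
LOG_LEVEL = 'ERROR'

# Enable the downloader middleware that swaps in the Selenium-rendered page source
DOWNLOADER_MIDDLEWARES = {
    'wangyiPro.middlewares.WangyiproDownloaderMiddleware': 543,
}

# Enable the item pipeline that persists the items (class name assumed from the default template)
ITEM_PIPELINES = {
    'wangyiPro.pipelines.WangyiproPipeline': 300,
}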

items.py

import scrapy


class WangyiproItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()

Crawling Large Binary Data with Scrapy

"Large" data here means binary data of large volume, such as images, archives, audio, and video...

In the spider file, crawl and parse the URLs of the binary resources, store them in an item, and submit it to the pipeline.

In the pipeline file, define the corresponding pipeline class.

Parent class: from scrapy.pipelines.images import ImagesPipeline
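A minimal sketch of such a pipeline class; the class name ImgPipeline and the item's src field are illustrative assumptions, while the three overridden methods are the standard ImagesPipeline hooks:

# pipelines.py -- a minimal sketch; ImgPipeline and the 'src' field are assumptions
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class ImgPipeline(ImagesPipeline):
    # Issue a request for the binary resource URL stored in the item
    def get_media_requests(self, item, info):
        yield scrapy.Request(item['src'])

    # Decide the file name; the folder itself comes from the IMAGES_STORE setting
    def file_path(self, request, response=None, info=None, *, item=None):
        return request.url.split('/')[-1]

    # Hand the item on to the next pipeline class, if any
    def item_completed(self, results, item, info):
        return item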

In the settings file, make the following change:

# The specified folder is created automatically
IMAGES_STORE = './imgLib'
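The custom pipeline class must also be registered in the settings; a hedged example, assuming the project and class names used in the sketch above:

ITEM_PIPELINES = {
    'imgPro.pipelines.ImgPipeline': 300,
}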

Case study: crawling xiaohua (campus beauty) images.
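The original post only names this case; below is a minimal spider sketch for it, in which the start URL and the XPath expressions are placeholders rather than the real site:

# -*- coding: utf-8 -*-
# A minimal sketch for the case study; the start URL and XPaths are placeholder assumptions
import scrapy
from imgPro.items import ImgproItem  # assumed item with a single src = scrapy.Field()


class ImgSpider(scrapy.Spider):
    name = 'img'
    start_urls = ['https://www.example.com/xiaohua/']  # placeholder listing page

    def parse(self, response):
        # Placeholder XPath: each <li> is assumed to hold one image
        li_list = response.xpath('//ul/li')
        for li in li_list:
            src = li.xpath('./a/img/@src').extract_first()
            if src:
                item = ImgproItem()
                item['src'] = response.urljoin(src)  # make relative URLs absolute
                yield item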
