设置全局设置
# settings.py -- only the settings that need to be changed are shown.

# Do not check robots.txt before sending requests.
ROBOTSTXT_OBEY = False

# Default headers sent with the spider's requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0',
}

# 2. Define the data structure: ./Maoyan/items.py
class MaoyanItem(scrapy.Item):
    """Fields collected for one movie on the Maoyan board."""

    # Movie title.
    name = scrapy.Field()
    # Leading actors.
    star = scrapy.Field()
    # Release time.
    time = scrapy.Field()


# 3. Define the spider: ./Maoyan/Spiders/maoyan.py
# -*- coding: utf-8 -*-
import scrapy

from ..items import MaoyanItem


class MaoyanSpider(scrapy.Spider):
    """Crawl the Maoyan board page by page (offset 0, 10, ..., 90)."""

    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    # Starting URL, i.e. the address of the first page.
    start_urls = ['https://maoyan.com/board/4?offset=0']
    # Offset of the page being parsed. parse() advances it by 10 per page;
    # it must be initialised here, otherwise `self.offset += 10` raises
    # AttributeError on the first page.
    offset = 0

    def parse(self, response):
        """Yield one MaoyanItem per movie, then a Request for the next page."""
        # Base XPath: one <dd> node per movie.
        dd_list = response.xpath('//dl[@class="board-wrapper"]/dd')
        for dd in dd_list:
            # Item class defined in ./Maoyan/items.py
            item = MaoyanItem()
            # default='' keeps .strip() from failing on None when a node
            # is missing from the page.
            # Movie title
            item['name'] = dd.xpath('./a/@title').extract_first(default='').strip()
            # Leading actors
            item['star'] = dd.xpath(
                './/p[@class="star"]/text()').extract_first(default='').strip()
            # Release time
            item['time'] = dd.xpath(
                './/p[@class="releasetime"]/text()').extract_first(default='').strip()
            # Hand the scraped data to the item pipelines.
            yield item

        # Move on to the next page once every movie on this page is yielded.
        self.offset += 10
        if self.offset <= 90:
            # Build the next page's URL and hand the request to the scheduler.
            url = 'https://maoyan.com/board/4?offset={}'.format(self.offset)
            yield scrapy.Request(url=url, callback=self.parse)


# Note on `yield`: a function that contains the `yield` keyword is treated by
# Python as a generator. When execution reaches a `yield`, the value (an item
# for the pipelines, or a Request for the scheduler) is handed over and the
# function pauses at that point, waiting for the next call. The next call
# resumes from where it stopped, and so on until the function ends; the
# StopIteration raised at that point is caught by Scrapy as well.
4. 定义管道文件./Maoyan/pipelines.py
class MaoyanPipeline(object):
    """Item pipeline that prints every scraped movie to standard output."""

    def process_item(self, item, spider):
        """Print the item's name, star and time, then pass the item on.

        Args:
            item: mapping-like object with 'name', 'star' and 'time' keys.
            spider: the spider that produced the item (not used here).

        Returns:
            The same item, unchanged.

        Raises:
            KeyError: if one of the three keys is missing from the item.
        """
        print(item['name'])
        print(item['star'])
        print(item['time'])
        return item


# Enabling the pipeline requires an entry in settings.py.
ITEM_PIPELINES = {
    'Maoyan.pipelines.MaoyanPipeline': 300,
}
# 300 is the pipeline's order value. By convention these values are chosen in
# the 0-1000 range; the smaller the number, the earlier the pipeline runs.
5. 小结:以上方法已经足以爬取需要的数据。但是在运行过程中,URL入队列后又出队列,解析完一页、执行到yield之后才把下一页的URL交给调度器,如此往复循环,请求实际上是逐页串行发出的,并没有发挥Scrapy异步并发的优势。因此,需要修改代码来充分地利用并发能力。
5.1 法一:修改maoyan.py
原理:将所有的URL全部交给调度器
# -*- coding: utf-8 -*-
import scrapy

from ..items import MaoyanItem


class MaoyanSpider(scrapy.Spider):
    """Crawl the Maoyan board by handing every page URL to the scheduler at once."""

    name = 'maoyan'
    allowed_domains = ['maoyan.com']
    # Starting URL, i.e. the address of the first page.
    start_urls = ['https://maoyan.com/board/4?offset=0']
    # Not read by this version of the spider; kept so the class attributes
    # stay the same as before.
    offset = 0

    def parse(self, response):
        """Yield one Request per board page (offset 0, 10, ..., 90).

        The response itself is not inspected; this callback only schedules
        the page requests, which are parsed by parse_html().
        """
        for offset in range(0, 91, 10):
            url = 'https://maoyan.com/board/4?offset={}'.format(offset)
            # Hand the address to the scheduler's queue.
            yield scrapy.Request(url=url, callback=self.parse_html)

    def parse_html(self, response):
        """Yield one MaoyanItem per movie found on the page."""
        # Base XPath: matches the list of per-movie <dd> selector nodes.
        dd_list = response.xpath('//dl[@class="board-wrapper"]/dd')
        for dd in dd_list:
            item = MaoyanItem()
            # dd.xpath(...) returns a list of selectors; .extract() serialises
            # all of them to strings and .extract_first() takes the first one.
            # default='' avoids an error when a node is missing, and every
            # field is stripped the same way.
            item['name'] = dd.xpath('./a/@title').extract_first(default='').strip()
            item['star'] = dd.xpath(
                './/p[@class="star"]/text()').extract_first(default='').strip()
            item['time'] = dd.xpath(
                './/p[@class="releasetime"]/text()').extract_first(default='').strip()
            yield item


# There is a drawback here: parse() is meant for parsing responses, but in this
# version it only hands out URLs, so it is not put to its proper use. The
# spider is therefore revised once more.
5.2 法二:既然离不开start_urls,那么查看其源码可知,该变量是在start_requests()方法中被使用的。
以下为start_requests()源码:
def start_requests(self):
    # Quoted Scrapy source: builds the spider's initial requests from start_urls.
    cls = self.__class__
    if method_is_overridden(cls, Spider, 'make_requests_from_url'):
        # The spider overrides the deprecated hook: warn, but still use it.
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated; it "
            "won't be called in future Scrapy releases. Please "
            "override Spider.start_requests method instead (see %s.%s)." % (
                cls.__module__, cls.__name__
            ),
        )
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
    else:
        # dont_filter=True: the start URLs bypass the duplicate filter.
        for url in self.start_urls:
            yield Request(url, dont_filter=True)


# This method can therefore be overridden: it is enough to define
# start_requests() again in the spider class. A method defined on the subclass
# takes precedence over the inherited one, so the spider's own
# start_requests() is the one that runs.
def start_requests(self):
    """Yield one Request per board page (offset 0, 10, ..., 90).

    Each response is delivered to self.parse_html.
    """
    for offset in range(0, 91, 10):
        url = 'https://maoyan.com/board/4?offset={}'.format(offset)
        # Hand the request to the scheduler.
        yield scrapy.Request(url=url, callback=self.parse_html)