Using the Scrapy framework to batch-download the matplotlib example scripts to the local machine and sort them into folders by category. Without further ado, here is the code.
First, the spider code:
import scrapy
from scrapy.linkextractors import LinkExtractor

from ..items import MatplotlibExamplesItem


class MatExamplesSpider(scrapy.Spider):
    name = 'mat_examples'
    # allowed_domains = ['matplotlib.org']
    start_urls = ['https://matplotlib.org/gallery/index.html']

    def parse(self, response):
        # Extract the link to every example page listed on the gallery index page.
        le = LinkExtractor(restrict_xpaths='//span[contains(@class, "caption-text")]/a[contains(@class, "reference internal")]')
        links = le.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_mat)

    def parse_mat(self, response):
        # On each example page, grab the source-code download link and hand its
        # absolute URL to the item, so the files pipeline can fetch the script.
        href = response.xpath('//div[contains(@class, "docutils container")]/a/@href').extract_first()
        # print('href:', href)
        url = response.urljoin(href)
        # print('url:', url)
        example = MatplotlibExamplesItem()
        example['file_urls'] = [url]
        return example
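The spider imports MatplotlibExamplesItem from the project's items.py, which is not shown above. Below is a minimal sketch of what that item needs to contain for Scrapy's built-in FilesPipeline: the field names file_urls and files are the ones FilesPipeline expects, and the class name matches the import in the spider; the rest of the original items.py is an assumption.

# items.py -- minimal sketch of the item the spider imports
import scrapy


class MatplotlibExamplesItem(scrapy.Item):
    # FilesPipeline reads download URLs from 'file_urls' and records
    # the download results (path, checksum, url) in 'files'.
    file_urls = scrapy.Field()
    files = scrapy.Field()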
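The post also mentions sorting the downloaded scripts into folders, which the spider alone does not do. One way to get that behaviour, sketched below on the assumption that each download URL path ends in <category>/<script>.py, is to subclass FilesPipeline and override file_path() so the category becomes a sub-directory; the class name ExamplesFilesPipeline, the project module name matplotlib_examples, and the FILES_STORE value are illustrative choices, not taken from the original project.

# pipelines.py -- sketch of a folder-per-category download pipeline;
# assumes the download URL path looks like .../<category>/<script>.py
from os.path import basename, dirname, join
from urllib.parse import urlparse

from scrapy.pipelines.files import FilesPipeline


class ExamplesFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # Use the parent directory of the URL path as the sub-folder name,
        # so e.g. .../lines_bars_and_markers/simple_plot.py is saved as
        # lines_bars_and_markers/simple_plot.py under FILES_STORE.
        path = urlparse(request.url).path
        return join(basename(dirname(path)), basename(path))

# settings.py -- enable the custom pipeline and pick a download directory
ITEM_PIPELINES = {'matplotlib_examples.pipelines.ExamplesFilesPipeline': 1}
FILES_STORE = 'examples_src'

With those pieces in place, running scrapy crawl mat_examples should download every gallery script into per-category folders under examples_src.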