Crawler Basics (12): Scrapy Framework in Practice with Sina, Baidu Baike and Douban (Part 2)

Project setup and startup

```
scrapy startproject baike
cd baike
scrapy genspider mybaike baike.baidu.com/item/Python/407313
```

2 Project settings configuration

```python
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}

ITEM_PIPELINES = {
    'baike.pipelines.BaikePipeline': 300,
}
```

3 Startup file start.py

```python
import scrapy.cmdline

def main():
    # -o ['json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle']
    scrapy.cmdline.execute(['scrapy', 'crawl', 'mybaike'])

if __name__ == '__main__':
    main()
```

4 Target fields: items.py

```python
import scrapy

class BaikeItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    level1Title = scrapy.Field()
    level2Title = scrapy.Field()
    content = scrapy.Field()
```

5 Spider logic: mybaike.py

```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from baike.items import BaikeItem

class MybaikeSpider(CrawlSpider):
    name = 'mybaike'
    allowed_domains = ['baike.baidu.com']
    start_urls = ['https://baike.baidu.com/item/Python/407313']
    # follow every /item/... link found on a page and parse it with getParse
    rules = [Rule(LinkExtractor(allow=('item/(.*)',)), callback='getParse', follow=True)]

    def getParse(self, response):
        level1Title = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h1/text()")[0].extract()
        level2Title = response.xpath("//dd[@class='lemmaWgt-lemmaTitle-title']/h2/text()")
        if len(level2Title) != 0:
            level2Title = level2Title[0].extract()
        else:
            level2Title = '待编辑'  # placeholder meaning "to be edited"
        contentList = response.xpath("//div[@class='lemma-summary']//text()")
        content = ''
        for c in contentList:
            content += c.extract()
        item = BaikeItem()
        item['level1Title'] = level1Title
        item['level2Title'] = level2Title
        item['content'] = content
        yield item
```

6 Pipeline storage: pipelines.py

```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class BaikePipeline(object):
    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        # open the MySQL connection
        self.conn = pymysql.connect(host='111.230.169.107', user='root', password="20111673",
                                    database='baike', port=3306, charset='utf8')
        # cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        cols, values = zip(*item.items())
        # table name goes inside backticks
        sql = "INSERT INTO `%s`(%s) VALUES (%s)" % \
              ('baike', ','.join(cols), ','.join(['%s'] * len(values)))
        self.cursor.execute(sql, values)
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
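The pipeline writes into a table it never creates, so the `baike` database and a `baike` table with columns matching the item fields must already exist on the MySQL server. A minimal sketch of creating that table with pymysql (the host, password, and column types/lengths here are assumptions, not from the original post):

```python
# Sketch: create the `baike` table BaikePipeline inserts into.
# Column names must match the BaikeItem fields; types and lengths are assumptions.
import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='xxx',
                       database='baike', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `baike` (
            `id` INT AUTO_INCREMENT PRIMARY KEY,
            `level1Title` VARCHAR(255),
            `level2Title` VARCHAR(255),
            `content` TEXT
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```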
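Note how process_item assembles its SQL in two steps: the table and column names are spliced in with ordinary string formatting, while the values stay as %s placeholders that pymysql fills in safely when execute runs. A small standalone illustration (the plain dict below just stands in for a BaikeItem):

```python
# Illustration of how process_item above builds the parameterized INSERT.
item = {'level1Title': 'Python', 'level2Title': '待编辑', 'content': '...'}

cols, values = zip(*item.items())
sql = "INSERT INTO `%s`(%s) VALUES (%s)" % (
    'baike', ','.join(cols), ','.join(['%s'] * len(values)))

print(sql)     # INSERT INTO `baike`(level1Title,level2Title,content) VALUES (%s,%s,%s)
print(values)  # ('Python', '待编辑', '...')
# cursor.execute(sql, values) then lets pymysql escape the values themselves.
```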

III Crawling Douban movies

1 Douban movie ranking (Top 250)

Project setup and startup

```
scrapy startproject douban
cd douban
scrapy genspider mydouban movie.douban.com/top250
```

2 Project settings configuration

```python
ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
}

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}
```

3 Startup file start.py

```python
import scrapy.cmdline

def main():
    # -o ['json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle']
    scrapy.cmdline.execute(['scrapy', 'crawl', 'mydouban'])

if __name__ == '__main__':
    main()
```

4 Target fields: items.py

```python
import scrapy

class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()       # movie title
    movieInfo = scrapy.Field()  # director / cast / year line
    star = scrapy.Field()       # rating score
    quote = scrapy.Field()      # rating count (see the spider below)
```

5 Spider logic: mydouban.py

```python
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from douban.items import DoubanItem

class MydoubanSpider(scrapy.Spider):
    name = 'mydouban'
    url = ['https://movie.douban.com/top250']  # kept for reference, not used below
    start_urls = ['https://movie.douban.com/top250']  # approach 1

    '''# approach 2: build the first request by hand so custom headers can be attached
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    }

    def start_requests(self):
        url = 'https://movie.douban.com/top250'
        yield Request(url, headers=self.headers)
    '''

    def parse(self, response):
        movies = response.xpath('//ol[@class="grid_view"]/li')
        for movie in movies:
            item = DoubanItem()
            item['name'] = movie.xpath(".//div[@class='pic']/a/img/@alt").extract()[0]
            item['movieInfo'] = movie.xpath(".//div[@class='info']/div[@class='bd']/p/text()").extract()[0].strip()
            item['star'] = movie.xpath(".//div[@class='info']/div[@class='bd']/div[@class='star']/span[2]/text()").extract()[0]
            # despite its name, `quote` holds the rating count, i.e. the number in text like "123456人评价"
            item['quote'] = movie.xpath('.//div[@class="star"]/span/text()').re(r'(\d+)人评价')[0]
            yield item
        next_url = response.xpath('//span[@class="next"]/a/@href').extract()  # next-page href, e.g. '?start=25&filter='
        if next_url:
            next_url = 'https://movie.douban.com/top250' + next_url[0]
            yield Request(next_url, callback=self.parse)  # schedule the next page with the same callback
```

6 Pipeline storage: pipelines.py

```python
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql

class DoubanPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(host='111.230.169.107', port=3306, user='root',
                                    passwd='xxx', database='douban', charset='utf8')
        self.cursor = self.conn.cursor()
        self.cursor.execute("truncate table Movie")  # the table is emptied every time the spider starts
        self.conn.commit()

    def process_item(self, item, spider):
        try:
            self.cursor.execute(
                "insert into Movie (name,movieInfo,star,quote) VALUES (%s,%s,%s,%s)",
                (item['name'], item['movieInfo'], item['star'], item['quote']))
            self.conn.commit()
        except pymysql.Error:
            print("Error%s,%s,%s,%s" % (item['name'], item['movieInfo'], item['star'], item['quote']))
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
```
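As in the Baike project, the pipeline assumes the `douban` database and the `Movie` table already exist (the truncate in __init__ would otherwise fail on startup). A minimal sketch of creating that table; the host, password, and column types/lengths are assumptions:

```python
# Sketch: create the `Movie` table DoubanPipeline truncates and inserts into.
# Column names must match the INSERT above; types and lengths are assumptions.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='xxx', database='douban', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `Movie` (
            `id` INT AUTO_INCREMENT PRIMARY KEY,
            `name` VARCHAR(255),
            `movieInfo` VARCHAR(1000),
            `star` VARCHAR(16),
            `quote` VARCHAR(32)
        ) DEFAULT CHARSET=utf8
    """)
conn.commit()
conn.close()
```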
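If you just want a flat file instead of (or in addition to) MySQL, the comment in start.py already lists the feed formats Scrapy can export with the -o flag. A sketch of a start.py variant that also dumps the items to a JSON file; the filename top250.json is only an example, and the MySQL pipeline still runs unless it is removed from ITEM_PIPELINES:

```python
import scrapy.cmdline

def main():
    # same as start.py above, plus -o to export the scraped items as a JSON feed
    scrapy.cmdline.execute(['scrapy', 'crawl', 'mydouban', '-o', 'top250.json'])

if __name__ == '__main__':
    main()
```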
