Web Scraping Basics: the Scrapy Framework in Practice (Sina, Baike, Douban) (Part 12)

I. Crawling Sina News

1 Crawling Sina news (full-site crawl)

Project setup and launch

scrapy startproject sina
cd sina
scrapy genspider mysina sina.com.cn

2 Project settings.py configuration

ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'sina.pipelines.SinaPipeline': 300,
}

3 Launcher file start.py

import scrapy.cmdline

def main():
    # supported -o export formats: ['json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle']
    scrapy.cmdline.execute(['scrapy', 'crawl', 'mysina'])

if __name__ == '__main__':
    main()

4 Target item configuration (items.py)

import scrapy

class SinaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    newsTitle = scrapy.Field()
    newsUrl = scrapy.Field()
    newsTime = scrapy.Field()
    content = scrapy.Field()

5 Spider logic file mysina.py

import scrapy
import requests
from lxml import etree
from sina import items
from scrapy.spiders import CrawlSpider, Rule      # CrawlSpider follows links according to a set of rules
from scrapy.linkextractors import LinkExtractor   # extracts links from responses

class MysinaSpider(CrawlSpider):
    # Inherits from CrawlSpider, so the default parse() must not be overridden;
    # the callback is given a different name to avoid the conflict.
    name = 'mysina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']

    '''
    Rule parameters: link_extractor, callback=None, cb_kwargs=None, follow=None,
                     process_links=None, process_request=identity
    LinkExtractor parameters (partial): allow=(), deny=(), allow_domains=(),
                     deny_domains=(), restrict_xpaths=()
    allow=(regex)  links to follow, deny=(regex) links to skip
    callback=      callback function
    follow=        keep following matching links when True
    '''
    rules = [Rule(LinkExtractor(allow=(r'index_(\d+)\.shtml',)), callback='getParse', follow=True)]

    def getParse(self, response):   # renamed parse callback
        newsList = response.xpath("//ul[@class='list_009']/li")
        for news in newsList:
            item = items.SinaItem()   # instantiate the item
            newsTitle = news.xpath('./a/text()')[0].extract()
            newsUrl = news.xpath('./a/@href')[0].extract()
            newsTime = news.xpath('./span/text()')[0].extract()
            content = self.getContent(newsUrl)
            item['newsTitle'] = newsTitle
            item['newsUrl'] = newsUrl
            item['newsTime'] = newsTime
            item['content'] = content
            yield item

    def getContent(self, url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36"
        }
        # .content is raw bytes, so decode it explicitly
        response = requests.get(url, headers=headers).content.decode('utf-8', 'ignore')
        mytree = etree.HTML(response)
        contentList = mytree.xpath("//div[@class='article']//text()")
        print(contentList)
        content = ''
        for c in contentList:
            # strip() removes leading/trailing whitespace (spaces and newlines by default)
            content += c.strip().replace('\n', '')   # join the pieces into the full article text
        return content
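To make the Rule / LinkExtractor parameters listed in the docstring above more concrete, here is a minimal sketch of a richer rule set. The deny pattern and the restrict_xpaths value are illustrative assumptions only, not taken from the actual Sina page:

from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

# Illustrative only: the deny regex and restrict_xpaths node below are assumed, not from the original project.
rules = [
    Rule(
        LinkExtractor(
            allow=(r'index_(\d+)\.shtml',),                 # follow the paginated index pages
            deny=(r'index_1\.shtml',),                      # e.g. skip the first index page
            allow_domains=('sina.com.cn',),                 # stay on sina.com.cn
            restrict_xpaths=("//div[@class='pagebox']",),   # only extract links inside this (assumed) pager node
        ),
        callback='getParse',   # parse each followed page with getParse
        follow=True,           # keep following matching links from followed pages
    ),
]

In this project only the allow pattern is actually needed, since every paginated index page matches index_<n>.shtml.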

Method 2: mysina.py can also build the detail-page request with Scrapy itself

# -*- coding: utf-8 -*-
import scrapy
from sina import items
from scrapy.spiders import CrawlSpider, Rule      # CrawlSpider follows links according to a set of rules
from scrapy.linkextractors import LinkExtractor   # extracts links from responses

class MysinaSpider(CrawlSpider):
    name = 'mysina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']
    rules = [Rule(LinkExtractor(allow=(r'index_(\d+)\.shtml',)), callback='getParse', follow=True)]

    def getParse(self, response):
        newsList = response.xpath("//ul[@class='list_009']/li")
        for news in newsList:
            newsTitle = news.xpath('./a/text()')[0].extract()
            newsUrl = news.xpath('./a/@href')[0].extract()
            newsTime = news.xpath('./span/text()')[0].extract()
            # build the request with Scrapy's own Request (instead of the requests library);
            # the callback is getMataContent
            request = scrapy.Request(newsUrl, callback=self.getMataContent)
            # pass the already-extracted fields along via meta
            request.meta['newsTitle'] = newsTitle
            request.meta['newsUrl'] = newsUrl
            request.meta['newsTime'] = newsTime
            yield request

    def getMataContent(self, response):
        '''
        getMataContent receives the response for the request built above.
        '''
        contentList = response.xpath("//div[@class='article']//text()")
        content = ''
        for c in contentList:
            content += c.extract().strip()
        item = items.SinaItem()
        # copy the fields carried in meta back onto the item
        item['newsTitle'] = response.meta['newsTitle']
        item['newsUrl'] = response.meta['newsUrl']
        item['newsTime'] = response.meta['newsTime']
        item['content'] = content
        yield item

6 Pipeline storage pipelines.py

import pymysql

class SinaPipeline(object):
    def __init__(self):
        self.conn = None
        self.cursor = None

    def open_spider(self, spider):
        # create the connection
        self.conn = pymysql.connect(host='111.230.169.xxx', user='root', password='xxx',
                                    database='sina', port=3306, charset='utf8')
        self.cursor = self.conn.cursor()   # create a cursor

    def process_item(self, item, spider):
        sql = 'insert into sina_news(newsTitle,newsUrl,newsTime,content) VALUES (%r,%r,%r,%r)' % (
            item['newsTitle'], item['newsUrl'], item['newsTime'], item['content'])
        self.cursor.execute(sql)   # execute the SQL statement
        self.conn.commit()         # commit
        return item

    def close_spider(self, spider):
        self.cursor.close()   # close the cursor and the connection
        self.conn.close()
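The INSERT in process_item assumes a sina_news table already exists. Below is a minimal sketch of a schema that matches the four item fields; the column types and lengths are assumptions, not given in the original post:

import pymysql

# Assumed schema for the sina_news table used above; column types/lengths are guesses, adjust to your data.
CREATE_SQL = """
CREATE TABLE IF NOT EXISTS sina_news (
    id        INT AUTO_INCREMENT PRIMARY KEY,
    newsTitle VARCHAR(255),
    newsUrl   VARCHAR(512),
    newsTime  VARCHAR(64),
    content   TEXT
) DEFAULT CHARSET=utf8;
"""

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='xxx', database='sina', charset='utf8')
try:
    with conn.cursor() as cur:
        cur.execute(CREATE_SQL)
    conn.commit()
finally:
    conn.close()

Note that SinaPipeline interpolates the values into the SQL string with %r; the parameterized execute() call shown in Method 2 below is the safer pattern.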

Method 2: pipelines.py, a quicker, generic way to build the SQL statement

import pymysql

class DemoPipeline(object):
    def __init__(self):
        self.conn = None
        self.cur = None

    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host='127.0.0.1', port=3306, user='root',
            password='123456', db='fate', charset='utf8')
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        cols, values = zip(*item.items())   # unzip the item into column names and values
        sql = "INSERT INTO `%s` (%s) VALUES (%s)" % (
            'sina_news',
            ','.join(cols),
            ','.join(['%s'] * len(values))
        )
        self.cur.execute(sql, values)   # execute the SQL with values bound to the %s placeholders
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cur.close()
        self.conn.close()
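To see what the zip(*item.items()) unpacking produces, here is a quick illustration with made-up field values; a plain dict stands in for the Scrapy item, and the field order follows the SinaItem definition:

# Illustrative only: the values below are invented; a plain dict stands in for the item.
item = {
    'newsTitle': 'Example headline',
    'newsUrl': 'http://news.sina.com.cn/example.shtml',
    'newsTime': 'July 4 2018 09:00',
    'content': 'Article body ...',
}

cols, values = zip(*item.items())
# cols   -> ('newsTitle', 'newsUrl', 'newsTime', 'content')
# values -> ('Example headline', 'http://news.sina.com.cn/example.shtml', ...)

sql = "INSERT INTO `%s` (%s) VALUES (%s)" % (
    'sina_news', ','.join(cols), ','.join(['%s'] * len(values)))
# sql -> "INSERT INTO `sina_news` (newsTitle,newsUrl,newsTime,content) VALUES (%s,%s,%s,%s)"

Because the column list is derived from the item itself, this pipeline works unchanged for any item class, which is what makes it the quicker way to build the statement.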

II. Crawling Baike Data

1 Crawling Baike data
