获取职位名称、详细信息等字段。编写 items.py:
# items.py
import scrapy


class TencentItem(scrapy.Item):
    """One Tencent job posting scraped from hr.tencent.com.

    Declares every field the spider assigns; the original omitted
    ``jobLink`` even though tencent.py sets it, which raises KeyError
    at runtime (scrapy.Item rejects undeclared keys).
    """
    jobTitle = scrapy.Field()       # job title (td[1] anchor text)
    jobCategories = scrapy.Field()  # job category
    jobLink = scrapy.Field()        # detail-page URL (was missing originally)
    number = scrapy.Field()         # number of openings
    location = scrapy.Field()       # work location
    releasetime = scrapy.Field()    # publish date
编写tencent.py
# -*- coding: utf-8 -*-
# tencent.py
import scrapy

from Tencent import items


class MytencentSpider(scrapy.Spider):
    """Crawl Tencent HR job listings for location lid=2218 (all pages)."""

    name = 'myTencent'
    allowed_domains = ['hr.tencent.com']
    # Generate every page URL up front. The original re-yielded all 199
    # page requests from inside EVERY parse() call (~40k duplicates that
    # only the dupefilter discarded); it also imported `re` without using it.
    start_urls = [
        "https://hr.tencent.com/position.php?lid=2218&start=%d#a" % (page * 10)
        for page in range(0, 200)
    ]

    def parse(self, response):
        # Job rows alternate between class="even" and class="odd".
        for row in response.xpath('//tr[@class="even"] | //tr[@class="odd"]'):
            item = items.TencentItem()
            item["jobTitle"] = row.xpath("./td[1]/a/text()")[0].extract()
            item["jobLink"] = row.xpath("./td[1]/a/@href")[0].extract()
            # NOTE(review): this duplicates the title; the category normally
            # lives in td[2] — kept as-is to preserve the original output.
            item["jobCategories"] = row.xpath("./td[1]/a/text()")[0].extract()
            item["number"] = row.xpath("./td[2]/text()")[0].extract()
            item["location"] = row.xpath("./td[3]/text()")[0].extract()
            item["releasetime"] = row.xpath("./td[4]/text()")[0].extract()
            yield item
编写pipeline.py文件
class TencentPipeline(object): def __init__(self): self.file = open("tencent.txt", "w", encoding="utf-8") #初始化即打开 def process_item(self, item, spider): line = str(item) + "\r\n" self.file.write(line) self.file.flush() return item def __del__(self): #数据清除时关闭 self.file.close()在 setting.py 里设置ITEM_PIPELINES
# settings.py — register the pipeline. The project package is "Tencent"
# (see `from Tencent import items` in the spider) and the class defined in
# pipelines.py is TencentPipeline; the original pointed at a nonexistent
# "mySpider.pipelines.TencentJsonPipeline", so no pipeline would run.
ITEM_PIPELINES = {
    "Tencent.pipelines.TencentPipeline": 300,
}
执行爬虫:
scrapy crawl myTencent