Scraping Taobao with Scrapy + Selenium

  Open cmd and run scrapy startproject taobao_s to create a new project.
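  startproject lays down the standard Scrapy skeleton; for reference, the generated layout looks like this:

  taobao_s/
      scrapy.cfg            # deploy configuration
      taobao_s/
          __init__.py
          items.py          # item definitions (TaobaoSItem goes here)
          middlewares.py
          pipelines.py
          settings.py       # project settings (the KEYS keyword goes here)
          spiders/
              __init__.py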

  


  Then cd into the project folder and run scrapy genspider taobao www.taobao.com to create a new spider (genspider takes a spider name and a start domain).
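  genspider drops a spider file into taobao_s/spiders/taobao.py from Scrapy's basic template; before any editing it looks roughly like this:

  import scrapy


  class TaobaoSpider(scrapy.Spider):
      name = 'taobao'
      allowed_domains = ['www.taobao.com']
      start_urls = ['http://www.taobao.com/']

      def parse(self, response):
          pass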


The spider file ends up looking like the code below. tools is a utility module I built; it contains a data-cleaning function and a Selenium login function.
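The article does not show tools.py itself, so here is a minimal sketch of what it could contain. The login flow (a QR-code scan with a fixed wait) and the cleaning rule (stripping whitespace runs) are assumptions; only the function names register and data_cleaning, and the return shape (a browser plus a cookie list), come from the spider code below.

import re
import time

from selenium import webdriver


def data_cleaning(text):
    # assumed rule: strip the whitespace/newline runs left over by get_text()
    return re.sub(r'\s+', '', text)


def register():
    # assumed flow: open the Taobao login page and leave time for a manual QR scan
    browser = webdriver.Chrome()
    browser.get('https://login.taobao.com/member/login.jhtml')
    time.sleep(20)  # adjust to however long the login takes
    # convert selenium cookies into the name/value dicts scrapy.Request accepts
    cookies = [{'name': c['name'], 'value': c['value']} for c in browser.get_cookies()]
    return browser, cookies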

import random
import time

import scrapy
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from taobao_s.items import TaobaoSItem
from taobao_s.tools import data_cleaning, register


class TaobaoSpider(scrapy.Spider):
    name = 'taobao'
    # allowed_domains = ['www.taobao.com']
    base_url = ['https://s.taobao.com/search?q=']
    pages = 100
    re_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
        'referer': 'https://www.taobao.com/',
        'accept-encoding': 'gzip, deflate, br',
    }
    i = 1

    def start_requests(self):
        keys = self.settings.get('KEYS')  # the search keyword to crawl
        self.browser, cookies = register()  # log in through selenium; returns the browser and a cookie list
        self.browser.get(self.base_url[0] + keys)  # open the Taobao search results in the browser
        self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # run JS to scroll to the bottom
        url_i = self.browser.current_url  # keep the current url for error recovery
        html = self.browser.page_source  # grab the rendered page source
        yield scrapy.Request(url=self.base_url[0] + keys, headers=self.re_headers, cookies=cookies,
                             callback=self.parse, meta={'html': html, 'i': self.i, 'url': url_i})

    def parse(self, response):
        time.sleep(5)  # wait time, adjustable
        html = response.meta.get('html')
        i = response.meta.get('i')
        url_i = response.meta.get('url')
        i += 1
        if i > 100:  # we crawl 100 pages, so stop once we get there
            return
        try:
            # parse the item data out of the rendered html
            soup = BeautifulSoup(html, 'html.parser')
            goods_list = soup.select('#mainsrp-itemlist > div > div > div > div')
            for goods in goods_list:
                item = TaobaoSItem()
                url = goods.select('a[class="pic-link J_ClickStat J_ItemPicA"]')[0].attrs.get('href', '')
                name = goods.select("a[class='J_ClickStat']")[0].get_text().strip()
                name = data_cleaning(name)
                price = goods.select('div[class="price g_price g_price-highlight"] strong')[0].get_text()
                num = goods.select('div[class="deal-cnt"]')[0].get_text()
                shop_name = goods.select("a[class='shopname J_MouseEneterLeave J_ShopInfo']")[0].get_text().strip()
                shop_name = data_cleaning(shop_name)
                item['url'] = url
                item['name'] = name
                item['price'] = price
                item['num'] = num
                item['shop_name'] = shop_name
                yield item
            # from page 2 on there are two links with this class, "previous page" and
            # "next page", so take the last one to get "next page"
            button = self.browser.find_elements(By.XPATH, '//a[@class="J_Ajax num icon-tag"]')[-1]
            button.click()  # click through to the next page
            time.sleep(random.random() * 2)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")  # scroll to the bottom again
            html = self.browser.page_source
            yield scrapy.Request(url=response.url, headers=self.re_headers, callback=self.parse,
                                 meta={'html': html, 'i': i, 'url': url_i}, dont_filter=True)
        except Exception as e:
            # if Taobao catches us, log in again and resume from the saved url
            time.sleep(10)
            print(e)
            self.browser.close()
            self.browser, cookies = register()
            self.browser.get(url=url_i)
            time.sleep(random.random() * 2)
            self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            html = self.browser.page_source
            yield scrapy.Request(url=response.url, headers=self.re_headers, callback=self.parse,
                                 meta={'html': html, 'i': i, 'url': url_i}, dont_filter=True)

    def close(spider, reason):
        # runs when the spider finishes; shuts down the browser process we opened
        spider.browser.close()
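Two small pieces the spider depends on are not shown above: TaobaoSItem and the KEYS setting. Here is a sketch consistent with the spider code; the five field names come from the spider, and the keyword value is only a placeholder.

# items.py
import scrapy


class TaobaoSItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    num = scrapy.Field()
    shop_name = scrapy.Field()


# settings.py -- read in start_requests via self.settings.get('KEYS')
KEYS = '手机'           # placeholder search keyword
ROBOTSTXT_OBEY = False  # Scrapy's project template defaults this to True, which would likely filter these requests

With those in place, the spider runs with scrapy crawl taobao.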
