python爬虫 爬取58同城商品信息

title: python爬虫 爬取58同城二手平板电脑信息
tags: python,爬虫
# coding:utf-8
"""Scrape second-hand tablet listings from 58.com (Beijing).

Collects the "normal" listings from the list page — excluding Zhuanzhuan
and promoted items — then fetches each item's title, price, area, posting
date, seller category, and view count, printing one dict per item.
"""
from bs4 import BeautifulSoup
import requests
import time


def get_links_from(who_sells):
    """Return detail-page URLs of normal items on the list page.

    who_sells: 58.com URL segment — 0 for personal sellers, 1 for merchants.
    """
    urls = []
    list_view = 'http://bj.58.com/pbdn/{}/pn2/'.format(str(who_sells))
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Page analysis shows item links live under tr > td.t > a.t.
    # Zhuanzhuan items match the same selector, but normal item URLs
    # (with the query string removed) are exactly 53 characters long,
    # so filter on length to keep only the normal listings.
    for link in soup.select('tr td.t a.t'):
        href = link.get('href').split('?')[0]
        if len(href) == 53:
            urls.append(href)
    return urls


def get_views(url):
    """Fetch the view counter for one item via 58.com's counter API."""
    # Detail URLs end in '<infoid>x.shtml'; extract the numeric id.
    # BUG FIX: the original used .strip('x.shtml'), which strips a
    # character *set* from both ends (and shadowed the builtin `id`);
    # split on the literal suffix instead.
    info_id = url.split('/')[-1].split('x.shtml')[0]
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id)
    js = requests.get(api)
    # The response body looks like '...=<count>'; keep the trailing number.
    views = js.text.split('=')[-1]
    return views


def get_item_info(who_sells=0):
    """Scrape and print the details of every normal item on the list page.

    who_sells: 0 → personal seller ('个人'), anything else → merchant ('商家').
    """
    # BUG FIX: this call was commented out in the original, leaving `urls`
    # undefined and crashing the loop with NameError.
    urls = get_links_from(who_sells)
    for url in urls:
        time.sleep(2)  # throttle requests to be polite to the server
        web_data = requests.get(url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        data = {
            'title': soup.title.text,
            'price': soup.find_all('span', 'price c_f50')[0].text,
            'area': (list(soup.select('.c_25d')[0].stripped_strings)
                     if soup.find_all('span', 'c_25d') else None),
            'date': soup.select('.time')[0].text,
            'cate': '个人' if who_sells == 0 else '商家',
            'views': get_views(url),
        }
        print(data)


if __name__ == '__main__':
    # Guarded so that importing this module does not fire network requests.
    get_item_info()

内容版权声明:除非注明,否则皆为本站原创文章。

转载注明出处:https://www.heiqu.com/zzdgfy.html