Recently I've had a lot of free time at home. My family runs an online shop, and my mom finds writing product titles a real headache, so I decided to scrape some product information from Taobao.
I found a tutorial video on Bilibili and followed along, only to discover it was from 2018; Taobao apparently added anti-scraping measures around 2019, so the normal approach no longer returns any results.
There is still a way to scrape it, though: first log in to the Taobao web version in a browser, run a search, and copy the cookie and user-agent from the request headers.
The code is as follows:
import requests
import re


def getHTMLText(url):
    # Request the page with the cookie and user-agent copied from a logged-in browser session
    kv = {
        'cookie': 'cna=54y8Fm+TyioCATzcP+BwvvDA; thw=cn; lgc=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; tracknick=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; tg=0; enc=p6YWWbSWACqr5t1PcdDiNADVd7zKpnQG9X%2FZ666%2Fl7CM9%2FsOLpiM1WX5QQNnS%2B5ydtOFYKtHlmwg9AgeUX0Rjg%3D%3D; mt=ci=25_1; hng=CN%7Czh-CN%7CCNY%7C156; _m_h5_tk=9564049e168909dda591afc00632fed0_1581060181038; _m_h5_tk_enc=69c0046fffbc1f3750258e3f8fb06eb6; v=0; t=a8c0eb2a0d2265808242379d7f81bf64; cookie2=1f0913be01a97ec8038fb3c5354c793a; _tb_token_=ed66e3be155ef; alitrackid=www.taobao.com; _samesite_flag_=true; sgcookie=Q0iMW4gaGTiQdKuBbG0Q; unb=2514996592; uc3=vt3=F8dBxdzxpjLCa1%2BSc2Y%3D&id2=UU2zVEbkyyO2WQ%3D%3D&nk2=p2NDIgVDkDZm7A%3D%3D&lg2=VT5L2FSpMGV7TQ%3D%3D; csg=5b7ad9c4; cookie17=UU2zVEbkyyO2WQ%3D%3D; dnk=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; skt=c0aa8f1a6e59edcd; existShop=MTU4MTU5NTkzOQ%3D%3D; uc4=nk4=0%40pVWU4YQkw8jOJbWHe0nFlK5IE6%2Bq&id4=0%40U2%2F0ltjMwUTM8KkFFqREWIu1Zr5o; _cc_=VFC%2FuZ9ajQ%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%92%8C22; _nk_=%5Cu58A8%5Cu8FF9%5Cu9519%5Cu54AF%5Cu548C; cookie1=BxAV4i9dmFVSXeCQYeZRwnecoaXNB46utcHDDLh6ZgY%3D; lastalitrackid=i.taobao.com; _uab_collina=158159732272025423845542; uc1=cookie14=UoTUO8VjaYjE6g%3D%3D&lng=zh_CN&cookie16=U%2BGCWk%2F74Mx5tgzv3dWpnhjPaQ%3D%3D&existShop=false&cookie21=WqG3DMC9Fb5mPLIQo9kR&tag=8&cookie15=W5iHLLyFOGW7aA%3D%3D&pas=0; x5sec=7b227365617263686170703b32223a226538646339643635363639643831393663383261346336303365333530316433435075626c664946454b664b6b596569376f544571514561444449314d5451354f5459314f5449374e413d3d227d; JSESSIONID=A874DB22E1D5B34B2B463E86B2D768B4; isg=BKys9N3R3jpM2Mp_Xhy2vn8bfYreZVAPB10kRwbs5dfbEU0bMHcin-7jMNGpuohn; l=dBLXnIncQINxmXxCBOfwVSFu6kQOaIOb8iVrDzSKMICPOc12kkfPWZVSsMLyCnhNHsh9m3oTJ-juByT1byCqJxpsw3k_J_DmndC..',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'}
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parsePage(ilt, html):
    # Extract price/title pairs from the JSON embedded in the search results page
    try:
        plt = re.findall(r'"view_price":"[\d\.]*"', html)
        tlt = re.findall(r'"raw_title":".*?"', html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])  # eval() strips the surrounding quotes
            title = eval(tlt[i].split(':')[1])
            ilt.append([price, title])
    except Exception:
        pass


def printGoodsList(ilt):
    tplt = "{:4}\t{:8}\t{:16}"
    print(tplt.format("序号", "价格", "商品名称"))  # index / price / product title
    # Write the titles to a file as well
    with open('服饰.txt', 'w', encoding='utf-8') as f:
        f.write('商品名称\n')
        count = 0
        for g in ilt:
            count = count + 1
            print(tplt.format(count, g[0], g[1]))
            f.write(g[1])
            f.write('\r\n')


def main():
    goods = '妈妈服饰'  # search keyword
    depth = 20  # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # each result page holds 44 items
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            continue
    printGoodsList(infoList)


main()
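To see what the two regular expressions in parsePage actually capture, here is a minimal, self-contained sketch run against a hand-written snippet. The sample price and title are invented purely for illustration and only mimic the shape of the "view_price"/"raw_title" fields in the real page source.
import re

sample = '"view_price":"59.00","raw_title":"妈妈春装长袖外套"'
plt = re.findall(r'"view_price":"[\d\.]*"', sample)
tlt = re.findall(r'"raw_title":".*?"', sample)
print(plt)  # ['"view_price":"59.00"']
print(tlt)  # ['"raw_title":"妈妈春装长袖外套"']
print(plt[0].split(':')[1].strip('"'))  # 59.00 -- strip('"') gives the same result as the eval() call above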