The fan count on Kuaishou's H5 pages is protected by font-based anti-scraping: the HTML you fetch contains garbled text inside `<span></span>` tags (numeric character references instead of plain digits), and the font applied to them is a custom one named kwaiFont.
After a round of analysis, it turned out that the TTF file returned with the page is different on every request, so building a fixed code-to-character mapping template is impossible. With no template, the glyphs have to be recognized on the fly, either with OCR or with a KNN classifier. I went with OCR, and can recommend an excellent OCR library for the job: ddddocr.
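For reference, ddddocr's core API takes raw image bytes and returns the recognized text. A minimal sketch (the file name `glyph.png` is just a placeholder for an image of a single character):

```python
import ddddocr

ocr = ddddocr.DdddOcr()
with open("glyph.png", "rb") as f:       # placeholder: an image of one glyph
    print(ocr.classification(f.read()))  # recognized text, e.g. "5"
```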
Workflow:
1. Find the TTF file referenced by the page.
2. Parse the TTF file and render each glyph to an image (a quick inspection sketch follows this list).
3. Run OCR on each image to get its text.
4. Map the garbled codes back to the recognized text.
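Before writing the full pipeline, it is worth inspecting one downloaded TTF with fontTools to see the glyph names and the cmap that maps code points to them; comparing two downloads confirms that the mapping really does change each time. A quick sketch, assuming a hypothetical saved copy of the font named `kwai.ttf`:

```python
from io import BytesIO
from fontTools.ttLib import TTFont

with open("kwai.ttf", "rb") as f:   # hypothetical: a saved copy of the downloaded font
    font = TTFont(BytesIO(f.read()))

print(font.getGlyphNames())  # glyph names; these vary from download to download
print(font.getBestCmap())    # {code point: glyph name}
```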
Straight to the code:

```python
import re
import ddddocr
import requests
from lxml import etree
from io import BytesIO
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing


class ReportLabPen(BasePen):
    """Pen that draws glyph outlines onto a reportlab Path."""

    def __init__(self, glyph_set, path=None):
        BasePen.__init__(self, glyph_set)
        if path is None:
            path = Path()
        self.path = path

    def _moveTo(self, p):
        (x, y) = p
        self.path.moveTo(x, y)

    def _lineTo(self, p):
        (x, y) = p
        self.path.lineTo(x, y)

    def _curveToOne(self, p1, p2, p3):
        (x1, y1) = p1
        (x2, y2) = p2
        (x3, y3) = p3
        self.path.curveTo(x1, y1, x2, y2, x3, y3)

    def _closePath(self):
        self.path.closePath()


class KuaiShouSpider(object):
    """Kuaishou spider."""

    def __init__(self):
        # OCR recognizer
        self.ocr = ddddocr.DdddOcr()

    def ttf_2_word_map(self, ttf_content, fmt="png"):
        """
        Convert raw TTF bytes into a {hex code point: character} map.
        :param ttf_content: raw bytes of the TTF file
        :param fmt: image format used when rendering glyphs
        :return: dict mapping hex code strings to recognized characters
        """
        font = TTFont(BytesIO(ttf_content))
        gs = font.getGlyphSet()
        glyphNames = font.getGlyphNames()
        uniMap = font.getBestCmap()  # {code point: glyph name}
        key_map = dict()
        for k, v in uniMap.items():
            key_map[v] = hex(k)
        data_dict = dict()
        for i in glyphNames:
            # skip '.notdef', '.null'
            if i[0] == '.':
                continue
            g = gs[i]
            pen = ReportLabPen(gs, Path(fillColor=colors.black, strokeWidth=5))
            g.draw(pen)
            w = 800
            h = 800
            group = Group(pen.path)
            group.translate(0, 0)
            d = Drawing(w, h)
            d.add(group)
            img = renderPM.drawToString(d, fmt)
            data = self.ocr.classification(img)
            # fix common OCR confusions
            if data == '十':
                data = '+'
            elif data in [',', '。']:
                data = '.'
            key = key_map[i]
            data_dict[key] = data
        return data_dict

    @staticmethod
    def uni_code_2_word(uni_code, word_map):
        """
        Replace numeric character references with the recognized characters.
        :param uni_code: text containing references such as '&#...;'
        :param word_map: map produced by ttf_2_word_map
        :return: decoded text
        """
        def _sub(num):
            num = num.group()
            num = re.findall(r'\d+', num)[0]
            num = str(hex(int(num)))
            return word_map[num]

        data = re.sub('&#(.+?);', _sub, uni_code)
        return data

    def get_user_info(self):
        """
        Fetch and decode the user's fan/follow counts.
        """
        url = 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452&captchaToken=HEADCgp6dC5jYXB0Y2hhEscCX569ztU1Y9XCAVp1Q5Rsm1H8fPYfPZBHvTyg5mwPyIQrJSR_j2mphorguzP9cB2sNWhg61OwW_LQEBvnHRS47j0GpmjIBOeqJ9j9kIbNTsXgNSQYZxkdToAm25EKa4ZLXOmE9ez5Bl-UMzRs4P2_g6SzI3fBs1yFvI7_eLd_yFogwimBE5eyopG9qDDm5lFPfSPm0GI6IhqLKpA1VBZd9cjZxsxq4jGlld1vYRxOFyfJis4oFSVM8fpDArN32KQ2pqejgjV8kK42jW-kpg4fl-1g5iWmqSczszEvEdB9s4l3QmQBfztuDSPbGf0yfY-whf93nOynaRmSeLH49sHSaPr_nwcGvjNjqeFdZoTpf2VBLV7mWvkVdthG0yV5Y6BqDPWSr57Js-dvLIcYlyq3gLbNxQOsulNch6o-HQ7dw2CZY006z-_eGhLniyxQb2WiE0ZVkCv0UGAb2gsoBTACTAIL'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': 'did=web_232e842d3bcd4eceb358abfcf31ec030; didv=1634614098000; sid=e7921611a1cbb9669d28ce19; Hm_lvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614100; Hm_lpvt_86a27b7db2c5c0ae37fee4a8a35033ee=1634614104',
            'DNT': '1',
            'Host': 'c.kuaishou.com',
            'Pragma': 'no-cache',
            'Referer': 'https://c.kuaishou.com/fw/user/ounixiong?fid=0&cc=share_copylink&followRefer=151&shareMethod=TOKEN&kpn=KUAISHOU&subBiz=PROFILE&shareId=16509009682073&shareToken=X-7IIolIHVVgN2bx&shareResourceType=PROFILE_OTHER&shareMode=APP&originShareId=16509009682073&appType=21&shareObjectId=136457866&shareUrlOpened=0&timestamp=1633759010452',
            'sec-ch-ua': '"Chromium";v="94", "Google Chrome";v="94", ";Not A Brand";v="99"',
            'sec-ch-ua-mobile': '?1',
            'sec-ch-ua-platform': '"Android"',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'same-origin',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Mobile Safari/537.36',
        }
        response = requests.get(url, headers=headers)
        # extract the TTF file URL embedded in the page
        try:
            ttf_file = re.findall(r'url\((https:.+?\.ttf)\)', response.text)[0]
        except Exception:
            print('no ttf url found in the page (the request may have failed)')
            return
        ttf_data = requests.get(ttf_file)
        ttf_word = self.ttf_2_word_map(ttf_data.content)
        # parse the counts
        html = etree.HTML(response.text)
        fans_node = html.xpath('//span[contains(text(),"粉丝")]/preceding-sibling::span[1]')[0]
        focus_node = html.xpath('//span[contains(text(),"关注")]/preceding-sibling::span[1]')[0]
        fans = etree.tostring(fans_node).decode('utf-8')
        focus = etree.tostring(focus_node).decode('utf-8')
        fans = re.findall('>(.+?)<', fans)[0]
        focus = re.findall('>(.+?)<', focus)[0]
        fans = self.uni_code_2_word(fans, ttf_word)
        focus = self.uni_code_2_word(focus, ttf_word)
        print(fans)
        print(focus)


if __name__ == '__main__':
    spider = KuaiShouSpider()
    spider.get_user_info()
```

Postscript: it would also be worth trying KNN, classifying the glyphs by their font features: prepare some labeled samples and train a classifier on them.
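A minimal sketch of that KNN idea, assuming scikit-learn is available; the outline-coordinate feature extraction, the fixed vector length, and the placeholder training data (`known_font`, `known_names`, `labels`, `new_font`) are my own choices for illustration, not from the original approach:

```python
import numpy as np
from fontTools.ttLib import TTFont
from fontTools.pens.recordingPen import DecomposingRecordingPen
from sklearn.neighbors import KNeighborsClassifier


def glyph_features(font, glyph_name, size=128):
    """Flatten a glyph's outline coordinates into a fixed-length vector."""
    glyph_set = font.getGlyphSet()
    pen = DecomposingRecordingPen(glyph_set)  # resolves composite glyphs
    glyph_set[glyph_name].draw(pen)
    coords = [
        c
        for _op, args in pen.value
        for pt in args
        if pt is not None  # qCurveTo segments may contain a None point
        for c in pt
    ]
    # truncate or zero-pad so every glyph yields the same feature length
    coords = coords[:size] + [0.0] * max(0, size - len(coords))
    return np.array(coords)


# Training data: glyphs from TTF downloads whose digits were labeled by hand.
# known_font = TTFont("labeled_sample.ttf")  # hypothetical labeled font file
# X = np.stack([glyph_features(known_font, n) for n in known_names])
# clf = KNeighborsClassifier(n_neighbors=3).fit(X, labels)
# Prediction on a freshly downloaded font:
# print(clf.predict([glyph_features(new_font, some_glyph_name)])[0])
```

Since the per-request fonts reuse the same ten digit shapes with shuffled code points, nearest-neighbor matching on outline coordinates should separate them cleanly once a handful of downloads has been labeled.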