url_manager.py
# encoding: utf-8

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    # Add a single new URL to the manager
    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    # Add a batch of new URLs to the manager
    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    # Check whether any URLs are still waiting to be crawled
    def has_new_url(self):
        return len(self.new_urls) != 0

    # Take one URL to crawl out of the manager
    def get_new_url(self):
        # pop() removes and returns an arbitrary URL from the set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
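A quick way to sanity-check the manager is to feed it a duplicate and watch the deduplication; the URL below is just a placeholder:

manager = UrlManager()
manager.add_new_url('https://example.com/item/python')   # placeholder URL
manager.add_new_url('https://example.com/item/python')   # duplicate, ignored
print(manager.has_new_url())   # True: one URL is waiting
url = manager.get_new_url()    # moves the URL into old_urls
print(manager.has_new_url())   # False: nothing left to crawl
manager.add_new_url(url)       # already in old_urls, so ignored again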
html_parser.py

# encoding: utf-8
import re
from urllib import parse

from bs4 import BeautifulSoup

class HtmlParser(object):
    # Collect every link on the page as an absolute URL
    def _get_new_urls(self, page_url, soup):
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"[\s\S]*"))
        for link in links:
            new_url = link['href']
            # urljoin combines the page URL and a (possibly relative)
            # href into one complete URL
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    # Extract the data we care about: the page URL, title and summary
    def _get_new_data(self, page_url, soup):
        res_data = {}
        res_data['url'] = page_url
        # <dd class="lemmaWgt-lemmaTitle-title"><h1>...</h1></dd>
        title_node = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        res_data['title'] = title_node.get_text()
        # <div class="lemma-summary" label-module="lemmaSummary">
        summary_node = soup.find('div', class_='lemma-summary')
        res_data['summary'] = summary_node.get_text()
        return res_data

    def parse(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            print('page_url is None or html_cont is None')
            return None
        # The downloader returns raw bytes, so decode them first;
        # once decoded, BeautifulSoup needs no from_encoding hint
        html_conts = html_cont.decode('utf-8')
        soup = BeautifulSoup(html_conts, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data
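To see what parse() returns without touching the network, you can feed it a hand-written page that mimics the <dd>/<div> structure the parser looks for; the snippet and URL below are made up for illustration:

parser = HtmlParser()
page = ('<html><body>'
        '<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>'
        '<div class="lemma-summary">An interpreted language.</div>'
        '<a href="/item/Guido">Guido</a>'
        '</body></html>').encode('utf-8')   # parse() expects raw bytes
urls, data = parser.parse('https://example.com/item/Python', page)
print(urls)   # {'https://example.com/item/Guido'}
print(data)   # {'url': ..., 'title': 'Python', 'summary': 'An interpreted language.'}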
html_downloader.py

# encoding: utf-8
import urllib.request

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        response = urllib.request.urlopen(url)
        # Anything other than HTTP 200 counts as a failed download
        if response.getcode() != 200:
            return None
        return response.read()
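Note that urlopen() raises URLError (or its subclass HTTPError) rather than returning a non-200 response for most failures, so in practice the call is worth wrapping. A minimal sketch; example.com is a placeholder:

from urllib.error import URLError

downloader = HtmlDownloader()
try:
    content = downloader.download('https://example.com')  # placeholder URL
    if content is not None:
        print('Downloaded %d bytes' % len(content))
except URLError as e:
    print('Download failed:', e.reason)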
html_outputer.py

# encoding: utf-8

class HtmlOutput(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        # Write with an explicit encoding so non-ASCII summaries survive
        fout = open('output.html', 'w', encoding='utf-8')
        fout.write("<html>")
        fout.write("<head><meta charset='utf-8'></head>")
        fout.write("<body>")
        fout.write("<table>")
        for data in self.datas:
            fout.write("<tr>")
            fout.write("<td>%s</td>" % data['url'])
            fout.write("<td>%s</td>" % data['title'])
            fout.write("<td>%s</td>" % data['summary'])
            fout.write("</tr>")
        fout.write("</table>")
        fout.write("</body>")
        fout.write("</html>")
        fout.close()
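The four modules are meant to be driven by a scheduler that loops: take a URL, download it, parse it, queue the new links, collect the data. A minimal sketch of such a spider_main.py, assuming the files above sit in the same directory; the seed URL and the 10-page cap are placeholders:

# encoding: utf-8
from url_manager import UrlManager
from html_downloader import HtmlDownloader
from html_parser import HtmlParser
from html_outputer import HtmlOutput

class SpiderMain(object):
    def __init__(self):
        self.urls = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutput()

    def craw(self, root_url, max_count=10):
        count = 1
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
            except Exception as e:
                # A page that fails to download or parse should not
                # stop the whole crawl
                print('craw failed:', e)
            if count >= max_count:
                break
            count += 1
        self.outputer.output_html()

if __name__ == '__main__':
    SpiderMain().craw('https://example.com/item/Python')  # placeholder seed URL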