class TaskQueue { constructor(concurrency) { this.concurrency = concurrency; this.running = 0; this.queue = []; } pushTask(task) { this.queue.push(task); this.next(); } next() { while (this.running < this.concurrency && this.queue.length) { const task = this.queue.shift(); task(() => { this.running--; this.next(); }); this.running++; } } } module.exports = TaskQueue;
上边的代码就是队列的实现代码,核心是 next() 方法,可以看出,当task加入队列中后,会立刻执行,这不是说这个任务一定马上执行,而是指的是next会立刻调用。
(spider_v5.js)
const request = require("request"); const fs = require("fs"); const mkdirp = require("mkdirp"); const path = require("path"); const utilities = require("./utilities"); const TaskQueue = require("./task-Queue"); const downloadQueue = new TaskQueue(2); function saveFile(filename, contents, callback) { mkdirp(path.dirname(filename), err => { if (err) { return callback(err); } fs.writeFile(filename, contents, callback); }); } function download(url, filename, callback) { console.log(`Downloading ${url}`); request(url, (err, response, body) => { if (err) { return callback(err); } saveFile(filename, body, err => { if (err) { return callback(err); } console.log(`Downloaded and saved: ${url}`); callback(null, body); }); }) } /// 最大的启发是实现了如何异步循环遍历数组 function spiderLinks(currentUrl, body, nesting, callback) { if (nesting === 0) { return process.nextTick(callback); } const links = utilities.getPageLinks(currentUrl, body); if (links.length === 0) { return process.nextTick(callback); } let completed = 0, hasErrors = false; links.forEach(link => { /// 给队列出传递一个任务,这个任务首先是一个函数,其次该函数接受一个参数 /// 当调用任务时,触发该函数,然后给函数传递一个参数,告诉该函数在任务结束时干什么 downloadQueue.pushTask(done => { spider(link, nesting - 1, err => { /// 这里表示,只要发生错误,队列就会退出 if (err) { hasErrors = true; return callback(err); } if (++completed === links.length && !hasErrors) { callback(); } done(); }); }); }); } const spidering = new Map(); function spider(url, nesting, callback) { if (spidering.has(url)) { return process.nextTick(callback); } spidering.set(url, true); const filename = utilities.urlToFilename(url); /// In this pattern, there will be some issues. /// Possible problems to download the same url again and again。 fs.readFile(filename, "utf8", (err, body) => { if (err) { if (err.code !== 'ENOENT') { return callback(err); } return download(url, filename, (err, body) => { if (err) { return callback(err); } spiderLinks(url, body, nesting, callback); }); } spiderLinks(url, body, nesting, callback); }); } spider(process.argv[2], 2, (err, filename, downloaded) => { if (err) { console.log(`error: ${err}`); } else if (downloaded) { console.log(`Completed the download of ${filename}`); } else { console.log(`${filename} was already downloaded`); } });
因此,为了限制并发的个数,只需在 spiderLinks 方法中,把task遍历放入队列就可以了。这相对来说很简单。
到这里为止,我们使用原生JavaScript实现了一个有相对完整功能的网络蜘蛛,既能串行,也能并发,还可以控制并发个数。
2.使用async库
把不同的功能放到不同的函数中,会给我们带来巨大的好处,async库十分流行,它的性能也不错,它内部基于callback。
(spider_v6.js)
const request = require("request"); const fs = require("fs"); const mkdirp = require("mkdirp"); const path = require("path"); const utilities = require("./utilities"); const series = require("async/series"); const eachSeries = require("async/eachSeries"); function download(url, filename, callback) { console.log(`Downloading ${url}`); let body; series([ callback => { request(url, (err, response, resBody) => { if (err) { return callback(err); } body = resBody; callback(); }); }, mkdirp.bind(null, path.dirname(filename)), callback => { fs.writeFile(filename, body, callback); } ], err => { if (err) { return callback(err); } console.log(`Downloaded and saved: ${url}`); callback(null, body); }); } /// 最大的启发是实现了如何异步循环遍历数组 function spiderLinks(currentUrl, body, nesting, callback) { if (nesting === 0) { return process.nextTick(callback); } const links = utilities.getPageLinks(currentUrl, body); if (links.length === 0) { return process.nextTick(callback); } eachSeries(links, (link, cb) => { "use strict"; spider(link, nesting - 1, cb); }, callback); } const spidering = new Map(); function spider(url, nesting, callback) { if (spidering.has(url)) { return process.nextTick(callback); } spidering.set(url, true); const filename = utilities.urlToFilename(url); fs.readFile(filename, "utf8", (err, body) => { if (err) { if (err.code !== 'ENOENT') { return callback(err); } return download(url, filename, (err, body) => { if (err) { return callback(err); } spiderLinks(url, body, nesting, callback); }); } spiderLinks(url, body, nesting, callback); }); } spider(process.argv[2], 1, (err, filename, downloaded) => { if (err) { console.log(err); } else if (downloaded) { console.log(`Completed the download of ${filename}`); } else { console.log(`${filename} was already downloaded`); } });
在上边的代码中,我们只使用了async的三个功能:
const series = require("async/series"); // 串行 const eachSeries = require("async/eachSeries"); // 并行 const queue = require("async/queue"); // 队列
由于比较简单,就不做解释了。async中的队列的代码在(spider_v7.js)中,和上边我们自定义的队列很相似,也不做更多解释了。
3.Promise