四、 使用ConcurrentBag创建一个可扩展的爬虫
本示例演示如何在多个既可生产任务又可消费任务的独立工作者之间扩展工作量。
1.程序代码如下。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Collections.Concurrent;
using System.Diagnostics;
using System.Threading;
namespace ThreadCollectionDemo
{
/// <summary>
/// Demonstrates distributing crawl work across several worker tasks that each
/// both produce (discovered links) and consume (pending URLs) items from a
/// shared <see cref="ConcurrentBag{T}"/>.
/// NOTE(review): every URL string literal below is empty ("") — the real URLs
/// appear to have been lost when this sample was copied. As written, all the
/// dictionary assignments in CreateLinks use the same "" key, so only the last
/// one survives; restore real URLs before drawing conclusions from the output.
/// </summary>
class Program
{
    // Fake "web": maps a page URL to the array of URLs linked from that page.
    static Dictionary<string, string[]> contextItems = new Dictionary<string, string[]>();

    // One shared Random guarded by a lock. The original created
    // new Random(DateTime.Now.Millisecond) on every call: calls landing in the
    // same millisecond got identical seeds and therefore identical "random"
    // delays. Random is also not thread-safe, and GetRandomDely is invoked
    // concurrently from several crawler tasks, hence the lock.
    static readonly Random random = new Random();
    static readonly object randomLock = new object();

    static void Main(string[] args)
    {
        Console.WriteLine(string.Format("----- ConcurrentBag 操作----"));
        CreateLinks();
        // Blocking Wait is acceptable here: Main is the console entry point,
        // so there is no sync context to deadlock on.
        Task task = RunBag();
        task.Wait();
        Console.Read();
    }

    /// <summary>
    /// Seeds the bag with one root task per crawler, starts five crawler
    /// workers, and completes when all of them have drained the bag.
    /// </summary>
    static async Task RunBag()
    {
        var taskBag = new ConcurrentBag<CrawlingTask>();
        string[] urls = new string[] { "", "", "",
            "", "" };
        var crawlers = new Task[5];
        for (int i = 1; i <= 5; i++)
        {
            // crawlerName is declared inside the loop body, so each lambda
            // captures its own copy — no shared-closure bug here.
            string crawlerName = "Crawler " + i.ToString();
            taskBag.Add(new CrawlingTask { UrlToCraw = urls[i - 1], ProductName = "root" });
            crawlers[i - 1] = Task.Run(() => Craw(taskBag, crawlerName));
        }
        await Task.WhenAll(crawlers);
    }

    /// <summary>
    /// Worker loop: repeatedly takes a task from the bag, "fetches" its links,
    /// and adds each discovered link back to the bag as a new task.
    /// Known limitation of this sample: a worker exits as soon as TryTake
    /// fails once, even though sibling workers may still be about to add more
    /// work — some items can end up processed by fewer workers than expected.
    /// </summary>
    static async Task Craw(ConcurrentBag<CrawlingTask> bag, string crawlerName)
    {
        CrawlingTask task;
        while (bag.TryTake(out task))
        {
            Console.WriteLine(" {0} url 从ConcurrentBag 取出,上一节点{1},名称{2}", task.UrlToCraw, task.ProductName, crawlerName);
            IEnumerable<string> urls = await GetLinksFromContent(task);
            if (urls != null)
            {
                foreach (var url in urls)
                {
                    var t = new CrawlingTask
                    {
                        UrlToCraw = url,
                        ProductName = crawlerName
                    };
                    bag.Add(t);
                }
            }
        }
        // NOTE(review): this branch is effectively dead — TryTake sets the out
        // parameter to null when it returns false, so after the loop `task` is
        // always null and only the else branch ever runs. Kept as-is to
        // preserve the sample's observable output.
        if (task != null)
        {
            Console.WriteLine("第{0} 个url 添加到ConcurrentBag,线程名称{1},爬虫名称{2}", task.UrlToCraw, task.ProductName, crawlerName);
        }
        else
            Console.WriteLine(" TASK IS NULL ");
    }

    /// <summary>
    /// Simulates fetching a page: waits a random delay, then returns the links
    /// recorded for the task's URL, or null when the URL is unknown.
    /// </summary>
    static async Task<IEnumerable<string>> GetLinksFromContent(CrawlingTask task)
    {
        await GetRandomDely();
        // Single lookup via TryGetValue instead of ContainsKey + indexer.
        string[] links;
        if (contextItems.TryGetValue(task.UrlToCraw, out links))
            return links;
        return null;
    }

    /// <summary>
    /// Populates the fake link graph. With the URL literals lost (all ""),
    /// each assignment overwrites the previous one — see the class note.
    /// </summary>
    static void CreateLinks()
    {
        contextItems[""] = new[]{
            "",""
        };
        contextItems[""] = new[]{
            "",""
        };
        contextItems[""] = new[]{
            "","",
            "",""
        };
        contextItems[""] = new[]{
            "",""
        };
        contextItems[""] = new[]{
            "","",
            "",""
        };
    }

    /// <summary>Returns a task that completes after a random 150–599 ms delay.</summary>
    static Task GetRandomDely()
    {
        int dely;
        lock (randomLock)
        {
            dely = random.Next(150, 600);
        }
        return Task.Delay(dely);
    }
}
/// <summary>
/// A unit of crawl work: the URL to visit and the name of whoever produced it
/// ("root" for seed URLs, otherwise the producing crawler's name).
/// NOTE(review): "UrlToCraw" looks like a typo for "UrlToCrawl", but renaming
/// would break every caller in the sample, so it is left as-is.
/// </summary>
class CrawlingTask
{
// The URL this task should fetch.
public string UrlToCraw { get; set; }
// Name of the producer of this task (crawler name or "root" for seeds).
public string ProductName { get; set; }
}
}