自己做采集程序(2)
然后就可以分析被采集网站(目标网站)的网页结构,编写采集程序了。
下面给个例子:
<!--#include file="conn.asp"-->
<!--#include file="inc/xhttp_class.asp"-->
<!--#include file="inc/function.asp"-->
<%
' Scraping many remote pages in one request is slow, so raise the
' script execution limit for this page before any work starts.
Server.ScriptTimeout = 1000
%>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>BT采集器</title>
</head>
<body>
<%
' Collector form: posts the category ID, start/end IDs and optional category
' name back to get81bt.asp with action=getdata.
' Previously submitted values are echoed back into the inputs. They are
' HTML-encoded so that quotes or markup inside a request parameter cannot
' break out of the value attribute (reflected XSS). The trailing & "" forces
' a string, so a missing parameter is encoded as an empty string.
%>
<form name="form1" method="post" action="get81bt.asp">
分类ID:
<input type="text" name="cid" value="<%=Server.HTMLEncode(Request("cid") & "")%>"><br>
开始ID:
<input type="text" name="startid" value="<%=Server.HTMLEncode(Request("startid") & "")%>">
<br>
结束ID:
<input type="text" name="overid" value="<%=Server.HTMLEncode(Request("overid") & "")%>">
<br>
分类名称:<input type="text" name="classname" value="<%=Server.HTMLEncode(Request("classname") & "")%>">为空自动获取
<br>
<input name="action" type="hidden" id="action" value="getdata">
<input type="submit" name="Submit" value="采集">
</form>
当前ID:<%=Server.HTMLEncode(Request("id") & "")%> <br>
<%
dim action
action = Request("action")
if action = "getdata" then
cid = Request("cid")
startid = Request("startid")
overid = Request("overid")
id = Request("id")
if id = "" then id = startid
set objxhttp = new xhttp
objxhttp.URL = "http://www.81dd.com/Class/"&cid&"_"&id&".htm"
content = objxhttp.Html
if InStr(content,"网站维护中") then
call NextID
response.End()
end if
list = GetContent(content,"<!--内容开始-->","<!--内容结束-->",0)
Dim regEx, Match, Matches,patrn
Set regEx = New RegExp
patrn = "<a href=""../BtHtml/(.+?)"">"
regEx.Pattern = patrn
regEx.IgnoreCase = True
regEx.Global = True
Set Matches = regEx.Execute(list)
on error resume next
For Each Match in Matches
'response.write Match.Value & "<br>"
weburl = "http://www.81dd.com/BtHtml/" & regEx.Replace(Match.Value,"$1")
response.write weburl & "<br>"
内容版权声明:除非注明,否则皆为本站原创文章。