自己做采集程序
2018-09-06 10:29
  现在网上的采集程序很多,但是有时候你发现一个好的网站,想自己做个采集工具采集一些信息,就需要自己去写程序了,其实这样的采集程序并不难写,主要是去分析源网站的网页结构。
首先去下载个XMLHTTP的类文件:
<%
Class xhttp
private cset,sUrl,sError
Private Sub Class_Initialize()
cset=UTF-8
cset=GB2312
sError=
end sub
  Private Sub Class_Terminate()
End Sub
  Public Property LET URL(theurl)
sUrl=theurl
end property
public property GET BasePath()
BasePath=mid(sUrl,1,InStrRev(sUrl,/)-1)
end property
public property GET FileName()
FileName=mid(sUrl,InStrRev(sUrl,/)+1)
end property
public property GET Html()
Html=BytesToBstr(getBody(sUrl))
end property
  public property GET xhttpError()
xhttpError=sError
end property
  private Function BytesToBstr(body)
on error resume next
Cset:GB2312 UTF-8
dim objstream 
set objstream = Server.CreateObject(adodb.stream) 
with objstream
.Type = 1 
.Mode = 3 
.Open 
.Write body 
.Position = 0 
.Type = 2 
.Charset = Cset 
BytesToBstr = .ReadText 
.Close 
end with
set objstream = nothing 
End Function 
  private function getBody(surl)
on error resume next
dim xmlHttp
Set xmlHttp=server.createobject(Msxml2.XMLHTTP.4.0)
set xmlHttp=server.createobject(Microsoft.XMLHTTP)
set xmlHttp=server.createobject(MSXML2.ServerXMLHTTP)
xmlHttp.setTimeouts 10000,10000,10000,30000
xmlHttp.open GET,surl,false
xmlHttp.send
if xmlHttp.readystate=4 then 
if xmlHttp.status=200 then
getBody=xmlhttp.responsebody
end if
else
getBody=
end if
  if Err.Number<>0 then 
sError=Err.Number
Err.clear
else
sError=
end if
set xmlHttp=nothing
end function
  Public function saveimage(tofile,isoverwrite)
on error resume next
dim objStream,objFSO,imgs
  if Not isoverwrite Then
Set objFSO = Server.CreateObject(Scripting.FileSystemObject)
If objFSO.FileExists(Server.MapPath(tofile)) Then
Exit Function
End If
Set objFSO = Nothing
End IF
  imgs=getBody(sUrl)
Set objStream = Server.CreateObject(ADODB.Stream)
with objStream
.Type =1
.Open
.write imgs
.SaveToFile server.mappath(tofile),2
.Close()
end with
set objstream=nothing
end function
end class
  %>
用了这个类文件,做起事情来就方便多了。
然后就可以分析采集网站的网页结构,写采集程序了。
下面给个例子:
<!--#include file=conn.asp-->
<!--#include file=inc/xhttp_class.asp-->
<!--#include file=inc/function.asp-->
<%
server.ScriptTimeout = 1000
%>
<html>
<head>
<meta http-equiv=Content-Type content=text/html; charset=gb2312 />
<title>BT采集器</title>
</head>
<body>
<form name=form1 method=post action=get81bt.asp>
分类ID:
<input type=text name=cid value=<%=request(cid)%>><br>
开始ID:
<input type=text name=startid value=<%=request(startid)%>>
<br>
结束ID:
<input type=text name=overid value=<%=request(overid)%>>
<br>
分类名称:<input type=text name=classname value=<%=request(classname)%>>为空自动获取
<br>
<input name=action type=hidden id=action value=getdata>
<input type=submit name=Submit value=采集>
</form>
当前ID:<%=request(id)%> <br>
<%
dim action
action = Request(action)
if action = getdata then
 cid = Request(cid)
 startid = Request(startid)
 overid = Request(overid)
 id = Request(id) 
 if id = then id = startid
 set objxhttp = new xhttp
 objxhttp.URL = 
 content = objxhttp.Html
 if InStr(content,网站维护中) then
 call NextID
 response.End()
 end if
 list = GetContent(content,<!--内容开始-->,<!--内容结束-->,0)
 Dim regEx, Match, Matches,patrn
 Set regEx = New RegExp
 patrn = <a href=../BtHtml/(.+?)>
 regEx.Pattern = patrn
 regEx.IgnoreCase = True
 regEx.Global = True
 Set Matches = regEx.Execute(list)
 on error resume next
 For Each Match in Matches
 response.write Match.Value & <br>
 weburl = 
 response.write weburl & <br>
 response.Flush()
 objxhttp.URL = weburl
 cpage = objxhttp.Html 
 cpage = GetContent(cpage,<!--内容开始-->,<!--内容结束-->,0)
 title = GetContent(cpage,BT资源名称:<strong>,</strong>,0)
 title = stripHTML(title)
 IF Request(classname) <> then
 classname = Request(classname)
 Else 
 if InStr(title,喜剧) then
 classname = 喜剧
 Elseif InStr(title,动作) then
 classname = 动作
 Elseif InStr(title,惊悚) then
 classname = 惊悚
 Elseif InStr(title,犯罪) then
 classname = 犯罪
 Elseif InStr(title,恐怖) then
 classname = 恐怖
 Elseif InStr(title,爱情) then
 classname = 爱情
 Elseif InStr(title,冒险) then
 classname = 冒险
 Elseif InStr(title,科幻) then
 classname = 科幻
 Elseif InStr(title,悬念) then
 classname = 悬念
 Elseif InStr(title,奇幻) then
 classname = 奇幻
 Elseif InStr(title,战争) then
 classname = 战争
 Elseif InStr(title,连续剧) then
 classname = 连续剧
 Elseif InStr(title,综艺) then
 classname = 综艺
 Elseif InStr(title,灾难) then
 classname = 灾难
 Elseif InStr(title,伦理) then
 classname = 伦理
 Elseif InStr(title,动漫) or InStr(title,动画) then
 classname = 动漫
 Elseif InStr(title,国语) or InStr(title,集) then
 classname = 其他影视
 Else
 classname = 其他
 End if
 End IF
 intro = GetContent(cpage,<tr><td width=770 bgcolor=#FFFFFF><div style=margin:10px;line-height:150%>,</div>,0)
 intro = Replace(intro,<br />,[br])
 intro = Replace(intro,<BR />,[br])
 intro = Replace(intro,<BR>,[br])
 intro = Replace(intro,<br>,[br])
 intro = Replace(intro,<p>,[p])
 intro = Replace(intro,<P>,[p])
 intro = Replace(intro,</p>,[/p])
 intro = Replace(intro,</P>,[p])
 intro = Replace(intro,<img,[img)
 intro = Replace(intro,<IMG,[img) 
 intro = stripHTML(intro)
 intro = Replace(intro,[br],<br>)
 intro = Replace(intro,[p],<p>)
 intro = Replace(intro,[/p],</p>)
 intro = Replace(intro,[img,<img)
 intro = Replace(intro,[img],<img src=)
 intro = Replace(intro,[/img],>)
 intro = Replace(intro,[IMG],<img src=)
 intro = Replace(intro,[/IMG],>)
 response.write t
 response.End()
 addtime = Trim(GetContent(cpage,发布时间:,,0))
 if Not IsDate(addtime) then addtime = now()
 username = bt
 filesize = GetContent(content,BT文件大小:,,0)
 title2 = title
 downurl = GetContent(cpage,<a style=color:red href=,,0)
 p = CDate(addtime)
 Dim sRnd
 Randomize
 sRnd = Int(900 * Rnd) + 100
 sFileName = year(p) & month(p) & day(p) & hour(now) & minute(now) & second(now) & sRnd & .torrent
 url = torrent/ & year(p) & - & month(p) & - & day(p) & / & sFileName
 Call CreateF(url)
 Text
 Response.Write classname & <br>
 Response.write title & <br>
 response.Write intro & <br>
 response.Write addtime & <br>
 response.Write username & <br>
 response.Write filesize & <br>
 response.Write downurl & <br>
 response.Write url & <br>
 response.Flush()
 response.End()
 database
 if err.number = 0 then
 if (Not IsNull(title)) and title <> and downurl <> then
 set rs = server.CreateObject(adodb.recordset)
 sql = select * from bt_class where classname = & classname & 
 rs.open sql,conn,1,3
 if rs.eof then
 rs.addnew
 rs(classname) = classname
 rs.update
 end if
 classid = rs(classid)
 rs.close
 set rs = nothing
 set rs = server.CreateObject(adodb.recordset)
 sql = select * from bt_movie where title in ( & title & )
 rs.open sql,conn,1,3
 if rs.eof then
 response.Write <div><font color=blue>写入数据库...</font></div>
 response.Flush()
 rs.addnew 
 rs(classid) = classid
 rs(title) = title
 rs(title2) = title2
 rs(intro) = intro
 rs(username) = username
 rs(filesize) = filesize
 rs(url) = url
 rs(serverid) = 1
 rs(addtime) = addtime
 rs(ismake) = 0
 rs.update
 objxhttp.URL = downurl
 objxhttp.saveimage url,False 
 else
 response.Write <div><font color=red>已经存在!</font></div>
 end if
 rs.close
 set rs = nothing
 objxhttp.URL = downurl
 objxhttp.saveimage url,False 
 End IF
 Else
 err.clear
 End IF
 response.Write -------------------------------------------<br>
 Next
 set regEx = nothing
 response.Write 下一页<br>
 response.Flush()
 Call NextID()
end if
Sub NextID
 conn.close
 set conn = nothing
 if cint(startid) < cint(overid) and cint(id) < cint(overid) then
 response.Write <script>location.href=get81bt.asp?action=getdata&classname= & Request(classname) & &cid= & cid & &startid= & startid & &overid= & overid & &id=& id + 1 &</script>
 Elseif cint(startid) > cint(overid) and cint(id) > cint(overid) then
 response.Write <script>location.href=get81bt.asp?action=getdata&classname= & Request(classname) & &cid= & cid & &startid= & startid & &overid= & overid & &id=& id - 1 &</script>
 Else
 Response.Write 采集完成!<br>
 response.End()
 End if
End Sub
%>
</body>
</html>