实例讲解ASP实现抓取网上房产信息
2018-09-05 23:52
  <%@LANGUAGE=VBSCRIPTCODEPAGE=936%> 
<!--#includefile=conn.ASP--> 
<!--#includefile=inc/function.asp--> 
<!DOCTYPEHTMLPUBLIC-//W3C//DTDHTML4.01Transitional//EN
<html> 
<head> 
<title>UntitledDocument</title> 
<metahttp-equiv=Content-Typecontent=text/html;charset=gb2312> 
<metahttp-equiv=refreshcontent=300;URL=steal_house.asp> 
</head> 
<body> 
<% 
onerrorresumenext 
Server.ScriptTimeout=999999 
======================================================== 
字符编码函数 
==================================================== 
FunctionBytesToBstr(body,code) 
dimobjstream 
setobjstream=Server.createObject(adodb.stream) 
objstream.Type=1 
objstream.Mode=3 
objstream.Open 
objstream.Writebody 
objstream.Position=0 
objstream.Type=2 
objstream.Charset=code 
BytesToBstr=objstream.ReadText 
objstream.Close 
setobjstream=nothing 
EndFunction 
取行字符串在另一字符串中的出现位置 
FunctionNewstring(wstr,strng) 
Newstring=Instr(lcase(wstr),lcase(strng)) 
ifNewstring<=0thenNewstring=Len(wstr) 
EndFunction 
替换字符串函数 
functionReplaceStr(ori,str1,str2) 
ReplaceStr=replace(ori,str1,str2) 
endfunction 
==================================================== 
functionReadXML(url,code,start,ends) 
setoSend=createobject(Microsoft.XMLHTTP) 
SourceCode=oSend.open(GET,url,false) 
oSend.send() 
ReadXml=BytesToBstr(oSend.responseBody,code) 
start=Instr(ReadXml,start) 
ReadXml=mid(ReadXml,start) 
ends=Instr(ReadXml,ends) 
ReadXml=left(ReadXml,ends-1) 
endfunction 
functionSubStr(body,start,ends) 
start=Instr(body,start) 
SubStr=mid(body,start+len(start)+1) 
ends=Instr(SubStr,ends) 
SubStr=left(SubStr,ends-1) 
endfunction 
dimgetcont,NewsContent 
dimurl,title 
url=新闻网址
getcont=ReadXml(url,gb2312,<tableclass=k2border=0,</table>) 
getcont=RegexHtml(getcont) 
dimKeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra 
dimContactMan,Contact 
fori=2toubound(getcont) 
response.Write(getcont(i)&__<br>) 
tempLink=mid(getcont(i),instr(getcont(i),href=)+6,instr(getcont(i),onClick)-10) 
tempLink=replace(tempLink,../,) 
response.Write(i&:&tempLink&<br>) 
NewsContent=ReadXml(tempLink,gb2312,<tdvalign=bottomwidth=400>,<hrwidth=760noshadesize=1color=#808080>) 
NewsContent=RemoveHtml(NewsContent) 
NewsContent=replace(NewsContent,VbCrLf,) 
NewsContent=replace(NewsContent,vbNewLine,) 
NewsContent=replace(NewsContent,,) 
NewsContent=replace(NewsContent,,) 
NewsContent=replace(NewsContent,,) 
NewsContent=replace(NewsContent,\n,) 
NewsContent=replace(NewsContent,chr(10),) 
NewsContent=replace(NewsContent,chr(13),) 
===============getContent======================= 
response.Write(NewsContent) 
KeyId=SubStr(NewsContent,列号:,信息类别:) 
NewsClass=SubStr(NewsContent,类别:,所在城市:) 
City=SubStr(NewsContent,城市:,房屋具体位置:) 
Position=SubStr(NewsContent,位置:,房屋类型:) 
HouseType=SubStr(NewsContent,类型:,楼层:) 
Level=SubStr(NewsContent,楼层:,使用面积:) 
Area=SubStr(NewsContent,面积:,房价:) 
Price=SubStr(NewsContent,房价:,其他说明:) 
Demostra=SubStr(NewsContent,说明:,联系人:) 
ContactMan=SubStr(NewsContent,联系人:,联系方式:) 
Contact=SubStr(NewsContent,联系方式:,信息来源:) 
response.Write(总序列号:&KeyId&<br>) 
response.Write(信息类别:&NewsClass&<br>) 
response.Write(所在城市:&City&<br>) 
response.Write(房屋具体位置:&Position&<br>) 
response.Write(房屋类型:&HouseType&<br>) 
response.Write(楼层:&Level&<br>) 
response.Write(使用面积:&Area&<br>) 
response.Write(房价:&Price&<br>) 
response.Write(其他说明:&Demostra&<br>) 
response.Write(联系人:&ContactMan&<br>) 
response.Write(联系方式:&Contact&<br>) 
title=RemoveHTML(aa(i)) 
response.Write(title:&title) 
if(application.Contents(n)=KeyId)then 
ifexit=true 
endif 
next 
ifnotifexitthen 
application(time&i)=KeyId 
添加到数据库 
==================================================== 
setrs=server.createObject(adodb.recordset) 
rs.openselecttop1*fromnewsorderbyiddesc,conn,3,3 
rs.addnew 
rs(NewsClass)=NewsClass 
rs(City)=City 
rs(Position)=Position 
rs(HouseType)=HouseType 
rs(Level)=Level 
rs(Area)=Area 
rs(Price)=Price 
rs(Demostra)=Demostra 
rs(ContactMan)=ContactMan 
rs(Contact)=Contact 
rs.update 
rs.close 
setrs=nothing 
endif 
================================================== 
next 
functionRemoveTag(body) 
SetregEx=NewRegExp 
regEx.Pattern=<[a].*?<\/[a]> 
regEx.IgnoreCase=True 
regEx.Global=True 
SetMatches=regEx.Execute(body) 
dimi,arr(15),ifexit 
i=0 
j=0 
ForEachMatchinMatches 
TempStr=Match.Value 
TempStr=replace(TempStr,<td>,) 
TempStr=replace(TempStr,</td>,) 
TempStr=replace(TempStr,<tr>,) 
TempStr=replace(TempStr,</tr>,) 
arr(i)=TempStr 
i=i+1 
if(i>=15)then 
exitfor 
endif 
Next 
SetregEx=nothing 
SetMatches=nothing 
RemoveTag=arr 
endfunction 
functionRegexHtml(body) 
dimr_arr(47),r_temp 
SetregEx2=NewRegExp 
regEx2.Pattern=<a.*?<\/a> 
regEx2.IgnoreCase=True 
regEx2.Global=True 
SetMatches2=regEx2.Execute(body) 
iii=0 
ForEachMatchinMatches2 
r_arr(iii)=Match.Value 
iii=iii+1 
Next 
RegexHtml=r_arr 
setregEx2=nothing 
setMatches2=nothing 
endfunction 
====================================================== 
conn.close 
setconn=nothing 
%> 
</body> 
</html> 
function.asp 
<% 
************************************************** 
函数名:gotTopic 
作用:截字符串,汉字一个算两个字符,英文算一个字符 
参数:str----原字符串 
strlen----截取长度 
返回值:截取后的字符串 
************************************************** 
functiongotTopic(str,strlen) 
ifstr=then 
gotTopic= 
exitfunction 
endif 
diml,t,c,i 
str=replace(replace(replace(replace(str,,),",chr(34)),>,>),<,<) 
str=replace(str,?,) 
l=len(str) 
t=0 
fori=1tol 
c=Abs(Asc(Mid(str,i,1))) 
ifc>255then 
t=t+2 
else 
t=t+1 
endif 
ift>=strlenthen 
gotTopic=left(str,i)&… 
exitfor 
else 
gotTopic=str 
endif 
next 
gotTopic=replace(replace(replace(replace(gotTopic,,),chr(34),"),>,>),<,<) 
endfunction 
========================================================= 
函数:RemoveHTML(strHTML) 
功能:去除HTML标记 
参数:strHTML--要去除HTML标记的字符串 
========================================================= 
FunctionRemoveHTML(strHTML) 
DimobjRegExp,Match,Matches 
SetobjRegExp=NewRegexp 
objRegExp.IgnoreCase=True 
objRegExp.Global=True 
取闭合的<> 
objRegExp.Pattern=<.+?> 
进行匹配 
SetMatches=objRegExp.Execute(strHTML) 
遍历匹配集合,并替换掉匹配的项目 
ForEachMatchinMatches 
strHtml=Replace(strHTML,Match.Value,) 
Next 
RemoveHTML=strHTML 
SetobjRegExp=Nothing 
setMatches=nothing 
EndFunction 
%> 
conn.asp 
<% 
onerrorresumenext 
setconn=server.createObject(adodb.connection) 
con=driver={MicrosoftAccessDriver(*.mdb)};dbq=&Server.MapPath(stest.mdb) 
conn.opencon 
subconnclose 
conn.close 
setconn=nothing 
endsub 
%>