凹丫丫网络社区's Archiver

Amm 发表于 2008-5-19 18:40

实例讲解ASP实现抓取网上房产信息

&lt;<A href='mailto:%@LANGUAGE="VBSCRIPT'>%@LANGUAGE="VBSCRIPT</A>" CODEPAGE="936"%&gt;<BR>&lt;!-- #include file="conn.<A class=ReplaceKeyword href="http://www.knowsky.com/asp.asp" target=_blank>asp</A>" --&gt;
<P>&lt;!-- #include file="inc/function.asp" --&gt;<BR>&lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "<A href="http://www.w3.org/TR/html4/loose.dtd">http://www.w3.org/TR/html4/loose.dtd</A>"&gt;<BR>&lt;html&gt;<BR>&lt;head&gt;<BR>&lt;title&gt;Untitled Document&lt;/title&gt;<BR>&lt;meta http-equiv="Content-Type" content="text/html; charset=gb2312"&gt;<BR>&lt;meta http-equiv="refresh" content="300;URL=steal_house.asp"&gt;<BR>&lt;/head&gt;</P>
<P>&lt;body&gt;<BR>&lt;%<BR>on error resume next<BR>' <BR>Server.ScriptTimeout = 999999<BR>'========================================================<BR>'字符编码函数<BR>'====================================================<BR>Function BytesToBstr(body,code) <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; dim objstream <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; set objstream = Server.CreateObject("adodb.stream") <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Type = 1 <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Mode =3 <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Open <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Write body <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Position = 0 <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Type = 2 <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Charset =code<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; BytesToBstr = objstream.ReadText&nbsp; <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; objstream.Close <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; set objstream = nothing <BR>End Function </P>
<P>'取行字符串在另一字符串中的出现位置<BR>Function Newstring(wstr,strng) <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; Newstring=Instr(lcase(wstr),lcase(strng)) <BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; if Newstring&lt;=0 then Newstring=Len(wstr) <BR>End Function <BR>'替换字符串函数<BR>function ReplaceStr(ori,str1,str2)<BR>ReplaceStr=replace(ori,str1,str2)<BR>end function<BR>'====================================================<BR>function Read<A class=ReplaceKeyword href="http://www.knowsky.com/xml.asp" target=_blank>Xml</A>(url,code,start,ends)<BR>set oSend=createobject("Microsoft.XMLHTTP")<BR>SourceCode = oSend.open ("GET",url,false) <BR>oSend.send()<BR>ReadXml=BytesToBstr(oSend.responseBody,code )<BR>start=Instr(ReadXml,start)<BR>ReadXml=mid(ReadXml,start)<BR>ends=Instr(ReadXml,ends)<BR>ReadXml=left(ReadXml,ends-1)<BR>end function</P>
<P>function SubStr(body,start,ends)<BR>start=Instr(body,start)<BR>SubStr=mid(body,start+len(start)+1)<BR>ends=Instr(SubStr,ends)<BR>SubStr=left(SubStr,ends-1)<BR>end function</P>
<P>dim getcont,NewsContent<BR>dim url,title<BR>url="<A href="http://www.***.com&quot;'/">http://www.***.com"'</A>新闻网址knowsky.com<BR>getcont=ReadXml(url,"gb2312","&lt;table class=k2 border=""0""","&lt;/table&gt;")<BR>getcont=RegexHtml(getcont)<BR>dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra</P>
<P>dim ContactMan,Contact<BR>for i=2 to ubound(getcont)<BR>&nbsp;response.Write(getcont(i)&amp;"__&lt;br&gt;")<BR>&nbsp;<BR>&nbsp;tempLink=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),""" onClick")-10)<BR>&nbsp;tempLink=replace(tempLink,"../","")<BR>&nbsp; <BR>&nbsp;response.Write(i&amp;":"&amp;tempLink&amp;"&lt;br&gt;")<BR>&nbsp;NewsContent=ReadXml(tempLink,"gb2312","&lt;td valign=""bottom"" width=""400""&gt;","&lt;hr width=""760"" noshade size=""1"" color=""#808080""&gt; ")<BR>&nbsp;NewsContent=RemoveHtml(NewsContent)<BR>&nbsp;NewsContent=replace(NewsContent,VbCrLf,"")<BR>&nbsp;NewsContent=replace(NewsContent,vbNewLine,"")<BR>&nbsp;NewsContent=replace(NewsContent," ","")<BR>&nbsp;NewsContent=replace(NewsContent," ","")<BR>&nbsp;NewsContent=replace(NewsContent,"&amp;nbsp;","") <BR>&nbsp;NewsContent=replace(NewsContent,"\n","") <BR>&nbsp;NewsContent=replace(NewsContent,chr(10),"")<BR>&nbsp;NewsContent=replace(NewsContent,chr(13),"")<BR>&nbsp;'===============get Content=======================<BR>&nbsp;response.Write(NewsContent)<BR>&nbsp;KeyId=SubStr(NewsContent,"列号:","信息类别:")<BR>&nbsp;NewsClass=SubStr(NewsContent,"类别:","所在城市:")<BR>&nbsp;City=SubStr(NewsContent,"城市:","房屋具体位置:")<BR>&nbsp;Position=SubStr(NewsContent,"位置:","房屋类型:")<BR>&nbsp;HouseType=SubStr(NewsContent,"类型:","楼层:")<BR>&nbsp;Level=SubStr(NewsContent,"楼层:","使用面积:")<BR>&nbsp;Area=SubStr(NewsContent,"面积:","房价:")<BR>&nbsp;Price=SubStr(NewsContent,"房价:","其他说明:")<BR>&nbsp;Demostra=SubStr(NewsContent,"说明:","联系人:")<BR>&nbsp;ContactMan=SubStr(NewsContent,"联系人:","联系方式:")<BR>&nbsp;Contact=SubStr(NewsContent,"联系方式:","信息来源:") <BR>&nbsp;response.Write("总序列号:"&amp;KeyId&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("信息类别:"&amp;NewsClass&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("所在城市:"&amp;City&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("房屋具体位置:"&amp;Position&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("房屋类型:"&amp;HouseType&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("楼层:"&amp;Level&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("使用面积:"&amp;Area&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("房价:"&amp;Price&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("其他说明:"&amp;Demostra&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("联系人:"&amp;ContactMan&amp;"&lt;br&gt;")<BR>&nbsp;response.Write("联系方式:"&amp;Contact&amp;"&lt;br&gt;")<BR>&nbsp;'title=RemoveHTML(aa(i))<BR>&nbsp;'response.Write("title:"&amp;title)<BR>&nbsp;for n=0 to application.Contents.count<BR>&nbsp;&nbsp; if(application.Contents(n)=KeyId) then<BR>&nbsp;&nbsp;&nbsp; ifexit=true&nbsp;&nbsp;&nbsp;&nbsp; <BR>&nbsp;&nbsp; end if&nbsp;&nbsp; <BR>&nbsp;next&nbsp; <BR>&nbsp;if not ifexit then<BR>&nbsp;&nbsp; application(time&amp;i)=KeyId<BR>&nbsp;'添加到<A class=ReplaceKeyword href="http://www.knowsky.com/sql.asp" target=_blank>数据库</A><BR>&nbsp;'====================================================<BR>&nbsp;set rs=server.CreateObject("adodb.recordset") <BR>&nbsp;rs.open "select top 1 * from news order by id desc",conn,3,3<BR>&nbsp;rs.addnew<BR>&nbsp;rs("NewsClass")=NewsClass<BR>&nbsp;rs("City")=City<BR>&nbsp;rs("Position")=Position<BR>&nbsp;rs("HouseType")=HouseType<BR>&nbsp;rs("Level")=Level<BR>&nbsp;rs("Area")=Area<BR>&nbsp;rs("Price")=Price<BR>&nbsp;rs("Demostra")=Demostra<BR>&nbsp;rs("ContactMan")=ContactMan<BR>&nbsp;rs("Contact")=Contact<BR>&nbsp;rs.update<BR>&nbsp;rs.close<BR>&nbsp;set rs=nothing<BR>&nbsp;end if<BR>&nbsp;'==================================================<BR>&nbsp;<BR>next<BR>function RemoveTag(body)</P>
<P>&nbsp;Set regEx = New RegExp<BR>&nbsp;regEx.Pattern = "&lt;[a].*?&lt;\/[a]&gt;"<BR>&nbsp;regEx.IgnoreCase = True<BR>&nbsp;regEx.Global = True<BR>&nbsp;Set Matches = regEx.Execute(body) <BR>&nbsp;dim i,arr(15),ifexit<BR>&nbsp;i=0<BR>&nbsp;j=0<BR>&nbsp;For Each Match in Matches<BR>&nbsp; TempStr = Match.Value&nbsp; <BR>&nbsp; TempStr=replace(TempStr,"&lt;td&gt;","")<BR>&nbsp; TempStr=replace(TempStr,"&lt;/td&gt;","")<BR>&nbsp; TempStr=replace(TempStr,"&lt;tr&gt;","")<BR>&nbsp; TempStr=replace(TempStr,"&lt;/tr&gt;","")&nbsp; <BR>&nbsp; arr(i)=TempStr&nbsp; <BR>&nbsp; i=i+1<BR>&nbsp; if(i&gt;=15) then<BR>&nbsp;&nbsp; exit for<BR>&nbsp; end if<BR>&nbsp;Next<BR>&nbsp;Set regEx=nothing<BR>&nbsp;Set Matches =nothing<BR>&nbsp;RemoveTag=arr<BR>&nbsp;<BR>end function<BR>function RegexHtml(body)<BR>&nbsp;dim r_arr(47),r_temp<BR>&nbsp;Set regEx2 = New RegExp<BR>&nbsp;regEx2.Pattern ="&lt;a.*?&lt;\/a&gt;"<BR>&nbsp;regEx2.IgnoreCase = True<BR>&nbsp;regEx2.Global = True<BR>&nbsp;Set Matches2 = regEx2.Execute(body) <BR>&nbsp;iii=0 <BR>&nbsp;For Each Match in Matches2<BR>&nbsp; <BR>&nbsp; r_arr(iii)=Match.Value<BR>&nbsp; <BR>&nbsp; iii=iii+1&nbsp; <BR>&nbsp;Next<BR>&nbsp;RegexHtml=r_arr<BR>&nbsp;set regEx2=nothing<BR>&nbsp;set Matches2=nothing<BR>end function<BR>'======================================================</P>
<P>conn.close<BR>set conn=nothing<BR>%&gt;<BR>&lt;/body&gt;<BR>&lt;/html&gt;</P>
<P>&nbsp;</P>
<P><BR>  function.asp</P>
<P>&nbsp;&lt;%<BR>'**************************************************<BR>'函数名:gotTopic<BR>'作&nbsp; 用:截字符串,汉字一个算两个字符,英文算一个字符<BR>'参&nbsp; 数:str&nbsp;&nbsp; ----原字符串<BR>'&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; strlen ----截取长度<BR>'返回值:截取后的字符串<BR>'**************************************************<BR>function gotTopic(str,strlen)<BR>&nbsp;if str="" then<BR>&nbsp; gotTopic=""<BR>&nbsp; exit function<BR>&nbsp;end if<BR>&nbsp;dim l,t,c, i<BR>&nbsp;str=replace(replace(replace(replace(str,"&amp;nbsp;"," "),"&amp;quot;",chr(34)),"&amp;gt;","&gt;"),"&amp;lt;","&lt;")<BR>&nbsp;str=replace(str,"?","")<BR>&nbsp;l=len(str)<BR>&nbsp;t=0<BR>&nbsp;for i=1 to l<BR>&nbsp; c=Abs(Asc(Mid(str,i,1)))<BR>&nbsp; if c&gt;255 then<BR>&nbsp;&nbsp; t=t+2<BR>&nbsp; else<BR>&nbsp;&nbsp; t=t+1<BR>&nbsp; end if<BR>&nbsp; if t&gt;=strlen then<BR>&nbsp;&nbsp; gotTopic=left(str,i) &amp; "…"<BR>&nbsp;&nbsp; exit for<BR>&nbsp; else<BR>&nbsp;&nbsp; gotTopic=str<BR>&nbsp; end if<BR>&nbsp;next<BR>&nbsp;gotTopic=replace(replace(replace(replace(gotTopic," ","&amp;nbsp;"),chr(34),"&amp;quot;"),"&gt;","&amp;gt;"),"&lt;","&amp;lt;")<BR>end function<BR>'=========================================================<BR>'函数:RemoveHTML(strHTML)<BR>'功能:去除HTML标记<BR>'参数:strHTML&nbsp; --要去除HTML标记的字符串<BR>'=========================================================<BR>Function RemoveHTML(strHTML) <BR>Dim objRegExp, Match, Matches <BR>Set objRegExp = New Regexp </P>
<P>objRegExp.IgnoreCase = True <BR>objRegExp.Global = True <BR>'取闭合的&lt;&gt; <BR>objRegExp.Pattern = "&lt;.+?&gt;" <BR>'进行匹配 <BR>Set Matches = objRegExp.Execute(strHTML) </P>
<P>' 遍历匹配集合,并替换掉匹配的项目 <BR>For Each Match in Matches <BR>strHtml=Replace(strHTML,Match.Value,"") <BR>Next <BR>RemoveHTML=strHTML <BR>Set objRegExp = Nothing <BR>set Matches=nothing<BR>End Function </P>
<P>%&gt;<BR>&nbsp;</P>
<P><BR>  conn.asp</P>
<P>&nbsp;&lt;%<BR>'on error resume next<BR>set conn=server.CreateObject("adodb.connection") <BR>con= "driver={Microsoft <A class=ReplaceKeyword href="http://www.knowsky.com/article.asp?typeid=173" target=_blank>Access</A> Driver (*.mdb)};dbq=" &amp; Server.MapPath("stest.mdb") <BR>conn.open con</P>
<P>sub connclose <BR>&nbsp;&nbsp; conn.close<BR>&nbsp;&nbsp; set conn=nothing&nbsp;&nbsp; <BR>end sub<BR>%&gt;<BR></P>

页: [1]

Powered by Discuz! Archiver 7.0.0  © 2001-2009 Comsenz Inc.