在编写网抓的解决方案时,会遇到需要提取网页的超链接元素<a>的href属性值的情况。
这时候一般使用getElementsByTagName方法获取网页的超链接元素<a>的集合对象,然后通过遍历集合获取具体的href属性值.
这时候可以使用如下的代码:
Sub QQ1722187970()
Excel.Application.ScreenUpdating = False
Excel.Application.Calculation = xlCalculationManual
Dim sVerb As String
Dim sUrl As String
Dim sCharset As String
Dim sPostData As String
sVerb = "GET"
sUrl = "http://www.mca.gov.cn/article/sj/xzqh//1980/?"
sResult = HtmlBasic("GET", sUrl)
Debug.Print rResult
Call HtmlTable(sResult)
Excel.Application.Calculation = xlCalculationAutomatic
Excel.Application.ScreenUpdating = True
End Sub
Function HtmlBasic(ByVal sVerb As String, ByVal sUrl As String, Optional ByVal sCharset As String = "utf-8", Optional ByVal sPostData As String = "")
'sVerb为发送的Html请求的方法,sUrl为具体的网址,sCharset为网址对应的字符集编码,sPostData为Post方法对应的发送body
Dim oHtml As Object
'https://msdn.microsoft.com/en-us/library/windows/desktop/aa384106(v=vs.85).aspx
Set oHtml = VBA.CreateObject("WinHttp.WinHttpRequest.5.1")
With oHtml
Select Case sVerb
Case "GET"
.Open "GET", sUrl, False
Case "POST"
.Open "POST", sUrl, False
.SetRequestHeader "Content-Type", "application/x-www-form-urlencoded"
End Select
.send (sPostData)
'获取返回的字节数组
bResult = .ResponseBody
'按照指定的字符编码显示
sResult = Byte2String(bResult, sCharset)
Debug.Print sResult
HtmlBasic = sResult
End With
Set oHtml = Nothing
End Function
Function Byte2String(bContent, ByVal sCharset As String)
Const adTypeBinary = 1
Const adTypeText = 2
Const adModeRead = 1
Const adModeWrite = 2
Const adModeReadWrite = 3
Dim oStream As Object
'创建流对象
Set oStream = CreateObject("ADODB.Stream")
With oStream
'打开流
.Open
'设置为字节模式
.Type = adTypeBinary
'写入字节
.write bContent
'将位置定位在第一个字节
.Position = 0
'设置为文本模式
.Type = adTypeText
'设置编码的字符集
.Charset = sCharset
'读取编码后的文本
Byte2String = .ReadText
'关闭流对象
.Close
End With
End Function
'提取网页表格的代码
Sub HtmlTable(ByVal sHtml As String)
'网页html文档对象
Dim oHtmlDom As Object
'网页表格对象
Dim oTable As Object
'网页表格行对象
Dim oRows As Object
'网页表格单元格对象
Dim oCells As Object
'网页超链接元素对象
Dim oA As Object
'抓取的数据存放的excel表格对象
Dim oWK As Worksheet
Set oWK = Excel.ActiveSheet
iRow = oWK.Range("a65536").End(xlUp).Row + 1
Set oHtmlDom = CreateObject("htmlfile")
With oHtmlDom
.body.innerhtml = sHtml
Set oTable = .getElementsByTagName("table")(0)
Set oA = oTable.getElementsByTagName("a")
'遍历第一个表格中的所有超链接
For i = 0 To oA.Length - 1
oWK.Cells(iRow, 1) = oA(i).href
iRow = iRow + 1
Next i
End With
oWK.Columns.AutoFit
Set oHtmlDom = Nothing
Set oTable = Nothing
Set oRows = Nothing
Set oCells = Nothing
End Sub


发表评论