如何用vba实现网站数据采集(网抓、爬虫)?

要实现网站数据的采集,首先要了解HTTP协议。

当我们在浏览器中输入网址,按下回车时,客户端会发送一个请求到服务器,服务器根据请求的内容返回数据到客户端,浏览器显示返回的结果。

当用编程的方法获取网站数据时,实际上就是模拟了以上的过程,客户端发送请求→服务器响应发回结果。

然后通过各种方式处理获得的结果,提取想要的数据。

本文介绍用WinHttpRequest对象进行网站数据采集的方法。

以下代码是最基础的获取网站数据的vba代码:

Sub QQ1722187970()
    Dim oHtml As Object
    Set oHtml = VBA.CreateObject("WinHttp.WinHttpRequest.5.1")
    Dim sUrl As String
    sUrl = "https://blog.csdn.net/tylm22733367/article/details/52596990"
    With oHtml
        .Open "GET", sUrl, False
        .send
        Debug.Print .ResponseText
    End With
    Set oHtml = Nothing
End Sub

由于不同的网站有不同的编码字符集,如果不是UTF-8或者Unicode编码字符集,用ResponseText返回的字符在VBA中会乱码。

为此,可以使用如下的代码实现通用的获取网站数据:

Sub QQ1722187970()
    Dim oHtml As Object
    Set oHtml = VBA.CreateObject("WinHttp.WinHttpRequest.5.1")
    Dim sUrl As String
    '指定要抓取的网站
    sUrl = "https://blog.csdn.net/tylm22733367/article/details/52596990"
    Dim sCharset As String
    '指定要抓取的网站的字符编码
    sCharset = "utf-8"
    With oHtml
        .Open "GET", sUrl, False
        .Send
        '获取返回的字节数组
        bResult = .ResponseBody
        '按照指定的字符编码显示
        sResult = Byte2String(bResult, sCharset)
        Debug.Print sResult
    End With
    Set oHtml = Nothing
End Sub
Function Byte2String(bContent, ByVal sCharset As String)
    Const adTypeBinary = 1
    Const adTypeText = 2
    Const adModeRead = 1
    Const adModeWrite = 2
    Const adModeReadWrite = 3
    Dim oStream As Object
    '创建流对象
    Set oStream = CreateObject("ADODB.Stream")
    With oStream
        '打开流
        .Open
        '设置为字节模式
        .Type = adTypeBinary
        '写入字节
        .Write bContent
        '将位置定位在第一个字节
        .Position = 0
        '设置为文本模式
        .Type = adTypeText
        '设置编码的字符集
        .Charset = sCharset
        '读取编码后的文本
         Byte2String = .ReadText
        '关闭流对象
        .Close
    End With
End Function
'提取网页表格的代码
Sub HtmlTable(ByVal sHtml As String)
    '网页html文档对象
    Dim oHtmlDom As Object
    '网页表格对象
    Dim oTable As Object
    '网页表格行对象
    Dim oRows As Object
    '网页表格单元格对象
    Dim oCells As Object
    '抓取的数据存放的excel表格对象
    Dim oWK As Worksheet
    Set oWK = Sheet1
    iRow = oWK.Range("a65536").End(xlUp).Row + 1
    Set oHtmlDom = CreateObject("htmlfile")
    With oHtmlDom
        .Body.innerHTML = sHtml
        Set oTable = .getElementsByTagName("table")(0)
        'Set obj = getElementById("id")
        With oTable
            Set oRows = .Rows
            For i = 1 To oRows.Length - 1
                Set oCells = oRows(i).Cells
                For j = 0 To oCells.Length - 1
                    oWK.Cells(iRow, j + 1) = oCells(j).innertext
                Next j
                iRow = iRow + 1
            Next i
        End With
    End With
    Set oHtmlDom = Nothing
    Set oTable = Nothing
    Set oRows = Nothing
    Set oCells = Nothing
End Sub

以上介绍的是最基本的GET网络请求的数据,如果要抓取POST请求的数据,可以使用如下的通用代码:

Sub QQ1722187970()
    Dim oHtml As Object
    Set oHtml = VBA.CreateObject("WinHttp.WinHttpRequest.5.1")
    Dim sUrl As String
    '指定要抓取的网站
    sUrl = "http://kw.beijing.gov.cn/module/web/jpage/dataproxy.jsp?startrecord=1&endrecord=120&perpage=40"
    Dim sPostText As String
    sPostText = "col=1&appid=1&webid=1&path=%2F&columnid=149&sourceContentType=3&unitid=2793&webname=%E5%8C%97%E4%BA%AC%E5%B8%82%E7%A7%91%E5%AD%A6%E6%8A%80%E6%9C%AF%E5%A7%94%E5%91%98%E4%BC%9A&permissiontype=0"
    Dim sCharset As String
    '指定要抓取的网站的字符编码
    sCharset = "utf-8"
    With oHtml
        .Open "POST", sUrl, False
        'POST方法一定要带Content-Type请求头
        .setRequestheader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
        .Send sPostText
        '获取返回的字节数组
        bResult = .ResponseBody
        '按照指定的字符编码显示
        sResult = Byte2String(bResult, sCharset)
        Debug.Print sResult
    End With
    Set oHtml = Nothing
End Sub
Function Byte2String(bContent, ByVal sCharset As String)
    Const adTypeBinary = 1
    Const adTypeText = 2
    Const adModeRead = 1
    Const adModeWrite = 2
    Const adModeReadWrite = 3
    Dim oStream As Object
    '创建流对象
    Set oStream = CreateObject("ADODB.Stream")
    With oStream
        '打开流
        .Open
        '设置为字节模式
        .Type = adTypeBinary
        '写入字节
        .Write bContent
        '将位置定位在第一个字节
        .Position = 0
        '设置为文本模式
        .Type = adTypeText
        '设置编码的字符集
        .Charset = sCharset
        '读取编码后的文本
         Byte2String = .ReadText
        '关闭流对象
        .Close
    End With
End Function
'提取网页表格的代码
Sub HtmlTable(ByVal sHtml As String)
    '网页html文档对象
    Dim oHtmlDom As Object
    '网页表格对象
    Dim oTable As Object
    '网页表格行对象
    Dim oRows As Object
    '网页表格单元格对象
    Dim oCells As Object
    '抓取的数据存放的excel表格对象
    Dim oWK As Worksheet
    Set oWK = Sheet1
    iRow = oWK.Range("a65536").End(xlUp).Row + 1
    Set oHtmlDom = CreateObject("htmlfile")
    With oHtmlDom
        .Body.innerHTML = sHtml
        Set oTable = .getElementsByTagName("table")(0)
        'Set obj = getElementById("id")
        With oTable
            Set oRows = .Rows
            For i = 1 To oRows.Length - 1
                Set oCells = oRows(i).Cells
                For j = 0 To oCells.Length - 1
                    oWK.Cells(iRow, j + 1) = oCells(j).innertext
                Next j
                iRow = iRow + 1
            Next i
        End With
    End With
    Set oHtmlDom = Nothing
    Set oTable = Nothing
    Set oRows = Nothing
    Set oCells = Nothing
End Sub
       

仅有1条评论 发表评论

  1. 匿名 /

    .aspx网站咋抓

  2. 匿名 /

    VBA爬虫如何伪装?设置reffer,可用winhttp

  3. 匿名 /

    VBA爬虫如何伪装

  4. 匿名 /

    怎么抓取不了?

发表评论