locked
VBA Internet Explorer RRS feed

  • Question

  • The macro given below, opens a web site from which I would like to extract the main text.

    When accessing said web site only the translation of the original text is shown to the user. When moving the cursor over a paragraph, the orginial text is displayed additionally in a speech bubble.

    When using HTMLDoc.getElementById("text").innerText, I can only extract the first few paragraphs rather than the complete text. This holds also true when I stop the programm and wait until the site is loaded completely.

    If anyone knows what the problem how to extract the full text. This would be great!

    The macro is as follows:

    Declare Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)
    Declare Function GetSystemMetrics Lib "user32" (ByVal nIndex As Long) As Long

    Sub X()
       Dim t As Single
        
       t = Timer
       Sleep 1500
       Debug.Print Timer - t
    End Sub


    Sub LoadEspaceNet()

     Dim Description(1, 5000, 1)
     
        Dim Browser As SHDocVw.InternetExplorer
        Dim HTMLDoc As MSHTML.HTMLDocument

        Set Browser = New SHDocVw.InternetExplorer
        Browser.Visible = True
        Application.StatusBar = ".... opening page"
        
        
    'Espacenet Homepage
        Browser.navigate "http://worldwide.espacenet.com/?locale=de_EP" ' navigate to page
        
        'On Error Resume Next
        
        Do While Browser.Busy
            DoEvents
        Loop
                                        
        NumberDoc = 1
       
       
       Set HTMLDoc = Browser.Document                                 ' load the DOM object
       
       Do While Browser.Busy
            DoEvents
        Loop
        
        Dim Elem As Variant
        MsgBox "Load Document!"
        With HTMLDoc
             
             currentdoc = "EP2840773A2"
             
             .getElementById("cqlEditBox").Value = currentdoc
             .getElementById("submit").Click
             
        Do While Browser.Busy
            DoEvents
        Loop
        
        Call X
        
        Do While Browser.Busy
            DoEvents
        Loop
        MsgBox "Loading Document ... "
        
        .getElementById("Publicationid1").Click
         End With

        Do While Browser.Busy
            DoEvents
        Loop
        
    'Description Upload

        Do While Browser.Busy
             DoEvents
        Loop
        
        Call X
        
        For Each Coll In HTMLDoc.getElementsByTagName("a")
                    
             jx = jx + 1
             If InStr(Coll.innerText, "Beschreibung") > 0 Then Coll.Target = "_self": Coll.Click: Exit For
          
            ': If jx = 64 Then Coll.Click: Exit For
               
         Next
     
        Do While Browser.Busy
             DoEvents
        Loop

        MsgBox "Loading Description/Translation"
            
       With HTMLDoc.getElementById("translatethislink")
      .Target = "_self"
      .Click
        End With
        
          Call X
              Do While Browser.Busy
                DoEvents
                Loop

         
        For Each ad In HTMLDoc.getElementsByTagName("a")
              If InStr(ad.innerText, "Deutsch") > 0 Then ad.Target = "_self": kj = True: ad.Click: Exit For
              If InStr(ad.innerText, "Englisch") > 0 Then ad.Target = "_self": kj = True: ad.Click: Exit For
        Next
        
        MsgBox "Document "
        jxcounter = 0
        innerdescription = ""
       
       
        innerdescription = HTMLDoc.getElementById("text").innerText
     

    Selection.WholeStory
    Selection.Delete
     
    Selection.TypeText innerdescription
        
        
    End Sub

    Friday, March 27, 2015 4:03 PM

All replies

  • To get a better sense of what is behind the page, please try this Macro.

    Sub DumpData()
    
    Set IE = CreateObject("InternetExplorer.Application")
    IE.Visible = True
    
    URL = "http://www.sgx.com/wps/portal/sgxweb/home/company_disclosure/company_announcements"
    
    'Wait for site to fully load
    IE.Navigate2 URL
    Do While IE.Busy = True
    DoEvents
    Loop
    
    RowCount = 1
    
    With Sheets("Sheet1")
    .Cells.ClearContents
    RowCount = 1
    For Each itm In IE.document.all
    .Range("A" & RowCount) = Left(itm.innertext, 1024)
    RowCount = RowCount + 1
    Next itm
    End With
    End Sub 

    I got that from Joel a few years ago.  When it comes to this kind of stuff, he is one of the best, ever.



    Knowledge is the only thing that I can give you, and still retain, and we are both better off for it.

    Wednesday, April 1, 2015 5:42 AM