Zuma Lifeguard Wiki
Advertisement

This is part of the WikiTaxi article.

Script File: ss_AllPages.vbs

' Usage: <script> <htmlFile> <outputFolder> <wikiSite>
'
' Loads a saved HTML page in Internet Explorer, finds every link inside the
' table whose ID is "fly1", and echoes one wget command per link that exports
' the linked page through <wikiSite>/wiki/Special:Export/. The echoed lines
' appear intended to be captured and run as a batch file - TODO confirm
' against the caller.
If WScript.Arguments.Count < 3 Then
	wscript.echo "must specify parameter"
	wscript.quit
End If

Set htmlFile = GetFileFromArgument( 0 )
Set outputFolder = GetFolderFromArgument( 1 )
wikiSite = WScript.Arguments.Item(2)

' Load the htmlFile and get a pointer to the all tag of the DOM
Set ie = CreateObject("InternetExplorer.Application")
Set all = GetIEForHTML( ie, htmlFile.Path )

'============
' Script Specific Logic
' Makes sure all tables have an ID
FixIDs "table"

' Grab all the 'a' tags; the loop below keeps only those inside the table
' called 'fly1' (FixIDs numbers from 0, so this is the second table that had
' no ID of its own).
Set alist = all.tags("a")

' Internet Explorer runs as a separate process, so it must be told to quit
' before the script exits early; otherwise the browser instance is left
' running.
If alist Is Nothing Then
	wscript.echo "could not get alist"
	ie.Quit
	wscript.quit
End If
If alist.length = 0 Then
	wscript.echo "alist empty"
	ie.Quit
	wscript.quit
End If

elementCount = 0
For Each a In alist
	If UCase(a.parentElement.tagName) = "TD" Then
		' Walks up four levels from the link; presumably TD -> TR -> TBODY -> TABLE.
		If a.parentElement.parentElement.parentElement.parentElement.id = "fly1" Then
			' GetIndexAfterToken returns 0 when the href has no "/wiki/" part.
			' Mid() fails on a start position of 0, so such links are skipped
			' instead of aborting the whole run.
			urlStart = GetIndexAfterToken( a.href, "/wiki/" )
			If urlStart > 0 Then
				' Every % is doubled - presumably so that a batch interpreter
				' passes a literal % through to wget.
				urlEnd = Replace( Mid( a.href, urlStart, 999 ), "%", "%%" )
				wscript.echo "wget -O """ & outputFolder.Path & "\" & strClean(a.innerText) & ".xml"" " & wikiSite & "/wiki/Special:Export/" & urlEnd
				' Presumably a one-second pause between downloads, which can be
				' disabled by defining the pingskip variable.
				wscript.echo "if not defined pingskip ping 1.1.1.1 -n 1 -w 1000"
				elementCount = elementCount + 1
			End If
		End If
	End If
Next
wscript.echo "set alistelementcount=" & elementCount
'============

ie.Quit

' Resolves a command-line argument to a File object.
'   arg - zero-based position of the argument in WScript.Arguments
' Returns the Scripting.FileSystemObject File for the path given in that
' argument.
Function GetFileFromArgument( ByVal arg )
	Dim fileSystem, filePath

	Set fileSystem = CreateObject("Scripting.FileSystemObject")
	filePath = WScript.Arguments.Item(arg)
	Set GetFileFromArgument = fileSystem.GetFile( filePath )
End Function

' Resolves a command-line argument to a Folder object.
'   arg - zero-based position of the argument in WScript.Arguments
' Returns the Scripting.FileSystemObject Folder for the path given in that
' argument.
Function GetFolderFromArgument( ByVal arg )
	Dim fileSystem, folderPath

	Set fileSystem = CreateObject("Scripting.FileSystemObject")
	folderPath = WScript.Arguments.Item(arg)
	Set GetFolderFromArgument = fileSystem.GetFolder( folderPath )
End Function

' Gives every element of the requested tag type that has no ID a generated
' one of the form "fly<n>". Numbering starts at 0 and advances only when an
' ID is actually assigned, in the order the collection enumerates elements.
'   tagName - tag to look up through the script-level "all" collection
' Echoes a message and ends the script if the tag collection is unavailable.
Sub FixIDs( ByVal tagName )
	Dim elements, element, nextId

	Set elements = all.tags(tagName)
	If elements Is Nothing Then
		wscript.echo "could not get tagList:" & tagName
		wscript.quit
	End If

	nextId = 0
	For Each element In elements
		If Len(element.id) = 0 Then
			element.id = "fly" & nextId
			nextId = nextId + 1
		End If
	Next
End Sub

' Navigates an Internet Explorer instance to an HTML file and returns the
' loaded document's "all" collection.
'   ie   - InternetExplorer.Application object to drive
'   path - location of the HTML file to load
' Returns ie.document.all once the browser reports the load is complete, or
' after MAX_WAIT_MS has elapsed, whichever comes first.
Function GetIEForHTML( ByVal ie, ByVal path )
	Const READYSTATE_COMPLETE = 4
	Const POLL_INTERVAL_MS = 100
	Const MAX_WAIT_MS = 30000
	Dim waitedMs

	' Navigate to the path that was passed in rather than reaching for the
	' script-level htmlFile variable, so the function honours its argument.
	' NOTE(review): flag 1024 looks like navTrustedForActiveX from
	' BrowserNavConstants - confirm.
	ie.Navigate path, CLng(1024)

	' Poll until the page has finished loading instead of sleeping for a fixed
	' period; the cap keeps the script from hanging if loading never finishes.
	waitedMs = 0
	Do While (ie.Busy Or ie.ReadyState <> READYSTATE_COMPLETE) And waitedMs < MAX_WAIT_MS
		WScript.Sleep POLL_INTERVAL_MS
		waitedMs = waitedMs + POLL_INTERVAL_MS
	Loop

	Set GetIEForHTML = ie.document.all
End Function

' Finds the position immediately after the first occurrence of a token.
'   outerText - text to search
'   searchFor - token to look for
' Returns, as a Long, the 1-based position of the character that follows the
' token, or 0 when InStr does not find the token.
Function GetIndexAfterToken(ByVal outerText, ByVal searchFor )
    Dim tokenPos

    tokenPos = InStr( outerText, searchFor )
    If tokenPos > 0 Then
        GetIndexAfterToken = CLng( tokenPos + Len( searchFor ) )
    Else
        GetIndexAfterToken = CLng( tokenPos )
    End If
End Function


' Turns arbitrary text into a tamer string by replacing punctuation with "-".
'   strtoclean - text to sanitise
' Each run of the characters ( ? * " , \ < > & # ~ % { } + _ . @ : / ! ;
' becomes a single "-", and then any run of consecutive "-" is collapsed to
' one. All other characters pass through unchanged.
' NOTE(review): "(" is replaced but ")" is not - confirm whether that is
' intentional.
Function strClean(strtoclean)
   Dim sanitizer, cleaned

   Set sanitizer = New RegExp
   With sanitizer
      .Global = True
      .IgnoreCase = True

      ' Pass 1: swap each run of unwanted characters for one dash.
      .Pattern = "[(?*"",\\<>&#~%{}+_.@:\/!;]+"
      cleaned = .Replace(strtoclean, "-")

      ' Pass 2: merge dashes that ended up next to each other.
      .Pattern = "\-+"
      cleaned = .Replace(cleaned, "-")
   End With

   strClean = cleaned
End Function

This can also serve as an example for screen-scraping.

Advertisement