REBOL [ title: "TURTLE (sTUff foR Testing Links on the nEt)" date: 2000-11-12 Name: "TURTLE" file: %turtle.r author: "Francine Le Peintre" email: francine.lepeintre@ias.u-psud.fr organization: "D2SET" Web-Site: http://www.multimania.com/d2set/ Version: 3.1 History: { created on 2000-11-12 (1.0) modified on 2000-11-15 (1.1) modified on 2000-12-10 (2.0) modified on 2000-12-15 (2.1) modified on 2000-12-16 (2.2) modified on 2000-12-17 (3.0) modified on 2000-12-18 (3.1) } Language: "English" Purpose: { TURTLE contains functions to test urls and local links : - test-url/file looks for all the urls inside a html file, tests it and write the unreachable urls into a text file. - test-url/dir looks for all html files inside a directory (optionnaly, this can be done recursively), and calls test-url/file function for each of them. - test-link/file does the same as test-url/file, but for local links instead of urls. - test-link/dir does the same as test-url/dir, but calls test-link/file instead of test-url/file function. } Usage: { type first do %turtle.r and then any of these functions : test-url/file %file.html %listfile this appends into listfile the unreachable urls found in file.html. test-url/dir %path %listfile this appends into listfile the unreachable urls found in the html files present in path directory. In listfile, the list of these urls for each file are preceded by the name of the file on a line beginning with "#####". test-url/dir/r %path %listfile does the same recursively through all the arborescence of path directory. test-link/file %file.html %listfile this appends into listfile the broken links found in file.html test-link/dir %path %listfile this appends into listfile the broken links found in the html files present in path directory. In listfile, the list of these links for each file are preceded by the name of the file on a line beginning with "#####". test-link/dir/r %path %listfile does the same recursively through all the arborescence of path directory. } Comments: { if you wish no standard outputs, set verbose variable (see below) to false. } ] ;################ TURTLE - MAIN ###################### verbose: true ; verbose mode - replace true by false for no outputs secure allow if verbose [ print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" print " Welcome in TURTLE :" print " sTUff foR Testing Links on the nEt " print " " print "-----------------------------------------------------------------------" print system/script/header/Purpose print " " print " for use of TURTLE functions, type turtle-help" print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" ] help: system/script/header/Usage ;-------------- turtle-help function ------------------- turtle-help: func[ { prints a help for turtle} ] [print help ] ;----------------------- test object --------------------- ; constructor for test-url and test-link object; ; contains dir function that is common to both test: make object! [ ;- - - - - - dir function - - - - - - dir: func[ {scans through all the html files of the directory given in argument, and calls file function that looks for the strings containing an url, and tries the connection to each of these urls ; if this failed, the url is stored in Check-File file.} dir-to-scan "the directory to search in" [file!] outputfile " the text file into which unreachable urls are stored" [file!] /r "refinement : recursive scan of directory dir-to-scan" ] [ if verbose [print [ "search of html files inside" dir-to-scan]] files: make block! 100 ; block for storing names of the ; files inside the current directory dirs: make block! 100 ; block for storing names of the ; directories inside the current directory foreach fileindir read dir-to-scan [ ; list everybody inside current directory ; and parts files and directories into ; the two blocks files and dirs either #"/" = last fileindir [ ; case fileindir is a directory append dirs fileindir ] [ ; case fileindir is a file append files fileindir ] ] ; end of foreach ; treats first the files foreach fileindir files [ curfile: join dir-to-scan fileindir ; forms the complete path to fileindir self/file curfile outputfile ; ] either r [ ; treats now recursively the directories, if recursive refinement is set foreach dirindir dirs [ curdir: join dir-to-scan dirindir ; forms the complete path to dirindir dir/r curdir outputfile ] ][] ; end of either return ] ; end of dir function ]; end of test object ; --------------- test-url object -------------------------- test-url: make test [ ;- - - - - - - file function - - - - - - file: func[ {scans through the html file given in argument, looks for the strings containing a :// and tries the connection to each of these URLs.} file-to-scan "the file to search in" [file!] outputfile "the text file into which unreachable urls are stored" [file!] ] [ set [path target] split-path file-to-scan ; retrieves directory (path) of file-to-scan strfile: to-string file-to-scan ; converts file type to string if (find strfile ".htm") <> none [ ; selects only the html files if verbose [ print [" search of urls inside " file-to-scan]] write/append/lines outputfile rejoin [ "#####" file-to-scan " :" ] ; writes the name of the file into outputfile tag-text: load/markup file-to-scan ; retrieves block text of file-to-scan foreach tag tag-text [ ; lists all the items of the text block if tag? tag [ ; seeks if it's a tag if parse tag [ "A" thru "HREF=" [{"} copy link to {"} | copy link to ">"] to end ] [ strurl: to-string link ; converts to string (in fact it is already a string ...) url: to-url link ; converts to url type if (find strurl "://") <> none [ ; tests if reading url returns an error either error? test: try [read url] [ ; something is wrong in reading url res: probe disarm test if (res/id = 'no-connect)[ ; it's really serious write/append/lines outputfile rejoin [ strurl " " now] if verbose [print [ " " strurl ": cannot connect"]] ] if (res/id = 'message)[ ; it can be serious if (find res/arg1 "404 Not Found") [ write/append/lines outputfile rejoin [ strurl " " now] if verbose [print [ " " strurl ": cannot connect"]] ] ] ] [ ; do nothing: all is right if verbose [ print [ " " strurl ": OK"]] ] ] ;end of if (find strurl... ] ; end of if parse tag ... ] ; end of if tag? tag ... ] ; end of foreach ] ; end of find strfile ] ; end of file function ] ; end of test-url object ;--------------- test-link object -------------------------- test-link: make test [ ;- - - - - - - file function - - - - - - file: func[ {scans through the html file given in argument, looks for the strings beginning with "href=" that contains not a :// or a mailto: and tries to open each of these links.} file-to-scan "the file to search in" [file!] outputfile "the text file into which broken links are stored" [file!] ] [ set [path target] split-path file-to-scan ; retrieves directory (path) of file-to-scan strfile: to-string file-to-scan ; converts file type to string if (find strfile ".htm") <> none [ ; selects only the html files if verbose [ print [" search of links inside " file-to-scan]] write/append/lines outputfile rejoin [ "#####" file-to-scan " :" ] ; writes the name of the file into outputfile tag-text: load/markup file-to-scan ; retrieves block text of file-to-scan string-text: to-string tag-text ; converts block text into string (used to look for bookmark) foreach tag tag-text [ ; lists all the items of the text block if tag? tag [ ; seeks if it's a tag if parse tag [ "A" thru "HREF=" [{"} copy strlk to {"} | copy strlk to ">"] to end ] [ strlink: join path strlk ; forms the complete path to the file if all [(find strlink "://") = none (find strlink "mailto:") = none] [ ; link is a local link (not an url or a mailto) either (find strlink "#") <> none [ ; strlink is a bookmark either (first strlink) = "#" [; strlink is a bookmark in the same page (# is the first character) if (seek-bookmark strlink string-text) = 'false [ ; if the tag does not exist, appends in outputfile write/append/lines outputfile rejoin [ strlink " " now] if verbose [print [ " " strlink ": bookmark does not exists"]] ] ] [ ; strlink is a bookmark in another page splitblock: parse strlink {#} ; splits strlink into the name of the file and the bookmark strbookmarkfile: first splitblock ;retrieves path to the file either (seek-file strbookmarkfile) = 'false [ ; if file does not exist write/append/lines outputfile rejoin [ strlink " " now] if verbose [print [ " " strlink ": file does not exists"]] ] [ ; if file exists bookmark: join "#" second splitblock ; forms the bookmark page-text: to-string load/markup (to-file strbookmarkfile) either (seek-bookmark bookmark page-text) = 'false [; if the tag does not exist write/append/lines outputfile rejoin [ strlink " " now] if verbose [print [ " " strlink ": bookmark does not exists"]] ] [; do nothing : all is right if verbose [print [ " " strlink ": OK"]] ] ]; end of either (seek-file ... ]; end of either first strlink ... ] [ ; link is not a bookmark either (seek-file strlink) = 'false [ write/append/lines outputfile rejoin [ strlink " " now] if verbose [print [ " " strlink ": cannot open"]] ] [ ; do nothing : all is right if verbose [print [ " " strlink ": OK"]] ] ] ; end of either (find strlink ... ] ; end of if all ... ] ; end of if parse tag ... ] ; end of if tag? tag ... ] ; end of foreach ] ; end of find strfile ] ; end of file function ;- - - - - - - - - - - seek-bookmark function - - - - - - - seek-bookmark: func[{ looks inside page-text for the "}] ; forms the matching tag check: find page-text bookmarktag ; looks for it in the page either check = none [ ; if the tag does not exist, returns false return 'false ] [ return 'true ] ]; end of seek-bookmark function ;- - - - - - - - - seek-file function - - - - - - - - seek-file: func[{looks if file named strfile does exists ; returns 'true if yes, 'false if not} strfile "the name of the file to look for" [string!] ] [ thefile: to-file strfile ; converts string into file type test: error? try [read thefile] ; tests if reading file returns an error either test [ mesg: disarm try [read thefile] res: probe mesg/id either (res = 'cannot-open)[ return 'false ; file does not exists ] [ return 'true ; there's trouble to read the file, but likely it's not serious ] ] [ return 'true ; all is right ] ]; end of seek-file function ]; end of test-link object