In R, the download.file() function makes it easy to grab HTML pages from a website. I use it together with regular expressions to extract the address of each individual HTML file from the index web page, so that I can then download every file one by one.
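Before the full script, here is a minimal sketch of the pattern using only base R; the URL below is a placeholder, not the one used in the script that follows.

# minimal sketch: fetch a page, then pull out the href targets with a regular expression
index_url <- 'http://example.com/index.html'                    # placeholder URL
if (!file.exists('index.html')) download.file(index_url, 'index.html')

page  <- readLines('index.html')
links <- grep('<a href="', page, fixed = TRUE, value = TRUE)    # keep only lines containing links
hrefs <- sub('.*<a href="([^"]+)".*', '\\1', links)             # extract the href value itself

The actual script applies the same idea to the UEFA coefficients index page: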
# TODO: Get data from website
#
# Author: Roger Everett
###############################################################################

setwd('Your directory')   # set this to your own working directory

############################################################
# Constant declarations
############################################################
items     <- c('match','ccoef','crank','tcoef','trank')
method    <- c('method1','method2','method3','method4')
base_path <- 'http://www.xs4all.nl/~kassiesa/bert/uefa/data/'

# first download the main (index) page
url <- 'http://www.xs4all.nl/~kassiesa/bert/uefa/data/index.html'

# as a courtesy to the maintainers of the web site: if the index file already
# exists locally, don't strain the server by downloading it again
if (!file.exists("index.html")) download.file(url, 'index.html', method = 'auto')

indexFile    <- file("index.html", 'r')   # open the saved file read-only
indexContent <- readLines(indexFile)

# extract the individual file URLs from the index page
match <- grep("^(<td><a href=\"method.)", indexContent, perl = TRUE, value = TRUE)

# url_name stores the unique identifier of each html file, e.g. method3/coef2005.html
url_name <- sapply(match, function(x) { substr(x, 14, 35) },
                   simplify = TRUE, USE.NAMES = FALSE)

# url_list holds the full URL of each html file
url_list <- sapply(url_name, function(x) { paste(base_path, x, sep = '') },
                   simplify = TRUE, USE.NAMES = FALSE)

close(indexFile)

# create the output directories if they do not already exist
for (m in method) {
  if (!file.exists(m)) dir.create(m)
}

# gather the data: download each file unless it is already on disk
for (i in 1:length(url_list)) {
  if (!file.exists(url_name[i]))
    download.file(url_list[i], url_name[i], method = 'auto')
}
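One caveat: the substr(x, 14, 35) call assumes every matched line has exactly the same layout, so a one-character shift in the page markup would break the extracted paths. A more robust variation (a sketch of an alternative, not part of the original script) is to capture whatever sits inside href="..." with a regular expression, reusing the match vector produced by the grep() call above:

# alternative to the fixed-position substr(): capture the href value directly
url_name <- sub('.*href="([^"]+)".*', '\\1', match)
url_list <- paste(base_path, url_name, sep = '')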