In R, there is a very powerful function, download.file(), for grabbing HTML from websites. I use this function in combination with regular expressions to extract the address of each linked page from the index page, so that I can then download every individual HTML file.
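The basic call just takes a source URL and a destination file. Here is a minimal sketch; the URL is only a placeholder and is not one of the pages used in the script below:

# minimal sketch of download.file(): fetch one page and save it locally
# (example.org is a placeholder, not part of the script that follows)
download.file("http://www.example.org/index.html",
              destfile = "index.html", method = "auto")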
# TODO: Get data from website
#
# Author: Roger Everett
###############################################################################
setwd('Your directory')
############################################################
#Constant Declaration
############################################################
items <- c('match','ccoef','crank','tcoef','trank')
method <- c('method1','method2','method3','method4')
base_path <- 'http://www.xs4all.nl/~kassiesa/bert/uefa/data/'
# first download the main page
url <- 'http://www.xs4all.nl/~kassiesa/bert/uefa/data/index.html'
# as a courtesy to the maintainers of the web site, skip the download if the
# index file already exists, so we don't strain the server
if (!file.exists("index.html")) download.file(url, 'index.html', method = 'auto')
indexFile <- file("index.html",'r') # open the saved file for read only
indexContent <- readLines(indexFile)
# extract the individual file URLs from the index page
match <- grep("^(<td><a href=\"method.)", indexContent, perl = TRUE, value = TRUE)
# url_name stores the unique identifier of each html file, e.g. method3/coef2005.html;
# the link target starts at character 14, right after the '<td><a href="' prefix
url_name <- sapply(match, function(x) { substr(x, 14, 35) },
                   simplify = TRUE, USE.NAMES = FALSE)
# url_list holds the full URL of each html file
url_list <- sapply(url_name, function(x) { paste(base_path, x, sep = '') },
                   simplify = TRUE, USE.NAMES = FALSE)
close(indexFile)
# create one output directory per method; note that dir() matches by regular
# expression via its 'pattern' argument, not by a shell glob like "method*"
if (length(dir(pattern = "^method")) != length(method)) {
  for (i in 1:length(method)) {
    dir.create(method[i], showWarnings = FALSE)
  }
}
# gather data
for (i in 1:length(url_list)) {
  if (!file.exists(url_name[i])) download.file(url_list[i], url_name[i], method = 'auto')
}
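As an aside, the substr() call above relies on every link target having the same length. A slightly more flexible way to pull the targets out of the matched lines is regmatches(); this is only a sketch against the same indexContent vector, assuming each relevant line contains an href of the form "methodN/...html", and is not what the script above uses:

# sketch: extract the href target of every method link with a regular
# expression instead of fixed character positions
m        <- regexpr('href="method[^"]+\\.html"', indexContent, perl = TRUE)
hrefs    <- regmatches(indexContent, m)
url_name <- gsub('href="|"', '', hrefs)
url_list <- paste(base_path, url_name, sep = '')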