Here's one approach to read this (using the two packages I support, and the awesome splitstackshape package). You will need the development version of qdapTools, installed from GitHub.
## Read the 2014 marathon list page, split it into per-country chunks, and
## break each data line into columns at hand-located character positions.
## Needs the development version of qdapTools from GitHub.
devtools::install_github("trinker/qdapTools")

library(qdapTools)
library(qdapRegex)
library(splitstackshape)

url <- "http://www.arrs.net/MaraList/ML_2014.htm"

## Drop lines 1-7 and 2760-2767 of the page. The indices are hard-coded;
## presumably they are the non-data header/footer lines -- verify against
## the live page, since they break if the page length changes.
m <- readLines(url)[-c(1:7, 2760:2767)]

## Split into lists by country (split points are the lines matching "<B><FONT")
x <- loc_split(m, unique(grep("<B><FONT", m)))

## Clean up country names: take the first line of every chunk and strip the
## angle-bracketed markup. vapply() guarantees a character vector result.
nms <- rm_angle(vapply(x, `[`, character(1), 1))

## Remove the html country name line from the data and convert to a
## data.frame; `[, 2:1]` swaps the two columns so "Country" comes first.
dat <- list2df(setNames(lapply(x, `[`, -1), nms), "dats", "Country")[, 2:1]

## Use hand parsing technique to locate widths
## I added a # before each column in row one of data
## gregexpr tells us the location of the # characters
## NOTE(review): the spacing inside this template must match the fixed-width
## layout of the real data lines exactly. It looks like runs of spaces may
## have been collapsed when this snippet was copied -- confirm against a raw
## data line before trusting the positions.
det <- "AAR #26#Jan #King George Island # #27+25 #White Continent #4:03:30 #Steve Hibbs (USA) #4:13:02 #Suzy Seeley (54,TX/USA) "
widths <- gregexpr("#", det)[[1]]

## Replace the character at each of those positions with "#", as it is not
## anywhere else in the data set. The assignment is vectorised over every
## row of the "dats" column.
for (i in widths) {
  substring(dat[["dats"]], i, i) <- "#"
}

## Split column 2 into separate columns on the # character
out <- cSplit(dat, 2, sep = "#")
out
source share