Data fetch, infomation extraction and build web application: all in R!

Several years ago, I created a web page to show the exam scores of schools in Norway on Google Map. The original intention of creating that web page is to help us avoid the area where the schools’ score are low when we want to change our apartment. Recent days, I found many people have intresting of this kind information. So, I created a web-app for that purpose with R. In case somebody want to do the similar work, I wrote the steps how I created the web-app. You can modify some of my codes to create the app you needed.

Step 1: fetch data from internet.

Note, if you have got your data ready, just skip this step.

The exam scores data can be found in https://skoleporten.udir.no. We can see there are 4 figures, and what I have interesting is the percentage of the students fall into to level 3 (high) for each subject. There is an option allow you to download the data of the figures mannually, but I don’t want to do that mannually, so I downloaded the source of this page and found the the figure’s data was stored in JSON format. JSON format array can be easily converted into list class of R, which is much easier to mannage for me. The page contains the average scores of whole Norway, and we need to find all schools’ URLs. By inspecting the elements of the page, we can find the there are 19 fylkes (counties), see the following screenshot:

If we click one fylke, we can find some schools inside that fylke, but not all. However, if you inspect the elements of the page again, we can see all the rest schools were also listed, but weren’t shown.

So, we can get all schools’ URLs and download their pages. The following is the R codes for downloading all schools’ pages.

library(XML)
library(RCurl)
library(RJSONIO)

## Fylker ##
udir <- "https://skoleporten.udir.no"
Norge <- "https://skoleporten.udir.no/rapportvisning/grunnskole/laeringsresultater/nasjonale-proever-5-trinn/nasjonalt?enhetsid=00&vurderingsomrade=11&underomrade=50&skoletype=0&skoletypemenuid=0&sammenstilling=1"
HTM <- getURL(Norge)
XML <- htmlParse(HTM, encoding = "UTF-8")
XML <- xpathApply(XML, "//li[@class='container fylker']/ul[1]/li/a[@href]")

n <- 0
for (f in 1:length(XML)) {
  urlF <- xmlAttrs(XML[[f]], "href")[2]
  fylk <- regmatches(urlF, regexec("trinn/(.*?)\\?", urlF))[[1]][2]
  dir_target <- paste("/srv/hd/NasjonalePrøver5/Data/", fylk, sep = "")
  if (!dir.exists(dir_target)) { dir.create(dir_target)}
  htm <- getURL(paste(udir, urlF, sep = ""))
  xml <- htmlParse(htm, encoding = "UTF-8")
  xml <- xpathApply(xml, "//li[@class='container']/ul[@class='linkGroup'][2]/li/a")
  m <- 0
  for (s in 1:length(xml)) {
    url <- paste(udir, xmlAttrs(xml[[s]], "href")[2], sep = "")
    skole <- regmatches(url, regexec("trinn/(.*?)\\?", url))[[1]][2]
    download.file(url = url, destfile = paste(dir_target, "/", skole, ".html", sep = ""), quiet = TRUE)
    n <- n + 1; m <- m + 1
    print(paste(n, m, fylk, skole, sep = ": "))
  }
}


Step 2: Geocoding schools address

The information about each schools contains no geo-coordinates. To show the schools on map we need their longitudes and latitudes. We can match their geo-coordinates through Google Map API. By constructing a URL address contains the schools name, address and “Norway”, we can get the latitude and longtitude of the schools by send a request to Google. The schools names and address inforamtion can extracted using xpath, here is the codes:

setwd("/srv")
file_ls <- list.files(path = "hd/NasjonalePrøver5/Data",recursive = TRUE, full.names = TRUE)
DAT <- data.frame(Fylke = rep(NA, length(file_ls)), Kommuner = NA, Skole = NA, Address = NA)

for (i in 1:length(file_ls)) {
  if (exists("XML")) rm(XML)
  XML <- htmlParse(getURL(file_ls[i], encoding = "UTF-8"))  
  xmlSkole <- xpathApply(XML, "//span[@class='enhetsnavn']")
  xmlAddress <- xpathApply(XML, "//div[@class='adressekolonne block']/div/div")
  xmlKommuner <- xpathApply(XML, "//div[@class='skoleeierkolonne block']/div/span[2]") 
  skole <- gsub("^ *| *$", "", gsub("\n *|  *", " ", xmlValue(xmlSkole[[1]]))) 
  address <- gsub("^ *| *$", "", gsub("\n *|  *", " ", paste (xmlValue(xmlAddress[[1]]), xmlValue(xmlAddress[[2]]))))
  kommuner <- gsub(" kommune skoleeier", "", gsub("\n *|  *", " ", xmlValue(xmlKommuner[[1]])))
  fylk <- regmatches(file_ls[i], regexec("Data/(.*?)/", file_ls[i]))[[1]][2]
  DAT$Fylke[i] <- fylk
  DAT$Kommuner[i] <- kommuner
  DAT$Skole[i] <- skole
  DAT$Address[i] <- address 
  print(i)
}

## Geocoding
DAT[, c("Lon", "Lat")] <- NA
googleMap <- "http://maps.googleapis.com/maps/api/geocode/xml?sensor=false&address="
for (i in 1:nrow(DAT)) { 
  url <- paste(googleMap, paste(paste(DAT$Address[i], DAT$Skole[i], DAT$Kommuner[i], sep= " "), "Norway", sep = " "), sep = "")
  url <- gsub(" ", "%20", url)
  if (exists("xml")) {rm(xml)}
  xml <- getURL(url); Sys.sleep(.5)
  if (exists("xml")) {
    DAT[i, 'Lon'] <- as.numeric(regmatches(xml, regexec("<lng>(.+?)</lng>", xml))[[1]][2])
    DAT[i, 'Lat'] <- as.numeric(regmatches(xml, regexec("<lat>(.+?)</lat>", xml))[[1]][2])
    print(i)
  }
}

save(DAT, file = "hd/NasjonalePrøver5/Data/SkoleGeo.RData")

Step 3: Information extraction.

This step acctually can be included into above step. However, geocoding may take longer time, and Google has a limited number of request, so, it’s better do it seperately.

Scores information stored in figures, and figure’s data stored as JavaScript array. Using regular expression may extract the information needed, but it’s much easier if we convert JavaScript array into list class of R. See the codes below:

setwd("/srv")
file_ls <- list.files(path = "hd/NasjonalePrøver5/Data", pattern = ".html", recursive = TRUE, full.names = TRUE)
load("hd/NasjonalePrøver5/Data/SkoleGeo.RData")

DAT$Engelsk2014 <- DAT$Engelsk2015 <- 
DAT$Lesing2014  <- DAT$Lesing2015 <-
DAT$Regning2014 <- DAT$Regning2015 <- NA

for (i in 1:length(file_ls)) {
  if (exists("XML")) rm(XML)
  ## htmlParse(getURL("hd/NasjonalePrøver5/Data/akershus-fylke/algarheim-skole.html"), encoding = "UTF-8")
  XML <- htmlParse(getURL(file_ls[i], encoding = "UTF-8"))
  
  ## Figure data
  xmlScript <- xpathApply(XML, "//script[contains(., 'showDiagram')]")
  if (length(xmlScript) == 4) {
    ## 2 Engelsk
    xmlEngelsk <- gsub("^.*?, '|',.*$", "", gsub("\\", "", xmlValue(xmlScript[[2]]), fixed = TRUE))
    if (grepl("chartType", xmlEngelsk)) {
      jsonEngelsk <- fromJSON(xmlEngelsk)
      for (n in 1: length(jsonEngelsk$categories)) {
        id <- jsonEngelsk$categories[[n]]$id
        if (id == "2014-2015") {
          v1 <- jsonEngelsk$series[[1]]$data[[n]]$verdi
          if (!is.null(v1)) {DAT$Engelsk14[i] <- v1}
        }
        if (id == "2015-2016") {
          v2 <- jsonEngelsk$series[[1]]$data[[n]]$verdi
          if (!is.null(v2)) {DAT$Engelsk15[i] <- v2}
        }
      }
    }
    
    ## 3 Lesing
    xmlLesing <- gsub("^.*?, '|',.*$", "", gsub("\\", "", xmlValue(xmlScript[[3]]), fixed = TRUE))
    if (grepl("chartType", xmlLesing)) {
      jsonLesing <- fromJSON(xmlLesing)
      for (n in 1: length(jsonLesing$categories)) {
        id <- jsonLesing$categories[[n]]$id
        if (id == "2014-2015") {
          v1 <- jsonLesing$series[[1]]$data[[n]]$verdi
          if (!is.null(v1)) {DAT$Lesing14[i] <- v1}
        }
        if (id == "2015-2016") {
          v2 <- jsonLesing$series[[1]]$data[[n]]$verdi
          if (!is.null(v2)) {DAT$Lesing15[i] <- v2}
        }
      }
      
    }
    
    ## 4 Regning
    xmlRegning <- gsub("^.*?, '|',.*$", "", gsub("\\", "", xmlValue(xmlScript[[4]]), fixed = TRUE))
    if (grepl("chartType", xmlRegning)) {
      jsonRegning <- fromJSON(xmlRegning)
      for (n in 1: length(jsonRegning$categories)) {
        id <- jsonRegning$categories[[n]]$id
        if (id == "2014-2015") {
          v1 <- jsonRegning$series[[1]]$data[[n]]$verdi
          if (!is.null(v1)) {DAT$Regning14[i] <- v1}
        }
        if (id == "2015-2016") {
          v2 <- jsonRegning$series[[1]]$data[[n]]$verdi
          if (!is.null(v2)) {DAT$Regning15[i] <- v2}
        }
      }
    }
  }
  print(i)
}
save(DAT, file = "hd/NasjonalePrøver5/Data/DAT.RData")


Step 4: create a web-app for interactively display and quire.

I will not present all files needed for creating a Shiny web-app, because even I post them all here, you may still can’t make the web-app run. But you can get all files on my gist.

ui.R


library(rCharts)
shinyUI(bootstrapPage(
  # Add custom CSS & Javascript;
  tagList(
    tags$head(
      tags$style("hr {margin: 5px 0;}"),
      tags$script(src="jquery-ui.js"),
      tags$script(src="app.js"),
      tags$link(rel="stylesheet", type="text/css",href="style.css")
    )
  ),
  img(id = "control", src = "control.png"),
  div(class = "Input", 
      HTML("<div class='drag'></div>"),
      uiOutput("uiFylke"),
      uiOutput("uiYear"),
      uiOutput("uiSubject"),
      uiOutput("uiChart"), 
      HTML("<hr></hr>"),
      uiOutput("uiLegend"),
      textInput(inputId = "log", label = "")),
  div(class = "control", 
      uiOutput("uiMapType")),
  div(class = "Output", 
      HTML("<div class='drag'></div>"),
      showOutput("fig", lib = "highcharts")),
  showOutput("myMap", "leaflet")
))

server.R

load("Data/DAT.RData")
shinyServer(function(input, output, session) {
  p <- reactiveValues()
  observe({
      msg <- gsub(".*ipInfo", paste(date(), "ipInfo"), input$log)
      if (nchar(msg) > 20) {
        write(msg, "www/visitors.txt", append  = TRUE)
      }  
  })
  ## Source input and output;
  source("RScript/input.R",  local = TRUE)
  source("RScript/jsFun.R", local = TRUE)
  source("RScript/output.R", local = TRUE)
})

output.R

observe({
  if (!is.null(input$chart)) {
    if (input$chart) {
      output$fig <- renderChart({
        ## Highchart basic;
        H <- Highcharts$new()
        H$addParams(dom = "fig")
        H$chart(type = "column", zoomType = "xy", width=1000, height = 600, spacingTop = 10, inverted = TRUE)
        H$subtitle(text = " ", style = list(fontSize = "14px", color = "black"))
        H$exporting(enabled = TRUE)
        H$legend(enabled = FALSE)
        H$data(p$dat$score)
        H$xAxis(tickLength = 0, 
                categories = p$dat$Skole,
                title      = list(text = "Skole"))
        H
      })
    }
  }
})

observe({
  if (!is.null(input$fylke)) {
    output$myMap <- renderMap({
      m <- Leaflet$new()
      m$addParams(width="100%", height=1000, layerOpts = list(maxZoom = 18))
      if (input$mapType == "openstreet") {
        m$addParams(urlTemplate= 'http://{s}.tile.openstreetmap.de/tiles/osmde/{z}/{x}/{y}.png')
      } else if (input$mapType == "statllite") {
        m$addParams(urlTemplate= 'http://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}')
      } else if (input$mapType == "opencycle") {
        m$addParams(urlTemplate= 'http://{s}.tile3.opencyclemap.org/landscape/{z}/{x}/{y}.png')
      }
      ## Fylke
      dat <- DAT[which(DAT$Fylke %in% input$fylke), ]    
      ## Year
      id <- 1:6
      for (yr in input$year) {
          id <- c(id, grep(yr, names(dat)))
      }
      dat <- dat[, id]
      ## Subject
      id <- 1:6
      for (s in input$subject) {
        id <- c(id, grep(s, names(dat)))
      }
      dat <- dat[, id]  
      if (is.data.frame(dat[, -c(1:6)]) ) {
        dat$score <- round(rowMeans(dat[, -c(1:6)], na.rm = TRUE),1)
      } else {
        dat$score <- dat[, -c(1:6)]
      }
      dat <- dat[which(!is.na(dat$Lon) & !is.na(dat$Lat) & !is.na(dat$score)), ]
      print(dat)    
      lon <- mean(dat$Lon)
      lat <- mean(dat$Lat)    
      m$addParams(bounds = list(c(lat-.1,lon-.1), c(lat+.1,lon+.1)))
      ## Build GeoJSON list;
      Dat <- list()
      scoreUni <- sort(unique(round(dat$score)))
      color_ls <-   colorRampPalette(c("#00007F", "blue", "#007FFF", "cyan", "#7FFF7F", "yellow", "#FF7F00", "red", "#7F0000"))(length(scoreUni))
      for (i in 1:nrow(dat)) {
        geoList <- list(type = "Feature", 
                        geometry = list(type = "Point", coordinates = c(dat$Lon[i], dat$Lat[i])),
                        properties = list(color = "black", 
                                          fillColor = color_ls[which(scoreUni == round(dat$score[i]))],
                                          fillOpacity = .8,
                                          radius = 10*dat$score[i]/100 + 3,
                                          popup = paste(paste(c("School", "Address", "Score"), dat[i, c("Skole", "Address", "score")], sep = ": "), collapse = "<br>")
                        ))
        Dat[[length(Dat)+1L]] <- geoList
      }
      p$dat <- dat[rev(order(dat$score)), ]
      p$scoreUni <- scoreUni
      p$color_ls <- color_ls
      m$geoJson(Dat, onEachFeature = jsOnEachFeature,
                style = jsStyle, pointToLayer =jsPointToLayer)
      m
    })
  }
})

I hope this post can help you to build some simple web application.

Written on February 8, 2016