## The Iowa City Press-Citizen provides data on ITBS results over time ## at all Iowa City schools at ## http://www.press-citizen.com/interactive/article/99999999/DATA/304220034/Iowa-Tests-Basic-Skills-school-by-school-scores ## I do not know how long the data can be expected to be available. ## ## This code reads in the data for the elementary schools, and does ## some rearranging and some plotting. The code needs a lot of ## cleaning up. ## ## The Percentile and Standard data have been written out and saved in ## http://www.stat.uiowa.edu/~luke/classes/248/examples/ITBS/ICPC-ITBS-Standard.csv ## http://www.stat.uiowa.edu/~luke/classes/248/examples/ITBS/ICPC-ITBS-Percentile.csv ## page <- htmlParse("http://www.press-citizen.com/interactive/article/99999999/DATA/304220003/Coralville-Central-Elementary") ## tabs <- readHTMLTable(page, stringsAsFactors = FALSE) ## extract <- function(tab, rows) { ## data <- as.data.frame(lapply(tab[rows[-1],], as.numeric)) ## names <- as.character(tab[rows[1],]) ## Encoding(names) <- "UTF-8" ## structure(data, names = names) ## } extract <- function(tab, rows) { data <- as.data.frame(lapply(tab[rows[-1],], as.numeric)) names <- as.character(tab[rows[1],]) ## set encoding of names to UTF-8 to help with initial quote Encoding(names) <- "UTF-8" ## trim names to last two digits -- mainly means drop initial quote ncm1 <-nchar(names[-1]) names[-1] <- substr(names[-1], ncm1 - 1, ncm1) structure(data, names = names) } ## missing values seem to be wither 'x' or '?' ## extract <- function(tab, rows) { ## conv <- function(val) ## as.numeric(ifelse(val == "x", NA_character_, val)) ## data <- as.data.frame(lapply(tab[rows[-1],], conv)) ## names <- as.character(tab[rows[1],]) ## structure(data, names = names) ## } ## splitScores <- function(tab) { ## list(Reading = extract(tab, 1 : 5), ## Language = extract(tab, 8 : 12), ## Math = extract(tab, 15 : 19), ## Core = extract(tab, 22 : 26), ## SocialScience = extract(tab, 29 : 33), ## Science = extract(tab, 36 : 40), ## Composite = extract(tab, 43 : 47)) ## } splitScores <- function(tab) { ng <- (nrow(tab) - 7 * 3 + 2) / 7 start <- seq.int(from = 1, by = 3 + ng, length.out = 7) end <- seq.int(from = 1 + ng, by = 3 + ng, length.out = 7) ranges <- mapply(`:`, start, end, SIMPLIFY = FALSE) structure(lapply(ranges, function(r) extract(tab, r)), names = c("Reading", "Language", "Math", "Core", "SocialScience", "Science", "Composite")) } urls <- paste("http://www.press-citizen.com/interactive/article/99999999/DATA", c("304220003/Coralville-Central-Elementary", "304220006/Hills-Elementary", "304220007/Hoover-Elementary", "304220008/Horn-Elementary", "304220010/Kirkwood-Elementary", "304220011/Lemme-Elementary", "304220012/Lincoln-Elementary", "304220014/Longfellow-Elementary", "304220015/Lucas-Elementary", "304220016/Mann-Elementary", "304220021/Penn-Elementary", "304220027/Roosevelt-Elementary", "304220028/Shimek-Elementary", "304220035/Twain-Elementary", "304220036/Van-Allen-Elementary", "304220038/Weber-Elementary", "304220043/Wickham-Elementary", "304220045/Wood-Elementary", "304220024/Regina-Elementary"), sep = "/") ## Secondary Schools: ## "304220020/Northwest-Junior-High" ## "90414003/Heritage-Christian-School" ## "304220030/South-East-Junior-High" ## "304220026/Regina-Junior-High" ## "304220019/North-Central-Junior-High" ## "304220040/West-High" ## "304220025/Regina-High" ## "304220031/Tate-High-School" ## "90414005/City-High" getSchoolData <- function(url) { capture.output(page <- htmlParse(url)) tabs <- readHTMLTable(page, stringsAsFactors = FALSE) list(Percentile = splitScores(tabs[[1]]), Standard = splitScores(tabs[[2]])) } elementarySchools <- c("Coralville-Central", "Hills", "Hoover", "Horn", "Kirkwood", "Lemme", "Lincoln", "Longfellow", "Lucas", "Mann", "Penn", "Roosevelt", "Shimek", "Twain", "Van Allen", "Weber", "Wickham", "Wood", "Regina") library(XML) library(reshape) sd <- lapply(urls, getSchoolData) names(sd) <- elementarySchools sd$Lincoln$Standard$Math ml <- melt(sd$Lincoln$Standard$Math, id = "Grade", variable_name = "Year") head(ml) tapply(ml$value, ml$Grade, mean) mergeSchoolTests <- function(s) { st <- lapply(seq_along(s), function(i) cbind(s[[i]], Test = names(s)[i])) do.call(rbind, st) } mergeAllTests <- function(sd, which) { mergeAndName <- function(i) cbind(mergeSchoolTests(sd[[i]][[which]]), School = names(sd)[i]) do.call(rbind, lapply(seq_along(sd), mergeAndName)) } ## Regina tables are inconsistent to drop for now sd$Regina <- NULL Standard <- mergeAllTests(sd, "Standard") Percentile <- mergeAllTests(sd, "Percentile") write.csv(Standard, file = "ICPC-ITBS-Standard.csv") write.csv(Percentile, file = "ICPC-ITBS-Percentile.csv") std <- read.csv("ICPC-ITBS-Standard.csv", stringsAsFactors = FALSE, row.names = 1) names(std) <- sub("X", "", names(std)) head(std) pct <- read.csv("ICPC-ITBS-Percentile.csv", stringsAsFactors = FALSE, row.names = 1) names(pct) <- sub("X", "", names(pct)) head(pct) mS <- melt(Standard, id=c("Grade", "Test", "School"), variable_name = "Year") names(mS)[5] <- "Score" head(mS) library(lattice) xyplot(Score ~ Grade | Year, group = School, type = "l", data = mS) xyplot(Score ~ Grade | Year, group = Test, type = "l", data = mS, auto.key = TRUE) xyplot(Score ~ Grade | Year, group = Test, type = "l", data = mS, subset = School == "Lincoln")