NSW Anti-Discrimination Board Statistics: Trend Analysis

An effort to visualize the trend in race-related complaints reported to the Anti-Discrimination Board of NSW, Australia. These are the incidents that were serious enough to be reported, and thus form only a tiny sample compared to the whole population of unreported ones.

Drawing
Source: http://data.gov.au/dataset?q=discrimination

As no CSV is available, we have to download the HTML data ourselves and stitch things together. Here is the code (the main function is fetch_discrimination_data) that downloads the HTML data using rvest:

# Return the trailing `n` characters of each string in `x`.
# Strings shorter than `n` are returned whole.
substrRight <- function(x, n) {
  last <- nchar(x)
  first <- last - n + 1
  substr(x, first, last)
}

# Build an XPath expression that locates the table belonging to the <h2>
# heading whose text contains `h2_text`.
#
# Two page layouts are covered and joined with the XPath union operator:
#   * the table is a following sibling of the heading, or
#   * the table sits inside a following sibling <div class="clearfix">.
pattern_selector <- function(h2_text) {
  heading_xpath <- sprintf("//h2[contains(., '%s')]", h2_text)

  # %1$s reuses the single heading argument in both alternatives.
  template <- paste0(
    "%1$s/following-sibling::table[1]",
    " | ",
    "(%1$s/following-sibling::div[@class='clearfix']/table)[1]"
  )

  sprintf(template, heading_xpath)
}

####### MAIN ######
# Download one reporting year of NSW Anti-Discrimination Board statistics and
# extract one row of the "Complaints received by ground and area" table.
#
# year: a single number, the first calendar year of the reporting period
#       (e.g. 2005 fetches the 2005-06 page).
#
# Returns a data.frame holding columns 2 to 6 of the selected table row,
# renamed to employment, goods_and_services, accomodation, education and
# clubs, plus a `year` column. Stops with an error when the page contains no
# matching table.
fetch_discrimination_data <- function(year) {
  stopifnot(is.numeric(year), length(year) == 1L, !is.na(year))

  from_year <- toString(year)
  to_year <- toString(year + 1)

  base_url <- "http://www.antidiscrimination.justice.nsw.gov.au/Pages/adb1_resources/adb1_statistics/"

  # The page file names changed over time: the prefix switches in 2011 and
  # the separator between the two-digit years switches in 2012.
  prefix <- if (year < 2011) "adb1_statistics" else "stats"
  separator <- if (year < 2012) "_" else "-"
  path_url <- sprintf("%s%s%s%s.aspx",
                      prefix,
                      substrRight(from_year, 2),
                      separator,
                      substrRight(to_year, 2))
  discrimination_url <- paste0(base_url, path_url)

  # read_html() replaces rvest's html(), which was deprecated and later
  # removed from the package.
  discrimination_markup <- read_html(discrimination_url)

  # The heading is capitalised differently on different pages, so match both.
  selector <- paste(c(pattern_selector('Complaints received by ground and area'),
                      pattern_selector('Complaints received by Ground and Area')),
                    collapse = " | ")

  discrimination_table <- discrimination_markup %>%
                            html_nodes(xpath = selector) %>%
                            html_table()

  # Fail with a message that names the page instead of the bare
  # "subscript out of bounds" that [[1]] would raise on an empty list.
  if (length(discrimination_table) == 0L) {
    stop(sprintf("No 'Complaints received by ground and area' table found at %s",
                 discrimination_url),
         call. = FALSE)
  }
  discrimination_frame <- discrimination_table[[1]]

  # Keep the row whose first-column label is the 13th factor level (sorted
  # order) of that column. The direct comparison
  #   discrimination_frame$X1 == "Race"
  # reportedly does not work for year = 2013, hence the positional lookup.
  # NOTE(review): this silently selects a different row if a page lists a
  # different set of grounds -- confirm level 13 is "Race" for every year.
  discrimination_frame <- discrimination_frame[as.integer(factor(discrimination_frame$X1)) == 13, 2:6]

  names(discrimination_frame) <- c("employment", "goods_and_services", "accomodation", "education", "clubs")

  # Tag the counts with the reporting year so yearly results can be stacked.
  data.frame(discrimination_frame, year = year)
}

The following code uses rbind to build the time series data:

# Fetch every reporting year and stack the yearly rows into one data.frame.
#
# The per-year results are collected in a list and combined with a single
# rbind call. This avoids testing is.na() on a data.frame inside `if`, which
# yields a condition of length > 1 (an error in recent R versions), and it
# avoids re-copying the growing frame on every iteration.
years <- 1999:2013

yearly_frames <- lapply(years, function(year) {
  print(sprintf("Year - %s", year))
  fetch_discrimination_data(year)
})

frame <- do.call(rbind, yearly_frames)

# str() prints its summary itself; wrapping it in print() only adds "NULL".
str(frame)