include counts for products on mid-level (non leaf) categories
peterschretlen committed Jan 8, 2016
1 parent 7839400 commit 8cc895b
Showing 5 changed files with 140 additions and 72 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,3 +5,4 @@
*.Rproj
data/*
plots_screencaps/*
widget_tmp
88 changes: 88 additions & 0 deletions build_treemap.R
@@ -0,0 +1,88 @@
library(treemap)
library(dplyr)
# devtools::install_github("timelyportfolio/d3treeR")
library(d3treeR)
library(htmlwidgets)
library(stringr) # for str_to_title / str_replace_all below (may already be attached via scrape.R)

source("taxonomy.R")
source("scrape.R")

scrape_counts <- FALSE

# You can fetch and save a snapshot of the Etsy taxonomy data to a file using save_taxonomy:
# save_taxonomy(<put your ETSY API key here>)

taxonomy <- load_taxonomy("data/taxonomy_08Jan2016.json")

#TODO: add a root entry to get all products

urls <- category_url(taxonomy$path)
taxonomy$url <- urls

filename <- "taxonomy.csv"
file.create(filename)

if(scrape_counts){
counts_and_listings_all <- adply(urls, c(1), function(x) scrape(x, filename), .progress = "text" )
}
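# Note: the counts loaded below come from a previously scraped snapshot under data/ (gitignored above);
# a fresh scrape (scrape_counts <- TRUE) is pointed at taxonomy.csv in the working directory instead.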

taxonomy_counts <- read.table("data/taxonomy.csv",sep = '~', stringsAsFactors = FALSE)

names(taxonomy_counts) <- c("url","count")

# Join the scraped counts back to the original taxonomy data
taxonomy <- left_join(taxonomy, taxonomy_counts, by = c("url") )

taxonomy$parent_id[is.na(taxonomy$parent_id)] <- 0
taxonomy$count[is.na(taxonomy$count)] <- 0

# Title-case the path names and replace underscores with spaces:
pretty_path <- str_to_title(str_replace_all(taxonomy$path, "_", " "))
pretty_path <- str_replace_all(pretty_path, "And", "and")

taxonomy_levels <- as.data.frame(str_split_fixed(pretty_path, "\\.", 8))
names(taxonomy_levels) <- paste("pathlevel", 1:8, sep="")
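# e.g. an (illustrative) path "Accessories.Hats.Beanies" fills pathlevel1..pathlevel3;
# str_split_fixed leaves pathlevel4..pathlevel8 as empty strings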

taxonomy <- cbind(taxonomy, taxonomy_levels)

taxonomy$count <- as.numeric(taxonomy$count)

# We need to account for products that are not assigned to taxonomy leaf nodes,
# otherwise they do not show up in the treemap.
#
# We'll do this by taking the difference between a category's count and the sum of its
# child category counts. This assumes products are not assigned to multiple categories,
# which does not always hold: some categories end up with a count < 0.
#
# These synthetic leaf nodes are given the name "-".
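# For example (illustrative numbers): if "jewelry" is scraped with count 100 and its child
# categories sum to 80, a synthetic child path "jewelry.-" is added below with count 20.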
taxonomy_agg <- taxonomy %>% group_by( parent ) %>% dplyr::summarise( child_count = sum(count))
taxonomy_no_sub_category <- taxonomy %>% filter( path %in% taxonomy_agg$parent )
taxonomy_no_sub_category <- left_join(taxonomy_no_sub_category, taxonomy_agg, by=c("path" = "parent"))
taxonomy_no_sub_category$count <- taxonomy_no_sub_category$count - taxonomy_no_sub_category$child_count
taxonomy_no_sub_category$path <- paste(taxonomy_no_sub_category$path, "-" , sep = ".")
taxonomy_no_sub_category$level <- as.numeric(taxonomy_no_sub_category$level) + 1
taxonomy_no_sub_category$parent_id <- taxonomy_no_sub_category$id
taxonomy_no_sub_category$children_ids <- ""
taxonomy_no_sub_category$name <- "-"
taxonomy_no_sub_category <- taxonomy_no_sub_category %>% select( -child_count, -starts_with("pathlevel" ))

pretty_path <- str_to_title(str_replace_all(taxonomy_no_sub_category$path, "_", " "))
pretty_path <- str_replace_all(pretty_path, "And", "and")
taxonomy_levels <- as.data.frame(str_split_fixed(pretty_path, "\\.", 8))
names(taxonomy_levels) <- paste("pathlevel", 1:8, sep="")
taxonomy_no_sub_category <- cbind(taxonomy_no_sub_category, taxonomy_levels)

taxonomy <- rbind(taxonomy, taxonomy_no_sub_category)

taxonomy <- taxonomy %>% dplyr::arrange(path)

#reset any negative counts to 0
taxonomy$count[ taxonomy$count < 0 ] <- 0

tm <- treemap(taxonomy, index=paste("pathlevel", 1:7, sep=""), vSize="count")
widget <- d3tree3( tm, rootname = "Etsy" , width="1024px", height="750px")
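# treemap() builds the hierarchy from the pathlevel columns (index =) and sizes tiles by count (vSize =);
# d3tree3() from d3treeR then wraps the static treemap in an interactive, zoomable htmlwidget.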

orig_dir <- setwd("widget_tmp/")
saveWidget(widget, "etsy-treemap.html", selfcontained = FALSE)
setwd(orig_dir)
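# The non-self-contained widget and its dependency files are written under widget_tmp/ (gitignored
# above); open widget_tmp/etsy-treemap.html in a browser to view the treemap. Note that
# setwd("widget_tmp/") assumes the directory already exists.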

51 changes: 34 additions & 17 deletions scrape.R
@@ -9,12 +9,15 @@ category_url <- function(path){
return(url)
}

-get_event_data <- function(url){
+load_page <- function(url){
#uastring <- "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
#html <- html_session(url, user_agent(uastring))
html <- read_html(url)

+return(html)
+}
+
+get_event_data <- function(html){

javascript_tags <- html %>% html_nodes("script[type='text/javascript']") %>% html_text(trim = TRUE)

event_data_regex <- ".*EventPipe\\.enabled=true;EventPipe\\.init\\(([^;]*)\\);.*"
@@ -44,33 +47,47 @@ extract_listings <- function(event_data){
return(event_data$events$attributes$listing_ids)
}

-scrape <- function(url, file){
+get_count_from_html <- function(html){
+
+count <- tryCatch({
+spans <- html %>% html_nodes("span") %>% html_text(trim = TRUE)
+spans_with_count <- str_match(spans, "\\((.*) items\\)")[,2]
+spans_with_count <- spans_with_count[!is.na(spans_with_count)]
+as.numeric(str_replace_all(spans_with_count, "," , ""))
+},
+warning = function(warn) {
+print(warn)
+},
+error = function(err) {
+return(0)
+},
+finally = {
+})
+
+return(count)
+}
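# e.g. get_count_from_html() turns a listing-count span of the form "(12,345 items)" into the numeric value 12345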


+scrape <- function(url, file){

-count <- 0
-attempts <- 0
-while(count == 0 && attempts < 10){
-Sys.sleep(1.0)
-print(url)
-event_data <- get_event_data(url)
-count <- extract_count(event_data)
-listings <- extract_listings(event_data)
-facet_data <- extract_facet_data(event_data)
-attempts <- attempts + 1
-}
+html <- load_page(url)
+count <- get_count_from_html(html)
+#event_data <- get_event_data(html)

error_file <- "error.log"
if(!file.exists(error_file)){
file.create(error_file)
}

-if(attempts == 10){
-print(paste("Could not get count after 10 tries: ", url))
+if(length(count) == 0 || count == 0){
+print(paste("Could not get count: ", url))
write(paste(url, "no count", sep = "~"), file = error_file, append = TRUE)
return(NULL)
}

df <- tryCatch({
-data.frame( url = url, count = count, listings = listings, taxonomy_facet_data = facet_data, stringsAsFactors = FALSE)
+data.frame( url = url, count = count, stringsAsFactors = FALSE)
},
warning = function(warn) {
print(warn)
55 changes: 0 additions & 55 deletions scrape_category_counts.R

This file was deleted.

17 changes: 17 additions & 0 deletions taxonomy.R
@@ -1,5 +1,22 @@
library(jsonlite)
library(plyr)
library(httr)
library(lubridate)
library(stringr) # str_length() is used below; may already be attached elsewhere

save_taxonomy <- function(etsyAPIkey, filename = ""){

url <- paste("https://openapi.etsy.com/v2/taxonomy/buyer/get?api_key=", etsyAPIkey, sep = "")
taxonomy <- GET(url)
taxonomy_json <- content(taxonomy, as = "text")

if(str_length(filename) == 0 ) {
filename <- paste("taxonomy", format(today(), format="%d%b%Y"), sep = "_")
filename <- paste(filename, "json", sep=".")
}

write(taxonomy_json, filename)

}
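# With the default filename, a snapshot saved on Jan 8, 2016 is written as "taxonomy_08Jan2016.json",
# i.e. the file loaded by build_treemap.R.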

load_taxonomy <- function(file){
rawdata <- fromJSON(file)
