################################## # Analysis of Marketing data (orders and customers) from Nespresso # Author: Diana Marek # Date: December 2016 ################################## # list workspace ls() # Reset R'brain rm(list=ls()) # check wd getwd() #set wd setwd("/Users/dmarek/EducationSIB/Courses_2016/Nespresso_R_course") # confirm wd getwd() # Read the data (dates have been changed in the csv to the format yyyy-mm-dd and times removed) customers<-read.csv2('FR_Customers_sample_small.csv',dec=".", stringsAsFactors=TRUE) customers$CLUB_DATE<-as.Date(customers$CLUB_DATE) customers$CREATION_DATE<-as.Date(customers$CREATION_DATE) customers$FIRST_MACHINE_DATE<-as.Date(customers$FIRST_MACHINE_DATE) customers$LAST_MACHINE_DATE<-as.Date(customers$LAST_MACHINE_DATE) customers$LAST_CAPS_ORDER_DATE<-as.Date(customers$LAST_CAPS_ORDER_DATE) orders<-read.csv2('FR_Orders_sample_small.csv', dec=".", stringsAsFactors=TRUE) # characters converted to factors #some columns contain lots of strings of characters orders$DAY1<-as.Date(orders$DAY1) # converts factors to dates # Structure, info about orders class(orders) dim(orders) str(orders) colnames(orders) head(orders) tail(orders) class(orders$REVENUE) class(orders$QTY) levels(orders$PROD_INT_DESCR) # no controlled vocabulary # Basic stats summary(orders) table(orders$ORDER_NR) # number of entries per order id table(orders$THIRD_NR) # number of entries per customer id table(orders$ORDER_NR, orders$THIRD_NR) #counts number of orders per customer id # Subsetting and operation on subgroups subset(orders,THIRD_NR==83) #subset of data for a customer subset(orders,THIRD_NR==83 & L2_TD_CAT_DESCR=="Espresso Classic") #subset of data for a customer and one type of coffee mean(orders[orders$CHF_REVENUE>10,"CHF_REVENUE"]) # mean of revenue for revenue above 10 CHF by(orders$CHF_REVENUE,orders$THIRD_NR, mean) # means of revenue by customer aggregate(orders$CHF_REVENUE, list(orders$THIRD_NR),mean) aggregate(data.frame(CHF_revenue=orders$CHF_REVENUE), list(customer_id=orders$THIRD_NR), mean) # to change the labels of the output table by(orders$CHF_REVENUE,list(orders$THIRD_NR,orders$ORDER_NR), mean) # mean of revenue by customer and by order (20 possibilities) aggregate(data.frame(CHF_revenue=orders$CHF_REVENUE), list(customer_id=orders$THIRD_NR,order_id=orders$ORDER_NR),mean) #eq to the above command with customised labels aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),range) # range of revenue for the different products aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),median) #median of revenue for the different products aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),mean) #mean of revenue for the different products aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),sum) #sum of revenue for the different products #Select the first (by date) order for each customer in the dataframe #Let's randomize the dataset # then ensure the data frame is sorted by DAY1 which determines the first instance random_index <- sample(1:dim(orders)[1]) random_orders<-orders[random_index,] random_orders_sorted_by_date<-random_orders[with(random_orders, order(DAY1)), ] # eq to the original dataframe # With evaluate an R expression in an environment constructed from data, possibly modifying (a copy of) the original data. #random_orders_sorted_by_date[!duplicated(random_orders_sorted_by_date$ORDER_NR), "DAY1"] random_orders_sorted_by_date[!duplicated(random_orders_sorted_by_date$ORDER_NR),] #duplicated determines which elements of a vector or data frame are duplicates of elements with #smaller subscripts, and returns a logical vector indicating which elements (rows) are duplicates. random_orders_sorted_by_date[match(unique(random_orders_sorted_by_date$ORDER_NR), random_orders_sorted_by_date$ORDER_NR),] # eq to the above command #unique returns a vector, data frame or array like x but with duplicate elements/rows removed. #match returns indices of the first match in the compared vectors, which give you the rows you need. #Export subset regarding a customer cust513<- subset(orders,THIRD_NR==513) #write.csv2(cust513, file="cust513.csv", quote=FALSE,row.names=FALSE) # puts dec=, and sep= ; write.table(cust513, file="cust513.csv", quote=FALSE, sep=";",row.names=FALSE) rm(cust513) cust513<-read.csv2('cust513.csv',dec=".", stringsAsFactors=TRUE) hist(cust513$CHF_REVENUE) #####Graphics ## Hist hist(orders$CHF_REVENUE,breaks=50,xlab="Customer revenue (CHF)", main= "Customer revenue distribution", freq = FALSE, col= "lightblue", ylim=c(0,0.20), xlim=c(0,80)); lines(density(orders$CHF_REVENUE),col="orange3",lwd=2) ## Boxplot boxplot(CONS_CAPS~L2_TD_CAT_DESCR,data=orders, main="Number of caps per product category", col="pink", cex.axis=1.5, las=2, # axis labels perpendicalar to axis col.axis="gray28", col.main="gray28") # title and axis annotation in gray # more colour coding here: http://research.stowers-institute.org/efg/R/Color/Chart/ ## Scatter plot plot(orders$REVENUE, orders$CHF_REVENUE,type="p",pch=3, main=" Revenue - Euros vs CHF" , xlab="Euros", ylab="CHF", col = c("red", "green", "blue")[orders$SOURCE_INT_GRP_2_CODE]) abline(lm(orders$CHF_REVENUE~orders$REVENUE),col="grey") legend("bottomright", legend= levels(orders$SOURCE_INT_GRP_2_CODE), pch= c("+","+","+"),col = c("red", "green", "blue"),bg = "gray90") cor.test(orders$REVENUE, orders$CHF_REVENUE) ## Exporting to a pdf pdf(file="revenue_scatterplot.pdf", width=10, height=5) plot(orders$REVENUE, orders$CHF_REVENUE,type="p",pch=3, main="Revenue - Euros vs CHF" , xlab="Euros", ylab="CHF", col = c("red", "green", "blue")[orders$SOURCE_INT_GRP_2_CODE]) abline(lm(orders$CHF_REVENUE~orders$REVENUE),col="grey") legend("bottomright", legend= levels(orders$SOURCE_INT_GRP_2_CODE), pch= c("+","+","+"),col = c("red", "green", "blue"),bg = "gray90") cor.test(orders$REVENUE, orders$CHF_REVENUE) dev.off() #####Graphics with ggplot2 ## Hist ggplot(data=orders, aes(x=CHF_REVENUE, ..density..)) + geom_histogram(bins=70, colour="black",fill = "lightblue") + geom_density(color="orange") + ggtitle("CHF Revenue distribution") ## Boxplot ggplot(data=orders, aes(x=L2_TD_CAT_DESCR, y=CONS_CAPS)) + geom_jitter(alpha = 0.3, color = "pink") + geom_boxplot(alpha=0) + labs(title="Number of caps per product category",x="Type of product(L2_TD_CAT_DESCR)",y="Number of caps per product category (CONS_CAPS)") ## Scatter plot ggplot(data=orders, aes(x=REVENUE, y=CHF_REVENUE)) + geom_point(alpha=0.3, aes(color=SOURCE_INT_GRP_2_CODE), size=3) + ggtitle("Revenue - Euros vs CHF") + theme(plot.title = element_text(size=15, face="bold", margin= margin(10,0,10,0))) + theme(plot.title = element_text(hjust = 0.45))