##################################
# Analysis of Marketing data (orders and customers) from Nespresso
# Author: Diana Marek
# Date: December 2016
##################################
# list workspace
ls()

# Reset R'brain
rm(list=ls())

# check wd
getwd()

#set wd
setwd("/Users/dmarek/EducationSIB/Courses_2016/Nespresso_R_course")

# confirm wd
getwd()

# Read the data (dates have been changed in the csv to the format yyyy-mm-dd and times removed)
customers<-read.csv2('FR_Customers_sample_small.csv',dec=".", stringsAsFactors=TRUE)
customers$CLUB_DATE<-as.Date(customers$CLUB_DATE)
customers$CREATION_DATE<-as.Date(customers$CREATION_DATE)
customers$FIRST_MACHINE_DATE<-as.Date(customers$FIRST_MACHINE_DATE)
customers$LAST_MACHINE_DATE<-as.Date(customers$LAST_MACHINE_DATE)
customers$LAST_CAPS_ORDER_DATE<-as.Date(customers$LAST_CAPS_ORDER_DATE)

orders<-read.csv2('FR_Orders_sample_small.csv', dec=".", stringsAsFactors=TRUE) # characters converted to factors
#some columns contain lots of strings of characters
orders$DAY1<-as.Date(orders$DAY1)  # converts factors to dates


# Structure, info about orders
class(orders)
dim(orders)
str(orders)
colnames(orders)
head(orders)
tail(orders)
class(orders$REVENUE)
class(orders$QTY)
levels(orders$PROD_INT_DESCR)  # no controlled vocabulary

# Basic stats
summary(orders)
table(orders$ORDER_NR) # number of entries per order id
table(orders$THIRD_NR) # number of entries per customer id 
table(orders$ORDER_NR, orders$THIRD_NR) #counts number of orders per customer id

# Subsetting and operation on subgroups
subset(orders,THIRD_NR==83) #subset of data for a customer
subset(orders,THIRD_NR==83 & L2_TD_CAT_DESCR=="Espresso Classic") #subset of data for a customer and one type of coffee

mean(orders[orders$CHF_REVENUE>10,"CHF_REVENUE"]) # mean of revenue for revenue above 10 CHF

by(orders$CHF_REVENUE,orders$THIRD_NR, mean) # means of revenue by customer
aggregate(orders$CHF_REVENUE, list(orders$THIRD_NR),mean)
aggregate(data.frame(CHF_revenue=orders$CHF_REVENUE), list(customer_id=orders$THIRD_NR), mean) # to change the labels of the output table

by(orders$CHF_REVENUE,list(orders$THIRD_NR,orders$ORDER_NR), mean) # mean of revenue by customer and by order (20 possibilities)
aggregate(data.frame(CHF_revenue=orders$CHF_REVENUE), list(customer_id=orders$THIRD_NR,order_id=orders$ORDER_NR),mean) #eq to the above command with customised labels

aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),range) # range of revenue for the different products
aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),median) #median of revenue for the different products
aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),mean) #mean of revenue for the different products
aggregate(orders$CHF_REVENUE, list(orders$PROD_INT_DESCR),sum) #sum of revenue for the different products

#Select the first (by date) order for each customer in the dataframe
#Let's  randomize the dataset
# then ensure the data frame is sorted by DAY1 which determines the first instance
random_index <- sample(1:dim(orders)[1])
random_orders<-orders[random_index,]
random_orders_sorted_by_date<-random_orders[with(random_orders, order(DAY1)), ] # eq to the original dataframe
# With evaluate an R expression in an environment constructed from data, possibly modifying (a copy of) the original data.

#random_orders_sorted_by_date[!duplicated(random_orders_sorted_by_date$ORDER_NR), "DAY1"]
random_orders_sorted_by_date[!duplicated(random_orders_sorted_by_date$ORDER_NR),]
#duplicated determines which elements of a vector or data frame are duplicates of elements with 
#smaller subscripts, and returns a logical vector indicating which elements (rows) are duplicates.
random_orders_sorted_by_date[match(unique(random_orders_sorted_by_date$ORDER_NR),
                                   random_orders_sorted_by_date$ORDER_NR),] # eq to the above command
#unique returns a vector, data frame or array like x but with duplicate elements/rows removed.
#match returns indices of the first match in the compared vectors, which give you the rows you need.

#Export subset regarding a customer
cust513<- subset(orders,THIRD_NR==513)
#write.csv2(cust513, file="cust513.csv", quote=FALSE,row.names=FALSE)  # puts dec=, and sep= ;
write.table(cust513, file="cust513.csv", quote=FALSE, sep=";",row.names=FALSE)
rm(cust513)
cust513<-read.csv2('cust513.csv',dec=".", stringsAsFactors=TRUE)
hist(cust513$CHF_REVENUE)

#####Graphics
## Hist
hist(orders$CHF_REVENUE,breaks=50,xlab="Customer revenue (CHF)", 
     main= "Customer revenue distribution", freq = FALSE, col= "lightblue", ylim=c(0,0.20), xlim=c(0,80)); 
lines(density(orders$CHF_REVENUE),col="orange3",lwd=2)

## Boxplot
boxplot(CONS_CAPS~L2_TD_CAT_DESCR,data=orders, main="Number of caps per product category", 
        col="pink", cex.axis=1.5, las=2, # axis labels perpendicalar to axis
        col.axis="gray28", col.main="gray28") # title and axis annotation in gray
# more colour coding here: http://research.stowers-institute.org/efg/R/Color/Chart/


## Scatter plot
plot(orders$REVENUE, orders$CHF_REVENUE,type="p",pch=3, main=" Revenue - Euros vs CHF" , xlab="Euros", ylab="CHF", col = c("red", "green", "blue")[orders$SOURCE_INT_GRP_2_CODE])
abline(lm(orders$CHF_REVENUE~orders$REVENUE),col="grey")
legend("bottomright", legend= levels(orders$SOURCE_INT_GRP_2_CODE), pch= c("+","+","+"),col = c("red", "green", "blue"),bg = "gray90")
cor.test(orders$REVENUE, orders$CHF_REVENUE)

## Exporting to a pdf
pdf(file="revenue_scatterplot.pdf", width=10, height=5)
plot(orders$REVENUE, orders$CHF_REVENUE,type="p",pch=3, main="Revenue - Euros vs CHF" , xlab="Euros", ylab="CHF", col = c("red", "green", "blue")[orders$SOURCE_INT_GRP_2_CODE])
abline(lm(orders$CHF_REVENUE~orders$REVENUE),col="grey")
legend("bottomright", legend= levels(orders$SOURCE_INT_GRP_2_CODE), pch= c("+","+","+"),col = c("red", "green", "blue"),bg = "gray90")
cor.test(orders$REVENUE, orders$CHF_REVENUE)
dev.off()

#####Graphics with ggplot2
## Hist
ggplot(data=orders, aes(x=CHF_REVENUE, ..density..)) + geom_histogram(bins=70, colour="black",fill = "lightblue") +
  geom_density(color="orange") + ggtitle("CHF Revenue distribution")

## Boxplot
ggplot(data=orders, aes(x=L2_TD_CAT_DESCR, y=CONS_CAPS)) + geom_jitter(alpha = 0.3, color = "pink") + geom_boxplot(alpha=0) +
  labs(title="Number of caps per product category",x="Type of product(L2_TD_CAT_DESCR)",y="Number of caps per product category (CONS_CAPS)")

## Scatter plot
ggplot(data=orders, aes(x=REVENUE, y=CHF_REVENUE)) + geom_point(alpha=0.3, aes(color=SOURCE_INT_GRP_2_CODE), size=3) +
  ggtitle("Revenue - Euros vs CHF") + theme(plot.title = element_text(size=15, face="bold", margin= margin(10,0,10,0))) + 
  theme(plot.title = element_text(hjust = 0.45))