#Import Library
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## <U+221A> tibble 3.0.4 <U+221A> dplyr 1.0.2
## <U+221A> tidyr 1.1.2 <U+221A> stringr 1.4.0
## <U+221A> readr 1.4.0 <U+221A> forcats 0.5.0
## <U+221A> purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Read data
setwd("C:/Users/asus/Desktop")
Prem_21 <- read.csv("Premier_20.21.csv")
Prem_20 <- read.csv("Premier_19.20.csv")
Prem_19 <- read.csv("Premier_18.19.csv")
Prem_2120 <- rbind(Prem_21,Prem_20)
all_data <- merge(Prem_19, Prem_2120, all= T)
hist_1 <- ggplot(data = all_data) + geom_histogram(mapping = aes(x = FTHG, col = I("Black")),binwidth = 1,show.legend = F) + xlab("Home Goals") + ylab("Number of matches") +theme_light() + ggtitle("Hıstogram for Home Scores") + theme(plot.title = element_text(hjust=0.5))
hist_1
hist_2 <- ggplot(data = all_data) + geom_histogram(mapping = aes(x=FTAG, col = I("Black")),show.legend = F, binwidth = 1) + xlab("Away goals") + ylab("Number of matches") +theme_light()+ ggtitle("Hıstogram for Away Scores") + theme(plot.title = element_text(hjust=0.5))
hist_2
hist_3 <- ggplot(data = all_data) + geom_histogram(mapping = aes(x = (FTHG - FTAG), col = I("Black")),show.legend = F, binwidth = 1) + xlab("Home goals - Away goals") + ylab("Number of matches") +theme_light()+ ggtitle("Hıstogram for Home-Away Scores") + theme(plot.title = element_text(hjust=0.5))
hist_3
# Histogram with Poisson Distr. for Home Goals
hist(all_data$FTHG, col = "blue",right = F, xlab = "Home Goals", ylab = "Number of Matches",breaks = 0:10,main="Histogram with Poisson Distribution", ylim = c(0,300))
pois_distr = dpois(c(0:8),lambda = mean(all_data$FTHG))* length(all_data$FTHG)
lines(pois_distr,col = "red")
# Histogram with Poisson Distr. for Away Goals
x <- all_data$FTAG
h<-hist(x, col="red",right = F, xlab="Away Goals", ylab = "Number of Matches",breaks = 0:10,ylim = c(0,300),
main="Histogram with Poisson Distribution")
xfit<-seq(0.5,10.5, by = 1)
yfit<-dpois(c(0:10), lambda = mean(all_data$FTAG) )*length(x)
lines(xfit,yfit, col="blue", lwd=3)
These distributions are similar to poisson distribution
In this part, P(home),P(tie) and P(win) are calculated.
# 2.1 calculate p(home win), p(tie), P(away win) for bet365, BetandWin, Pinnacle and
# bet365
all_data$home_prob1 <- as.numeric(1 / all_data$B365H)
all_data$away_prob1 <- as.numeric(1 / all_data$B365A)
all_data$draw_prob1 <- as.numeric(1 / all_data$B365D)
# Betandwin
all_data$home_prob2 <- as.numeric(1 / all_data$BWH)
all_data$away_prob2 <- as.numeric(1 / all_data$BWA)
all_data$draw_prob2 <- as.numeric(1 / all_data$BWD)
#Pinnacle
all_data$home_prob3 <- as.numeric(1 / all_data$PSH)
all_data$away_prob3 <- as.numeric(1 / all_data$PSA)
all_data$draw_prob3 <- as.numeric(1 / all_data$PSD)
# William Hill
all_data$home_prob4 <- as.numeric(1 / all_data$WHH)
all_data$away_prob4 <- as.numeric(1 / all_data$WHA)
all_data$draw_prob4 <- as.numeric(1 / all_data$WHD)
Probability calculation with normalization is done in this part of the homework.
# 2.2 probability calculation with normalization
for(i in 1:nrow(all_data)){
all_data$home_prob1[i] <- all_data$home_prob1[i] / (all_data$home_prob1[i] + all_data$away_prob1[i] + all_data$draw_prob1[i])
all_data$away_prob1[i] <- all_data$away_prob1[i] / (all_data$home_prob1[i] + all_data$away_prob1[i] + all_data$draw_prob1[i])
all_data$draw_prob1[i] <- all_data$draw_prob1[i] / (all_data$home_prob1[i] + all_data$away_prob1[i] + all_data$draw_prob1[i])
all_data$home_prob2[i] <- all_data$home_prob2[i] / (all_data$home_prob2[i] + all_data$away_prob2[i] + all_data$draw_prob2[i])
all_data$away_prob2[i] <- all_data$away_prob2[i] / (all_data$home_prob2[i] + all_data$away_prob2[i] + all_data$draw_prob2[i])
all_data$draw_prob2[i] <- all_data$draw_prob2[i] / (all_data$home_prob2[i] + all_data$away_prob2[i] + all_data$draw_prob2[i])
all_data$home_prob3[i] <- all_data$home_prob3[i] / (all_data$home_prob3[i] + all_data$away_prob3[i] + all_data$draw_prob3[i])
all_data$away_prob3[i] <- all_data$away_prob3[i] / (all_data$home_prob3[i] + all_data$away_prob3[i] + all_data$draw_prob3[i])
all_data$draw_prob3[i] <- all_data$draw_prob3[i] / (all_data$home_prob3[i] + all_data$away_prob3[i] + all_data$draw_prob3[i])
all_data$home_prob4[i] <- all_data$home_prob4[i] / (all_data$home_prob4[i] + all_data$away_prob4[i] + all_data$draw_prob4[i])
all_data$away_prob4[i] <- all_data$away_prob4[i] / (all_data$home_prob4[i] + all_data$away_prob4[i] + all_data$draw_prob4[i])
all_data$draw_prob4[i] <- all_data$draw_prob4[i] / (all_data$home_prob4[i] + all_data$away_prob4[i] + all_data$draw_prob4[i])
}
# 2.3
# calculating P(home)- P(away)
all_data$home_away1 <-as.numeric(all_data$home_prob1 - all_data$away_prob1)
all_data$home_away2 <-as.numeric(all_data$home_prob2 - all_data$away_prob2)
all_data$home_away3 <-as.numeric(all_data$home_prob3 - all_data$away_prob3)
all_data$home_away4 <-as.numeric(all_data$home_prob4 - all_data$away_prob4)
# we can create a new column showing discrete values of P(home) - P(away) for [1 0.8) = 1 [0.8 0.6) = 2 .... [-0.8 -1) = 10
# for Bet365
for(i in 1:nrow(all_data)){
if( 0.8 < all_data$home_away1[i] & all_data$home_away1[i] <= 1){
all_data$discrete_val1[i] = 1
}else if(0.6 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.8){
all_data$discrete_val1[i] = 2
}else if(0.4 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.6){
all_data$discrete_val1[i] = 3
}else if(0.2 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.4){
all_data$discrete_val1[i] = 4
}else if(0 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.2){
all_data$discrete_val1[i] = 5
}else if(-0.2 < all_data$home_away1[i] & all_data$home_away1[i] <= 0){
all_data$discrete_val1[i] = 6
}else if(-0.4 < all_data$home_away1[i] & all_data$home_away1[i] <= -0.2){
all_data$discrete_val1[i] = 7
}else if(-0.6 < all_data$home_away1[i] & all_data$home_away1[i] <= -0.4){
all_data$discrete_val1[i] = 8
}else if(-0.8< all_data$home_away1[i] & all_data$home_away1[i] <= -0.6){
all_data$discrete_val1[i] = 9
}else if(-1 < all_data$home_away1[i] & all_data$home_away1[i] <= -0.8){
all_data$discrete_val1[i] = 10
}
}
# for Bet and Win
for(i in 1:nrow(all_data)){
if( 0.8 < all_data$home_away2[i] & all_data$home_away2[i] <= 1){
all_data$discrete_val2[i] = 1
}else if(0.6 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.8){
all_data$discrete_val2[i] = 2
}else if(0.4 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.6){
all_data$discrete_val2[i] = 3
}else if(0.2 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.4){
all_data$discrete_val2[i] = 4
}else if(0 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.2){
all_data$discrete_val2[i] = 5
}else if(-0.2 < all_data$home_away2[i] & all_data$home_away2[i] <= 0){
all_data$discrete_val2[i] = 6
}else if(-0.4 < all_data$home_away2[i] & all_data$home_away2[i] <= -0.2){
all_data$discrete_val2[i] = 7
}else if(-0.6 < all_data$home_away2[i] & all_data$home_away2[i] <= -0.4){
all_data$discrete_val2[i] = 8
}else if(-0.8< all_data$home_away2[i] & all_data$home_away2[i] <= -0.6){
all_data$discrete_val2[i] = 9
}else if(-1 < all_data$home_away2[i] & all_data$home_away2[i] <= -0.8){
all_data$discrete_val2[i] = 10
}
}
# for Pinnacle
for(i in 1:nrow(all_data)){
if( 0.8 < all_data$home_away3[i] & all_data$home_away3[i] <= 1){
all_data$discrete_val3[i] = 1
}else if(0.6 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.8){
all_data$discrete_val3[i] = 2
}else if(0.4 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.6){
all_data$discrete_val3[i] = 3
}else if(0.2 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.4){
all_data$discrete_val3[i] = 4
}else if(0 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.2){
all_data$discrete_val3[i] = 5
}else if(-0.2 < all_data$home_away3[i] & all_data$home_away3[i] <= 0){
all_data$discrete_val3[i] = 6
}else if(-0.4 < all_data$home_away3[i] & all_data$home_away3[i] <= -0.2){
all_data$discrete_val3[i] = 7
}else if(-0.6 < all_data$home_away3[i] & all_data$home_away3[i] <= -0.4){
all_data$discrete_val3[i] = 8
}else if(-0.8< all_data$home_away3[i] & all_data$home_away3[i] <= -0.6){
all_data$discrete_val3[i] = 9
}else if(-1 < all_data$home_away3[i] & all_data$home_away3[i] <= -0.8){
all_data$discrete_val3[i] = 10
}
}
# for William Hill
for(i in 1:nrow(all_data)){
if( 0.8 < all_data$home_away4[i] & all_data$home_away4[i] <= 1){
all_data$discrete_val4[i] = 1
}else if(0.6 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.8){
all_data$discrete_val4[i] = 2
}else if(0.4 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.6){
all_data$discrete_val4[i] = 3
}else if(0.2 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.4){
all_data$discrete_val1[i] = 4
}else if(0 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.2){
all_data$discrete_val4[i] = 5
}else if(-0.2 < all_data$home_away4[i] & all_data$home_away4[i] <= 0){
all_data$discrete_val4[i] = 6
}else if(-0.4 < all_data$home_away4[i] & all_data$home_away4[i] <= -0.2){
all_data$discrete_val4[i] = 7
}else if(-0.6 < all_data$home_away4[i] & all_data$home_away4[i] <= -0.4){
all_data$discrete_val4[i] = 8
}else if(-0.8< all_data$home_away4[i] & all_data$home_away4[i] <= -0.6){
all_data$discrete_val4[i] = 9
}else if(-1 < all_data$home_away4[i] & all_data$home_away4[i] <= -0.8){
all_data$discrete_val4[i] = 10
}
}
# Create data frames for calculation of P(draw)
df1 = as.data.frame(summary((as.factor(all_data$discrete_val1))))
df1 = rename(df1, total_game1 = 'summary((as.factor(all_data$discrete_val1)))')
df2 = as.data.frame(summary((as.factor(all_data$discrete_val2))))
df2 = rename(df2, total_game1 = 'summary((as.factor(all_data$discrete_val2)))')
df2 = rbind(df2,"10" = 0)
df3 = as.data.frame(summary((as.factor(all_data$discrete_val3))))
df3 = rename(df3, total_game1 = 'summary((as.factor(all_data$discrete_val3)))')
df4 = as.data.frame(summary((as.factor(all_data$discrete_val4))))
df4 = rbind(df4,"4" = 0)
df4 = df4[ order(as.numeric(row.names(df4))), ]
df4 = as.data.frame(df4)
df4 = rename(df4, total_game1 = 'df4')
# calculating number of games ended draw and P(draw)
for(i in 1:10){
df1$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val1) == i))[3]
df2$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val2) == i))[3]
df3$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val3) == i))[3]
df4$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val4) == i))[3]
}
df1$num_of_draw[is.na(df1$num_of_draw)] = 0
df2$num_of_draw[is.na(df2$num_of_draw)] = 0
df3$num_of_draw[is.na(df3$num_of_draw)] = 0
df4$num_of_draw[is.na(df4$num_of_draw)] = 0
for(i in 1:10){
df1$calc_pdraw[i] = df1$num_of_draw[i] / df1$total_game[i]
df2$calc_pdraw[i] = df2$num_of_draw[i] / df2$total_game[i]
df3$calc_pdraw[i] = df3$num_of_draw[i] / df3$total_game[i]
df4$calc_pdraw[i] = df4$num_of_draw[i] / df4$total_game[i]
}
df1$calc_pdraw[is.nan(df1$calc_pdraw)] = 0
df2$calc_pdraw[is.nan(df2$calc_pdraw)] = 0
df3$calc_pdraw[is.nan(df3$calc_pdraw)] = 0
df4$calc_pdraw[is.nan(df4$calc_pdraw)] = 0
plot_Bet365 <- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away1, y = draw_prob1,col = I("gray"))) +
geom_point(data = df1, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("red"))) +
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET365")+theme(plot.title = element_text(hjust=0.5))+
scale_color_identity(name = "",labels = c("Bet365", "Calculated P(draw)"), breaks = c("gray", "red"), guide = "legend")
plot_Bet365
plot_BetandWin <- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away2, y = draw_prob2, col = I("red"))) +
geom_point(data = df2, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("black"))) +
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET & WIN")+theme(plot.title = element_text(hjust=0.5))+
scale_color_identity(name = "",labels = c("BetandWin ","Calculated P(draw)"), breaks = c("red", "black"), guide = "legend")
plot_BetandWin
plot_Pinnacle <- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away3, y = draw_prob3, col = I("green"))) +
geom_point(data = df3, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("red")))+
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("PINNACLE")+theme(plot.title = element_text(hjust=0.5))+
scale_color_identity(name = "",labels = c("Pinnacle ","Calculated P(draw)"), breaks = c("green", "red"), guide = "legend")
plot_Pinnacle
plot_WilliamHill <- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away4, y = draw_prob4, col = I("gray"))) +
geom_point(data = df4, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("green")))+
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("WILLIAM HILL")+theme(plot.title = element_text(hjust=0.5))+
scale_color_identity(name = "",labels = c("William Hill ","Calculated P(draw)"), breaks = c("gray", "green"), guide = "legend")
plot_WilliamHill
# Remove matches one or more player are shown red card
without_red <- all_data %>% filter(HR == 0 & AR == 0)
without_red$discrete_val1 <- as.factor(without_red$discrete_val1)
levels(without_red$discrete_val1) <- c(levels(without_red$discrete_val1),"10")
df1$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val1))))
without_red$discrete_val2 <- as.factor(without_red$discrete_val2)
levels(without_red$discrete_val2) <- c(levels(without_red$discrete_val2),"10")
df2$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val2))))
without_red$discrete_val3 <- as.factor(without_red$discrete_val3)
df3$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val3))))
without_red$discrete_val4 <- as.factor(without_red$discrete_val4)
levels(without_red$discrete_val4) <- c(levels(without_red$discrete_val4),"4")
without_red$discrete_val4 <- factor(without_red$discrete_val4, levels = c("1","2","3","4","5","6","7","8","9","10"))
df4$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val4))))
for(i in 1:10){
df1$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D"
& as.factor(without_red$discrete_val1) == i))[3]
df2$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D"
& as.factor(without_red$discrete_val2) == i))[3]
df3$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D"
& as.factor(without_red$discrete_val3) == i))[3]
df4$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D"
& as.factor(without_red$discrete_val4) == i))[3]
}
#Replace NA with zero
df1$numberofdraw_without_red[is.na(df1$numberofdraw_without_red)] <- 0
df2$numberofdraw_without_red[is.na(df2$numberofdraw_without_red)] <- 0
df3$numberofdraw_without_red[is.na(df3$numberofdraw_without_red)] <- 0
df4$numberofdraw_without_red[is.na(df4$numberofdraw_without_red)] <- 0
# calculate p(draw) for matches without red card
for(i in 1:10){
df1$calc_pdraw_withoutred[i] = df1$numberofdraw_without_red[i] / df1$num_of_total_no_red [i]
df2$calc_pdraw_withoutred[i] = df2$numberofdraw_without_red[i] / df2$num_of_total_no_red [i]
df3$calc_pdraw_withoutred[i] = df3$numberofdraw_without_red[i] / df3$num_of_total_no_red [i]
df4$calc_pdraw_withoutred[i] = df4$numberofdraw_without_red[i] / df4$num_of_total_no_red [i]
}
df1$calc_pdraw_withoutred[is.nan(df1$calc_pdraw_withoutred)] <- 0
df2$calc_pdraw_withoutred[is.nan(df2$calc_pdraw_withoutred)] <- 0
df3$calc_pdraw_withoutred[is.nan(df3$calc_pdraw_withoutred)] <- 0
df4$calc_pdraw_withoutred[is.nan(df4$calc_pdraw_withoutred)] <- 0
plot1.1<- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away1, y = draw_prob1,col = I("gray"))) +
geom_point(data = df1, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("red"))) +
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET365")+theme(plot.title = element_text(hjust=0.5))+
geom_point(data = df1, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, col = I("black")))+
scale_color_identity(name = "",labels = c("Bet365","With Red Card","Without Red Card"), breaks = c("gray","red","black"), guide = "legend")
plot1.1
plot2.1<- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away2, y = draw_prob2, col = I("red"))) +
geom_point(data = df2, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("black"))) +
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET & WIN")+theme(plot.title = element_text(hjust=0.5))+
geom_point(data = df2, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, color = I("blue")))+
scale_color_identity(name = "",labels = c("BetandWin ","With RC", "Without RC"), breaks = c("red", "black", "blue"), guide = "legend")
plot2.1
plot3.1<- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away3, y = draw_prob3, col = I("green"))) +
geom_point(data = df3, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("red")))+
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("PINNACLE")+theme(plot.title = element_text(hjust=0.5)) +
geom_point(data = df3, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, color = I("black")))+
scale_color_identity(name = "",labels = c("Pinnacle ","With RC","Without RC"), breaks = c("green", "red","black"), guide = "legend")
plot3.1
plot4.1<- ggplot() +
theme_gray() +
geom_point(data = all_data, aes(x = home_away4, y = draw_prob4, col = I("gray"))) +
geom_point(data = df4, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("green")))+
xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("WILLIAM HILL")+theme(plot.title = element_text(hjust=0.5))+
geom_point(data = df4, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, color = I("black")))+
scale_color_identity(name = "",labels = c("William Hill ","With RC","Without RC"), breaks = c("gray", "green", "black"), guide = "legend")
plot4.1
There is no big change in my values when matches one or more players are shown red card are removed from data sets.It is safe to say that a little increase in P(draw) is generally seen but removing matches do not significantly impact on my results.