HOMEWORK 1

#Import Library
library(ggplot2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## <U+221A> tibble  3.0.4     <U+221A> dplyr   1.0.2
## <U+221A> tidyr   1.1.2     <U+221A> stringr 1.4.0
## <U+221A> readr   1.4.0     <U+221A> forcats 0.5.0
## <U+221A> purrr   0.3.4

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Read data
setwd("C:/Users/asus/Desktop")
Prem_21 <- read.csv("Premier_20.21.csv")
Prem_20 <- read.csv("Premier_19.20.csv")
Prem_19 <- read.csv("Premier_18.19.csv")

Prem_2120 <- rbind(Prem_21,Prem_20)
all_data <- merge(Prem_19, Prem_2120, all= T)

TASK 1

Task 1.1

 hist_1 <- ggplot(data = all_data) + geom_histogram(mapping = aes(x = FTHG, col = I("Black")),binwidth = 1,show.legend = F) + xlab("Home Goals") + ylab("Number of matches") +theme_light() + ggtitle("Hıstogram for Home Scores") + theme(plot.title = element_text(hjust=0.5))

hist_1

 hist_2 <- ggplot(data = all_data) + geom_histogram(mapping = aes(x=FTAG, col = I("Black")),show.legend = F, binwidth = 1) + xlab("Away goals") + ylab("Number of matches") +theme_light()+ ggtitle("Hıstogram for Away Scores") + theme(plot.title = element_text(hjust=0.5))

hist_2

hist_3 <- ggplot(data = all_data) + geom_histogram(mapping = aes(x = (FTHG - FTAG),  col = I("Black")),show.legend = F, binwidth = 1) + xlab("Home goals - Away goals") + ylab("Number of matches") +theme_light()+ ggtitle("Hıstogram for Home-Away Scores") + theme(plot.title = element_text(hjust=0.5))

hist_3

Task 1.2

# Histogram with Poisson Distr. for Home Goals
hist(all_data$FTHG, col = "blue",right = F, xlab = "Home Goals", ylab = "Number of Matches",breaks = 0:10,main="Histogram with Poisson Distribution", ylim = c(0,300))
pois_distr = dpois(c(0:8),lambda = mean(all_data$FTHG))* length(all_data$FTHG)
lines(pois_distr,col = "red")

# Histogram with Poisson Distr. for Away Goals
x <- all_data$FTAG
h<-hist(x, col="red",right = F, xlab="Away Goals", ylab = "Number of Matches",breaks = 0:10,ylim = c(0,300),
         main="Histogram with Poisson Distribution")
xfit<-seq(0.5,10.5, by = 1)
yfit<-dpois(c(0:10), lambda = mean(all_data$FTAG) )*length(x)
lines(xfit,yfit, col="blue", lwd=3)

These distributions are similar to poisson distribution

TASK 2

Task 2.1

In this part, P(home),P(tie) and P(win) are calculated.

# 2.1 calculate p(home win), p(tie), P(away win) for bet365, BetandWin, Pinnacle and


#  bet365
all_data$home_prob1 <- as.numeric(1 / all_data$B365H)
all_data$away_prob1 <- as.numeric(1 / all_data$B365A)
all_data$draw_prob1 <- as.numeric(1 / all_data$B365D)
# Betandwin
all_data$home_prob2 <- as.numeric(1 / all_data$BWH)
all_data$away_prob2 <- as.numeric(1 / all_data$BWA)
all_data$draw_prob2 <- as.numeric(1 / all_data$BWD)
#Pinnacle
all_data$home_prob3 <- as.numeric(1 / all_data$PSH)
all_data$away_prob3 <- as.numeric(1 / all_data$PSA)
all_data$draw_prob3 <- as.numeric(1 / all_data$PSD)
# William Hill
all_data$home_prob4 <- as.numeric(1 / all_data$WHH)
all_data$away_prob4 <- as.numeric(1 / all_data$WHA)
all_data$draw_prob4 <- as.numeric(1 / all_data$WHD)

Task 2.2

Probability calculation with normalization is done in this part of the homework.

# 2.2  probability calculation with normalization
for(i in 1:nrow(all_data)){
  
all_data$home_prob1[i] <- all_data$home_prob1[i] / (all_data$home_prob1[i] + all_data$away_prob1[i] + all_data$draw_prob1[i])
all_data$away_prob1[i] <- all_data$away_prob1[i] / (all_data$home_prob1[i] + all_data$away_prob1[i] + all_data$draw_prob1[i])
all_data$draw_prob1[i] <- all_data$draw_prob1[i] / (all_data$home_prob1[i] + all_data$away_prob1[i] + all_data$draw_prob1[i])


all_data$home_prob2[i] <- all_data$home_prob2[i] / (all_data$home_prob2[i] + all_data$away_prob2[i] + all_data$draw_prob2[i])
all_data$away_prob2[i] <- all_data$away_prob2[i] / (all_data$home_prob2[i] + all_data$away_prob2[i] + all_data$draw_prob2[i])
all_data$draw_prob2[i] <- all_data$draw_prob2[i] / (all_data$home_prob2[i] + all_data$away_prob2[i] + all_data$draw_prob2[i])  

all_data$home_prob3[i] <- all_data$home_prob3[i] / (all_data$home_prob3[i] + all_data$away_prob3[i] + all_data$draw_prob3[i])
all_data$away_prob3[i] <- all_data$away_prob3[i] / (all_data$home_prob3[i] + all_data$away_prob3[i] + all_data$draw_prob3[i])
all_data$draw_prob3[i] <- all_data$draw_prob3[i] / (all_data$home_prob3[i] + all_data$away_prob3[i] + all_data$draw_prob3[i])

all_data$home_prob4[i] <- all_data$home_prob4[i] / (all_data$home_prob4[i] + all_data$away_prob4[i] + all_data$draw_prob4[i])
all_data$away_prob4[i] <- all_data$away_prob4[i] / (all_data$home_prob4[i] + all_data$away_prob4[i] + all_data$draw_prob4[i])
all_data$draw_prob4[i] <- all_data$draw_prob4[i] / (all_data$home_prob4[i] + all_data$away_prob4[i] + all_data$draw_prob4[i])
}

# 2.3

# calculating P(home)- P(away)
all_data$home_away1 <-as.numeric(all_data$home_prob1 - all_data$away_prob1)
all_data$home_away2 <-as.numeric(all_data$home_prob2 - all_data$away_prob2)
all_data$home_away3 <-as.numeric(all_data$home_prob3 - all_data$away_prob3)
all_data$home_away4 <-as.numeric(all_data$home_prob4 - all_data$away_prob4)

# we can create a new column showing discrete values of P(home) - P(away) for [1 0.8) = 1 [0.8 0.6) = 2 ....  [-0.8 -1) = 10

# for Bet365
for(i in 1:nrow(all_data)){
  if( 0.8 < all_data$home_away1[i] & all_data$home_away1[i] <= 1){ 
    all_data$discrete_val1[i] = 1
    
    }else if(0.6 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.8){
      all_data$discrete_val1[i] = 2
      
    }else if(0.4 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.6){
      all_data$discrete_val1[i] = 3
      
    }else if(0.2 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.4){
      all_data$discrete_val1[i] = 4
      
    }else if(0 < all_data$home_away1[i] & all_data$home_away1[i] <= 0.2){
      all_data$discrete_val1[i] = 5
      
    }else if(-0.2 < all_data$home_away1[i] & all_data$home_away1[i] <= 0){
      all_data$discrete_val1[i] = 6
      
    }else if(-0.4 < all_data$home_away1[i] & all_data$home_away1[i] <= -0.2){
      all_data$discrete_val1[i] = 7
      
    }else if(-0.6 < all_data$home_away1[i] & all_data$home_away1[i] <= -0.4){
      all_data$discrete_val1[i] = 8
      
      
    }else if(-0.8< all_data$home_away1[i] & all_data$home_away1[i] <= -0.6){
      all_data$discrete_val1[i] = 9
      
    }else if(-1 < all_data$home_away1[i] & all_data$home_away1[i] <= -0.8){
      all_data$discrete_val1[i] = 10
      
    }
}

# for Bet and Win
for(i in 1:nrow(all_data)){
  if( 0.8 < all_data$home_away2[i] & all_data$home_away2[i] <= 1){ 
    all_data$discrete_val2[i] = 1
    
  }else if(0.6 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.8){
    all_data$discrete_val2[i] = 2
    
  }else if(0.4 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.6){
    all_data$discrete_val2[i] = 3
    
  }else if(0.2 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.4){
    all_data$discrete_val2[i] = 4
    
  }else if(0 < all_data$home_away2[i] & all_data$home_away2[i] <= 0.2){
    all_data$discrete_val2[i] = 5
    
  }else if(-0.2 < all_data$home_away2[i] & all_data$home_away2[i] <= 0){
    all_data$discrete_val2[i] = 6
    
  }else if(-0.4 < all_data$home_away2[i] & all_data$home_away2[i] <= -0.2){
    all_data$discrete_val2[i] = 7
    
  }else if(-0.6 < all_data$home_away2[i] & all_data$home_away2[i] <= -0.4){
    all_data$discrete_val2[i] = 8
    
    
  }else if(-0.8< all_data$home_away2[i] & all_data$home_away2[i] <= -0.6){
    all_data$discrete_val2[i] = 9
    
  }else if(-1 < all_data$home_away2[i] & all_data$home_away2[i] <= -0.8){
    all_data$discrete_val2[i] = 10
    
  }
}

# for Pinnacle
for(i in 1:nrow(all_data)){
  if( 0.8 < all_data$home_away3[i] & all_data$home_away3[i] <= 1){ 
    all_data$discrete_val3[i] = 1
    
  }else if(0.6 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.8){
    all_data$discrete_val3[i] = 2
    
  }else if(0.4 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.6){
    all_data$discrete_val3[i] = 3
    
  }else if(0.2 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.4){
    all_data$discrete_val3[i] = 4
    
  }else if(0 < all_data$home_away3[i] & all_data$home_away3[i] <= 0.2){
    all_data$discrete_val3[i] = 5
    
  }else if(-0.2 < all_data$home_away3[i] & all_data$home_away3[i] <= 0){
    all_data$discrete_val3[i] = 6
    
  }else if(-0.4 < all_data$home_away3[i] & all_data$home_away3[i] <= -0.2){
    all_data$discrete_val3[i] = 7
    
  }else if(-0.6 < all_data$home_away3[i] & all_data$home_away3[i] <= -0.4){
    all_data$discrete_val3[i] = 8
    
    
  }else if(-0.8< all_data$home_away3[i] & all_data$home_away3[i] <= -0.6){
    all_data$discrete_val3[i] = 9
    
  }else if(-1 < all_data$home_away3[i] & all_data$home_away3[i] <= -0.8){
    all_data$discrete_val3[i] = 10
    
  }
}


# for William Hill
for(i in 1:nrow(all_data)){
  if( 0.8 < all_data$home_away4[i] & all_data$home_away4[i] <= 1){ 
    all_data$discrete_val4[i] = 1
    
  }else if(0.6 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.8){
    all_data$discrete_val4[i] = 2
    
  }else if(0.4 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.6){
    all_data$discrete_val4[i] = 3
    
  }else if(0.2 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.4){
    all_data$discrete_val1[i] = 4
    
  }else if(0 < all_data$home_away4[i] & all_data$home_away4[i] <= 0.2){
    all_data$discrete_val4[i] = 5
    
  }else if(-0.2 < all_data$home_away4[i] & all_data$home_away4[i] <= 0){
    all_data$discrete_val4[i] = 6
    
  }else if(-0.4 < all_data$home_away4[i] & all_data$home_away4[i] <= -0.2){
    all_data$discrete_val4[i] = 7
    
  }else if(-0.6 < all_data$home_away4[i] & all_data$home_away4[i] <= -0.4){
    all_data$discrete_val4[i] = 8
    
    
  }else if(-0.8< all_data$home_away4[i] & all_data$home_away4[i] <= -0.6){
    all_data$discrete_val4[i] = 9
    
  }else if(-1 < all_data$home_away4[i] & all_data$home_away4[i] <= -0.8){
    all_data$discrete_val4[i] = 10
    
  }
}

# Create data frames for calculation of P(draw)
df1 = as.data.frame(summary((as.factor(all_data$discrete_val1))))
df1 = rename(df1, total_game1 = 'summary((as.factor(all_data$discrete_val1)))')

df2 = as.data.frame(summary((as.factor(all_data$discrete_val2))))
df2 = rename(df2, total_game1 = 'summary((as.factor(all_data$discrete_val2)))')
df2 = rbind(df2,"10" = 0)

df3 = as.data.frame(summary((as.factor(all_data$discrete_val3))))
df3 = rename(df3, total_game1 = 'summary((as.factor(all_data$discrete_val3)))')

df4 = as.data.frame(summary((as.factor(all_data$discrete_val4))))
df4 = rbind(df4,"4" = 0)
df4 = df4[ order(as.numeric(row.names(df4))), ]
df4 = as.data.frame(df4)
df4 = rename(df4, total_game1 = 'df4')

# calculating number of games ended draw and P(draw)
for(i in 1:10){ 
 df1$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val1) == i))[3]
 df2$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val2) == i))[3] 
 df3$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val3) == i))[3]
 df4$num_of_draw[i] = as.numeric(summary(as.factor(all_data$FTR) == "D" & as.factor(all_data$discrete_val4) == i))[3] 
}
df1$num_of_draw[is.na(df1$num_of_draw)] = 0
df2$num_of_draw[is.na(df2$num_of_draw)] = 0
df3$num_of_draw[is.na(df3$num_of_draw)] = 0
df4$num_of_draw[is.na(df4$num_of_draw)] = 0
for(i in 1:10){
  df1$calc_pdraw[i] = df1$num_of_draw[i] / df1$total_game[i]
  df2$calc_pdraw[i] = df2$num_of_draw[i] / df2$total_game[i]
  df3$calc_pdraw[i] = df3$num_of_draw[i] / df3$total_game[i]
  df4$calc_pdraw[i] = df4$num_of_draw[i] / df4$total_game[i]
}
df1$calc_pdraw[is.nan(df1$calc_pdraw)] = 0
df2$calc_pdraw[is.nan(df2$calc_pdraw)] = 0
df3$calc_pdraw[is.nan(df3$calc_pdraw)] = 0
df4$calc_pdraw[is.nan(df4$calc_pdraw)] = 0

Task 2.3

Bet 365

plot_Bet365 <-  ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away1, y = draw_prob1,col = I("gray"))) +
  geom_point(data = df1, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("red"))) +
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET365")+theme(plot.title = element_text(hjust=0.5))+
  scale_color_identity(name = "",labels = c("Bet365", "Calculated P(draw)"), breaks = c("gray", "red"), guide = "legend")


plot_Bet365

Bet & Win

plot_BetandWin <- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away2, y = draw_prob2, col = I("red"))) +
  geom_point(data = df2, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("black"))) + 
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET & WIN")+theme(plot.title = element_text(hjust=0.5))+
  scale_color_identity(name = "",labels = c("BetandWin ","Calculated P(draw)"), breaks = c("red", "black"), guide = "legend")


plot_BetandWin

Pinnacle

plot_Pinnacle  <- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away3, y = draw_prob3, col = I("green"))) +
  geom_point(data = df3, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("red")))+
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("PINNACLE")+theme(plot.title = element_text(hjust=0.5))+
  scale_color_identity(name = "",labels = c("Pinnacle ","Calculated P(draw)"), breaks = c("green", "red"), guide = "legend")



plot_Pinnacle

William Hill

plot_WilliamHill <- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away4, y = draw_prob4, col = I("gray"))) +
  geom_point(data = df4, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("green")))+
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("WILLIAM HILL")+theme(plot.title = element_text(hjust=0.5))+
  scale_color_identity(name = "",labels = c("William Hill ","Calculated P(draw)"), breaks = c("gray", "green"), guide = "legend")


plot_WilliamHill

TASK 3

# Remove matches one or more player are shown red card

without_red <- all_data %>% filter(HR == 0 & AR == 0)


without_red$discrete_val1 <- as.factor(without_red$discrete_val1)
levels(without_red$discrete_val1) <- c(levels(without_red$discrete_val1),"10")
df1$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val1))))

without_red$discrete_val2 <- as.factor(without_red$discrete_val2)
levels(without_red$discrete_val2) <- c(levels(without_red$discrete_val2),"10")
df2$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val2))))

without_red$discrete_val3 <- as.factor(without_red$discrete_val3)
df3$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val3))))


without_red$discrete_val4 <- as.factor(without_red$discrete_val4)
levels(without_red$discrete_val4) <- c(levels(without_red$discrete_val4),"4")
without_red$discrete_val4 <- factor(without_red$discrete_val4, levels = c("1","2","3","4","5","6","7","8","9","10"))
df4$num_of_total_no_red = as.numeric(summary((as.factor(without_red$discrete_val4))))


for(i in 1:10){
  
  df1$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D" 
                                                           & as.factor(without_red$discrete_val1) == i))[3]
  df2$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D" 
                                                       & as.factor(without_red$discrete_val2) == i))[3]
  df3$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D" 
                                                       & as.factor(without_red$discrete_val3) == i))[3]
  df4$numberofdraw_without_red[i] <-as.numeric(summary(as.factor(without_red$FTR) == "D" 
                                                       & as.factor(without_red$discrete_val4) == i))[3]
  
}

#Replace NA with zero
df1$numberofdraw_without_red[is.na(df1$numberofdraw_without_red)] <- 0
df2$numberofdraw_without_red[is.na(df2$numberofdraw_without_red)] <- 0
df3$numberofdraw_without_red[is.na(df3$numberofdraw_without_red)] <- 0
df4$numberofdraw_without_red[is.na(df4$numberofdraw_without_red)] <- 0

# calculate p(draw) for matches without red card
for(i in 1:10){
  df1$calc_pdraw_withoutred[i] = df1$numberofdraw_without_red[i] / df1$num_of_total_no_red [i]
  df2$calc_pdraw_withoutred[i] = df2$numberofdraw_without_red[i] / df2$num_of_total_no_red [i]
  df3$calc_pdraw_withoutred[i] = df3$numberofdraw_without_red[i] / df3$num_of_total_no_red [i]
  df4$calc_pdraw_withoutred[i] = df4$numberofdraw_without_red[i] / df4$num_of_total_no_red [i]
}

df1$calc_pdraw_withoutred[is.nan(df1$calc_pdraw_withoutred)] <- 0
df2$calc_pdraw_withoutred[is.nan(df2$calc_pdraw_withoutred)] <- 0
df3$calc_pdraw_withoutred[is.nan(df3$calc_pdraw_withoutred)] <- 0
df4$calc_pdraw_withoutred[is.nan(df4$calc_pdraw_withoutred)] <- 0

Bet365 without Red Card

plot1.1<- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away1, y = draw_prob1,col = I("gray"))) +
  geom_point(data = df1, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("red"))) +
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET365")+theme(plot.title = element_text(hjust=0.5))+
  geom_point(data = df1, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, col = I("black")))+
  scale_color_identity(name = "",labels = c("Bet365","With Red Card","Without Red Card"), breaks = c("gray","red","black"), guide = "legend")


plot1.1

Bet & Win without Red Card

plot2.1<- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away2, y = draw_prob2, col = I("red"))) +
  geom_point(data = df2, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, col = I("black"))) + 
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("BET & WIN")+theme(plot.title = element_text(hjust=0.5))+
  geom_point(data = df2, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, color = I("blue")))+
  scale_color_identity(name = "",labels = c("BetandWin ","With RC", "Without RC"), breaks = c("red", "black", "blue"), guide = "legend")


plot2.1

Pinnacle without Red Card

plot3.1<- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away3, y = draw_prob3, col = I("green"))) +
  geom_point(data = df3, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("red")))+
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("PINNACLE")+theme(plot.title = element_text(hjust=0.5)) +
  geom_point(data = df3, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, color = I("black")))+
  scale_color_identity(name = "",labels = c("Pinnacle ","With RC","Without RC"), breaks = c("green", "red","black"), guide = "legend")


plot3.1

William Hill without Red Card

plot4.1<- ggplot() +
  theme_gray() +
  geom_point(data = all_data, aes(x = home_away4, y = draw_prob4, col = I("gray"))) +
  geom_point(data = df4, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw, color = I("green")))+
  xlab("P(Home)- P(Away)") + ylab("P(tie)") + ggtitle("WILLIAM HILL")+theme(plot.title = element_text(hjust=0.5))+
  geom_point(data = df4, aes(x = seq(0.9,-0.9,length.out = 10), y = calc_pdraw_withoutred, color = I("black")))+
  scale_color_identity(name = "",labels = c("William Hill ","With RC","Without RC"), breaks = c("gray", "green", "black"), guide = "legend")



plot4.1

There is no big change in my values when matches one or more players are shown red card are removed from data sets.It is safe to say that a little increase in P(draw) is generally seen but removing matches do not significantly impact on my results.