In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import tree
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix,classification_report
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV,TimeSeriesSplit, cross_val_score



# Veriyi yükleme
data = pd.read_csv('match_data')

Data Preprocessing

In [20]:
# Eksik değerleri olan sütunların adları ve sayıları
missing_columns = data.isnull().sum()

# 0'dan fazla eksik değeri olan sütunları filtreleyip yazdırma
missing_columns = missing_columns[missing_columns > 0]

# Eksik değerleri olan tüm sütunları ekrana yazdırma
print("Eksik değeri olan sütunlar ve sayıları:")
print(missing_columns.to_string()) 

# current_state sütunundaki eksik değerleri silme
data.dropna(subset=['current_state'], inplace=True)

# Diğer sütunlar için daha hızlı eksik veri doldurma
# Önce fixture_id'ye göre grupla ve forward fill yap
columns_with_nulls = [col for col in data.columns if col != 'current_state' and data[col].isnull().sum() > 0]
data.sort_values(['fixture_id', 'current_time'], inplace=True)
data[columns_with_nulls] = data.groupby('fixture_id')[columns_with_nulls].transform(lambda x: x.ffill().round())

# Kalan eksik değerleri 0 ile doldur
data.fillna(0, inplace=True)

# Güncellenmiş veriyi kontrol etme
print("Eksik değerlerin bulunduğu sütunlar (temizlemeden sonra):\n", data.isnull().sum())
Eksik değeri olan sütunlar ve sayıları:
Accurate Crosses - away                 3028
Accurate Crosses - home                 3007
Assists - away                         27934
Assists - home                         27931
Attacks - away                            39
Attacks - home                            33
Ball Possession % - away                  23
Ball Possession % - home                  22
Ball Safe - away                       23428
Ball Safe - home                       23424
Challenges - away                       4681
Challenges - home                       4674
Corners - away                            74
Corners - home                            66
Counter Attacks - away                 41211
Counter Attacks - home                 41209
Dangerous Attacks - away                  53
Dangerous Attacks - home                  47
Dribble Attempts - away                 7869
Dribble Attempts - home                 7844
Fouls - away                            1627
Fouls - home                            1615
Free Kicks - away                      58465
Free Kicks - home                      58465
Goal Attempts - away                   27536
Goal Attempts - home                   27529
Goal Kicks - away                       4685
Goal Kicks - home                       4678
Goals - away                              39
Goals - home                              36
Headers - away                         10413
Headers - home                         10406
Hit Woodwork - away                      198
Hit Woodwork - home                      177
Injuries - away                        46318
Injuries - home                        46316
Interceptions - away                    4255
Interceptions - home                    4234
Key Passes - away                       5517
Key Passes - home                       5513
Long Passes - away                      5525
Long Passes - home                      5514
Offsides - away                        17877
Offsides - home                        17872
Passes - away                            292
Passes - home                            277
Penalties - away                          79
Penalties - home                          72
Redcards - away                           65
Redcards - home                           55
Saves - away                           10769
Saves - home                           10758
Shots Blocked - away                     236
Shots Blocked - home                     223
Shots Insidebox - away                   267
Shots Insidebox - home                   247
Shots Off Target - away                   36
Shots Off Target - home                   28
Shots On Target - away                    35
Shots On Target - home                    28
Shots Outsidebox - away                  272
Shots Outsidebox - home                  248
Substitutions - away                      85
Substitutions - home                      75
Successful Dribbles - away              5323
Successful Dribbles - home              5306
Successful Headers - away              10429
Successful Headers - home              10418
Successful Interceptions - away         2676
Successful Interceptions - home         2670
Successful Passes - away                 391
Successful Passes - home                 389
Successful Passes Percentage - away      163
Successful Passes Percentage - home      153
Tackles - away                          1960
Tackles - home                          1949
Throwins - away                         1133
Throwins - home                         1126
Total Crosses - away                    2215
Total Crosses - home                    2204
Yellowcards - away                        80
Yellowcards - home                        69
Yellowred Cards - away                 23244
Yellowred Cards - home                 23237
current_state                             39
Eksik değerlerin bulunduğu sütunlar (temizlemeden sonra):
 fixture_id                0
halftime                  0
current_time              0
half_start_datetime       0
match_start_datetime      0
                         ..
Yellowred Cards - away    0
Yellowred Cards - home    0
current_state             0
final_score               0
result                    0
Length: 106, dtype: int64
In [21]:
data.loc[data['result'] == 'X', 'result'] = 0
data.loc[data['current_state'] == 'X', 'current_state'] = 0

data['current_state'] = data['current_state'].astype(int)
data['result'] = data['result'].astype(int)

# Convert current_time to datetime
data['current_time'] = pd.to_datetime(data['current_time'])
data.to_csv('filled_match_groups_2.csv', index=False)
In [22]:
# Olasılıkların hesaplanması
data['P_home'] = 1 / data['1']
data['P_draw'] = 1 / data['X']
data['P_away'] = 1 / data['2']

data['P_sum'] = data['P_home'] + data['P_draw'] + data['P_away']
data['P_home_norm'] = data['P_home'] / data['P_sum']
data['P_draw_norm'] = data['P_draw'] / data['P_sum']
data['P_away_norm'] = data['P_away'] / data['P_sum']
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/2818227534.py:9: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data['P_away_norm'] = data['P_away'] / data['P_sum']
In [23]:
def calculate_last_five_minute_average(data):
    # Convert current_time to datetime once for the entire DataFrame
    data['current_time'] = pd.to_datetime(data['current_time'], errors='coerce')
    
    # Create a new DataFrame to store results
    results = pd.DataFrame(index=data.index, columns=['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2', 
                                                      'last_5_min_std_1', 'last_5_min_std_X', 'last_5_min_std_2'])

    for index, row in data.iterrows():
        current_time = row['current_time']
        fixture_id = row['fixture_id']
        
        # Check if current_time is valid
        if pd.isnull(current_time):
            results.loc[index] = row[['1', 'X', '2']].values.tolist() + [0, 0, 0]
            continue
        
        # Calculate the time window for the last 5 minutes
        time_window_start = current_time - pd.Timedelta(minutes=5)
        
        # Filter the data for the same fixture_id and within the last 5 minutes
        recent_data = data[(data['fixture_id'] == fixture_id) & 
                           (data['current_time'] >= time_window_start) & 
                           (data['current_time'] <= current_time)]
        
        # Calculate the average and standard deviation for '1', 'X', and '2' columns
        if not recent_data.empty:
            avg_values = recent_data[['1', 'X', '2']].mean().values
            std_values = recent_data[['1', 'X', '2']].std().values
            results.loc[index] = avg_values.tolist() + std_values.tolist()
        else:
            results.loc[index] = row[['1', 'X', '2']].values.tolist() + [0, 0, 0]

    # Assign results back to the original DataFrame
    data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2', 
          'last_5_min_std_1', 'last_5_min_std_X', 'last_5_min_std_2']] = results

# Call the function to add the columns to the data DataFrame
calculate_last_five_minute_average(data)
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1763149710.py:35: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2',
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1763149710.py:35: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2',
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1763149710.py:35: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2',
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1763149710.py:35: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2',
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1763149710.py:35: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2',
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1763149710.py:35: PerformanceWarning: DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
  data[['last_5_min_avg_1', 'last_5_min_avg_X', 'last_5_min_avg_2',
In [24]:
data[['last_5_min_std_1', 'last_5_min_std_X', 'last_5_min_std_2']] = data[[ 
      'last_5_min_std_1', 'last_5_min_std_X', 'last_5_min_std_2']].fillna(0)

data.head()
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/1277698379.py:1: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
  data[['last_5_min_std_1', 'last_5_min_std_X', 'last_5_min_std_2']] = data[[
Out[24]:
fixture_id halftime current_time half_start_datetime match_start_datetime minute second latest_bookmaker_update suspended stopped ... P_sum P_home_norm P_draw_norm P_away_norm last_5_min_avg_1 last_5_min_avg_X last_5_min_avg_2 last_5_min_std_1 last_5_min_std_X last_5_min_std_2
952 19134453.0 1st-half 2024-08-16 19:01:19 2024-08-16 19:00:31 2024-08-16 19:00:31 0 48 2024-08-16 19:01:17 False False ... 1.052410 0.572410 0.237550 0.190040 1.66 4.0 5.0 0.000000 0.0 0.000000
953 19134453.0 1st-half 2024-08-16 19:02:18 2024-08-16 19:00:31 2024-08-16 19:00:31 1 47 2024-08-16 19:02:16 False False ... 1.052410 0.572410 0.237550 0.190040 1.66 4.0 5.0 0.000000 0.0 0.000000
954 19134453.0 1st-half 2024-08-16 19:03:19 2024-08-16 19:00:31 2024-08-16 19:00:31 2 48 2024-08-16 19:03:15 False False ... 1.052936 0.589891 0.237431 0.172677 1.643333 4.0 5.166667 0.028868 0.0 0.288675
955 19134453.0 1st-half 2024-08-16 19:04:18 2024-08-16 19:00:31 2024-08-16 19:00:31 3 47 2024-08-16 19:04:08 False False ... 1.071118 0.579878 0.233401 0.186721 1.635 4.0 5.125 0.028868 0.0 0.250000
956 19134453.0 1st-half 2024-08-16 19:05:19 2024-08-16 19:00:31 2024-08-16 19:00:31 4 48 2024-08-16 19:05:15 False False ... 1.071118 0.579878 0.233401 0.186721 1.63 4.0 5.1 0.027386 0.0 0.223607

5 rows × 119 columns

In [25]:
# Kategorik değişkenleri ayrı ayrı encode et
halftime_encoder = LabelEncoder()

# Kategorik değişkenleri sayısala çevirme
data['halftime'] = halftime_encoder.fit_transform(data['halftime'])
In [26]:
# Split into training and test sets based on date
data = data.sort_values(['fixture_id', 'current_time'])
test_start_date = '2024-11-01'
validation_start_date =  '2024-10-20'
train_data = data[data['current_time'] < validation_start_date]
validation_data = data[(data['current_time'] < test_start_date)& (data['current_time'] >= validation_start_date)]
test_data = data[data['current_time'] >= test_start_date]

validation_data.head()
Out[26]:
fixture_id halftime current_time half_start_datetime match_start_datetime minute second latest_bookmaker_update suspended stopped ... P_sum P_home_norm P_draw_norm P_away_norm last_5_min_avg_1 last_5_min_avg_X last_5_min_avg_2 last_5_min_std_1 last_5_min_std_X last_5_min_std_2
45325 19134516.0 0 2024-10-20 15:33:19 2024-10-20 15:31:56 2024-10-20 15:31:56 1 23 2024-10-20 15:33:17 False False ... 1.046854 0.575448 0.212276 0.212276 1.66 4.5 4.5 0.000000 0.000000 0.000000
45326 19134516.0 0 2024-10-20 15:34:24 2024-10-20 15:31:56 2024-10-20 15:31:56 2 28 2024-10-20 15:34:17 False False ... 1.055579 0.570691 0.210522 0.218787 1.66 4.5 4.415 0.000000 0.000000 0.120208
45327 19134516.0 0 2024-10-20 15:35:28 2024-10-20 15:31:56 2024-10-20 15:31:56 3 32 2024-10-20 15:35:18 False False ... 1.064303 0.566013 0.216993 0.216993 1.66 4.443333 4.386667 0.000000 0.098150 0.098150
45328 19134516.0 0 2024-10-20 15:36:19 2024-10-20 15:31:56 2024-10-20 15:31:56 4 23 2024-10-20 15:36:13 False False ... 1.062342 0.547277 0.217394 0.235329 1.675 4.415 4.29 0.030000 0.098150 0.209284
45329 19134516.0 0 2024-10-20 15:37:18 2024-10-20 15:31:56 2024-10-20 15:31:56 5 22 2024-10-20 15:37:13 False False ... 1.062342 0.547277 0.217394 0.235329 1.684 4.398 4.232 0.032863 0.093113 0.222868

5 rows × 119 columns

In [27]:
print("Validation seti içindeki fixture sayısı:", validation_data["fixture_id"].nunique())
print("Test seti içindeki fixture sayısı:", test_data["fixture_id"].nunique())
print("Train seti içindeki fixture sayısı:", train_data["fixture_id"].nunique())
Validation seti içindeki fixture sayısı: 92
Test seti içindeki fixture sayısı: 111
Train seti içindeki fixture sayısı: 445
In [28]:
## Simple Decision Tree

X_train = train_data.drop(columns=['current_time', 'half_start_datetime', 'match_start_datetime', 
                                  'latest_bookmaker_update', 'suspended', 'stopped', 'ticking', 'final_score',
                                  'name','fixture_id','result'])

X_train_logistic = X_train.copy()

X_val = validation_data.drop(columns=['current_time', 'half_start_datetime', 'match_start_datetime', 
                                  'latest_bookmaker_update', 'suspended', 'stopped', 'ticking', 'final_score',
                                  'name','fixture_id','result'])

X_val_logistic = X_val.copy()

X_test = test_data.drop(columns=['current_time', 'half_start_datetime', 'match_start_datetime',
                                'latest_bookmaker_update', 'suspended', 'stopped', 'ticking', 'final_score', 
                                'name','fixture_id','result'])
X_test_logistic = X_test.copy()

y_train = train_data['result'].copy()  # result değerini güncellemiyoruz
y_train_logistic = y_train.copy()


y_val = validation_data['result'].copy()  # result değerini güncellemiyoruz
y_val_logistic = y_val.copy()

y_test = test_data['result'].copy()    # result değerini güncellemiyoruz
y_test_logistic = y_test.copy()

clf = DecisionTreeClassifier(random_state=42, max_depth=15, min_samples_split=20)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
y_pred_val = clf.predict(X_val)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')


accuracy_val = accuracy_score(y_val, y_pred_val)
print(f'Validation Accuracy: {accuracy_val * 100:.2f}%')
Accuracy: 56.39%
Validation Accuracy: 54.56%
In [29]:
## Logistic Regression

# Ölçeklendirme (Standartlaştırma)
scaler = StandardScaler()
X_train_logistic_scaled = scaler.fit_transform(X_train_logistic)
X_val_logistic_scaled = scaler.transform(X_val_logistic)
X_test_logistic_scaled = scaler.transform(X_test_logistic)

param_grid = {
    'solver': ['lbfgs', 'saga'],
    'C': [0.1, 1.0, 10.0],  # Regularization strength
    'max_iter': [1000, 2000, 3000],
}

#rid_search = GridSearchCV(
#   LogisticRegression(multi_class='multinomial', random_state=42),
#   param_grid,
#   cv=5,
#   scoring='accuracy'
#
#
#rid_search.fit(X_train_logistic_scaled, y_train_logistic)
#rint(f"Best parameters: {grid_search.best_params_}")


# Logistic Regression modelini oluşturma
logis_reg_model = LogisticRegression(
    solver='lbfgs',  # Varsayılan çözümleyici (uygun ve hızlı)
    max_iter=1000,   # Daha yüksek iterasyon sınırı
    C=0.1,
    random_state=42
)

# Modeli eğitme
logis_reg_model.fit(X_train_logistic_scaled, y_train_logistic)

# Tahmin olasılıklarını hesaplama
train_probabilities_logis_reg = logis_reg_model.predict_proba(X_train_logistic_scaled)

val_probabilities_logis_reg = logis_reg_model.predict_proba(X_val_logistic_scaled)
test_probabilities_logis_reg = logis_reg_model.predict_proba(X_test_logistic_scaled)

# Veri çerçevesine olasılıkları ekleme
data.loc[train_data.index, 'win_probability_1_logis_reg'] = train_probabilities_logis_reg[:, 0]  # 0 için olasılık ekleniyor
data.loc[train_data.index, 'win_probability_2_logis_reg'] = train_probabilities_logis_reg[:, 1]  # 1 için olasılık ekleniyor
data.loc[train_data.index, 'win_probability_X_logis_reg'] = train_probabilities_logis_reg[:, 2]  # 2 için olasılık ekleniyor

data.loc[validation_data.index, 'win_probability_1_logis_reg'] = val_probabilities_logis_reg[:, 0]  # 0 için olasılık ekleniyor
data.loc[validation_data.index, 'win_probability_2_logis_reg'] = val_probabilities_logis_reg[:, 1]  # 1 için olasılık ekleniyor
data.loc[validation_data.index, 'win_probability_X_logis_reg'] = val_probabilities_logis_reg[:, 2]  # 2 için olasılık ekleniyor

data.loc[test_data.index, 'win_probability_1_logis_reg'] = test_probabilities_logis_reg[:, 0]  # 0 için olasılık ekleniyor
data.loc[test_data.index, 'win_probability_2_logis_reg'] = test_probabilities_logis_reg[:, 1]  # 1 için olasılık ekleniyor
data.loc[test_data.index, 'win_probability_X_logis_reg'] = test_probabilities_logis_reg[:, 2]  # 2 için olasılık ekleniyor

# Test seti üzerinde tahminler

y_pred_logis_reg = logis_reg_model.predict(X_test_logistic_scaled)
y_pred_val_logis_reg = logis_reg_model.predict(X_val_logistic_scaled)

# Tahminleri veri setine ekleme
data.loc[test_data.index, 'predictions_logis_reg'] = y_pred_logis_reg  # Tahminleri ekleme
data.loc[validation_data.index, 'predictions_logis_reg'] = y_pred_val_logis_reg  # Tahminleri ekleme
# Model performansı değerlendirme
accuracy_logis_reg = accuracy_score(y_test_logistic, y_pred_logis_reg)
print(f"\nDoğru Tahmin Oranı: {accuracy_logis_reg:.2%}")

accuracy_logis_reg_val = accuracy_score(y_val_logistic, y_pred_val_logis_reg)
print(f"\nValidation Doğru Tahmin Oranı: {accuracy_logis_reg_val:.2%}")


# Detaylı performans metrikleri
precision_logis_reg = precision_score(y_test_logistic, y_pred_logis_reg, average=None)
recall_logis_reg = recall_score(y_test_logistic, y_pred_logis_reg, average=None)
f1_logis_reg = f1_score(y_test_logistic, y_pred_logis_reg, average=None)

metrics_logis_reg_df = pd.DataFrame({
    'Sınıf': ['0', '1', '2'],
    'Precision': precision_logis_reg,
    'Recall': recall_logis_reg,
    'F1-score': f1_logis_reg
})
print("\nModel Performans Metrikleri (Logistic Regression):")
print(metrics_logis_reg_df.to_string(index=False))
Doğru Tahmin Oranı: 61.91%

Validation Doğru Tahmin Oranı: 58.26%

Model Performans Metrikleri (Logistic Regression):
Sınıf  Precision   Recall  F1-score
    0   0.363043 0.402642  0.381818
    1   0.746410 0.713125  0.729388
    2   0.616645 0.614832  0.615737
In [30]:
# RANDOM FOREST KODU:

# NaN değerleri doldur
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

X_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Sınıf dağılımını kontrol et
print("Eğitim setindeki sınıf dağılımı:")
print(y_train.value_counts(normalize=True))

# SMOTE ile dengesiz veri setini dengele
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
Eğitim setindeki sınıf dağılımı:
result
1    0.423456
2    0.300708
0    0.275835
Name: proportion, dtype: float64
In [ ]:
## Parameter tuning for random forest ( uzun sürüyor)
# Grid search için parametre setleri
param_grid = {
    'n_estimators': [500, 1000,1500],
    'max_depth': [15, 20],
    'min_samples_split': [5, 10, 15],
    'min_samples_leaf': [4, 6]
}

# Model oluştur
rf = RandomForestClassifier(max_features='sqrt', random_state=42, n_jobs=-1)

# GridSearchCV tanımla
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,        
    n_jobs=-1,
    verbose=2
)

# Modeli eğit
grid_search.fit(X_train_balanced, y_train_balanced)


# En iyi parametreler ve skor
print("En iyi parametreler:", grid_search.best_params_)
In [31]:
# Random Forest modelini oluştur
rf_model = RandomForestClassifier(
    n_estimators= 500,#1000,
    max_depth=20,
    min_samples_split=5, #10,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=42,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1
)

#tscv = TimeSeriesSplit(n_splits=5)
#cv_scores = cross_val_score(rf_model, X_train, y_train, cv=tscv)
#print(f"Time Series Cross-validation scores: {cv_scores}")
#print(f"Average CV score: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

# Modeli eğit
rf_model.fit(X_train_balanced, y_train_balanced)

# Tahmin olasılıklarını hesapla
train_probabilities = rf_model.predict_proba(X_train)
validation_probabilities = rf_model.predict_proba(X_val)
test_probabilities = rf_model.predict_proba(X_test)

# Kazanma olasılıklarını data frame'e ekle
data.loc[train_data.index, 'win_probability_1'] = train_probabilities[:, 1]  # 1 için olasılık ekleniyor
data.loc[train_data.index, 'win_probability_2'] = train_probabilities[:, 2]  # 2 için olasılık ekleniyor
data.loc[train_data.index, 'win_probability_X'] = train_probabilities[:, 0]  # X için olasılık ekleniyor

data.loc[validation_data.index, 'win_probability_1'] = validation_probabilities[:, 1]  # 1 için olasılık ekleniyor
data.loc[validation_data.index, 'win_probability_2'] = validation_probabilities[:, 2]  # 2 için olasılık ekleniyor
data.loc[validation_data.index, 'win_probability_X'] = validation_probabilities[:, 0]  # X için olasılık ekleniyor

data.loc[test_data.index, 'win_probability_1'] = test_probabilities[:, 1]  # 1 için olasılık ekleniyor
data.loc[test_data.index, 'win_probability_2'] = test_probabilities[:, 2]  # 2 için olasılık ekleniyor
data.loc[test_data.index, 'win_probability_X'] = test_probabilities[:, 0]  # X için olasılık ekleniyor

# Validation seti üstünden tahmin yap
y_pred_val = rf_model.predict(X_val)

# Test seti üzerinde tahmin yap
y_pred = rf_model.predict(X_test)

# Tahminleri ana veri setine ekleyelim
data.loc[test_data.index, 'predictions'] = y_pred  # Tahmin edilen değerleri ekliyoruz
data.loc[validation_data.index, 'predictions'] = y_pred_val  # Tahmin edilen değerleri ekliyoruz
# Model performansını değerlendir
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Seti Doğru Tahmin Oranı: {accuracy:.2%}")

val_accuracy = accuracy_score(y_val, y_pred_val)
print(f"\nValidation Seti Doğru Tahmin Oranı: {val_accuracy:.2%}")
Test Seti Doğru Tahmin Oranı: 64.29%

Validation Seti Doğru Tahmin Oranı: 63.96%
In [32]:
# Detaylı performans metrikleri
precision = precision_score(y_test, y_pred, labels=[0, 1, 2], average=None)
recall = recall_score(y_test, y_pred, labels=[0, 1, 2], average=None)
f1 = f1_score(y_test, y_pred, labels=[0, 1, 2], average=None)

metrics_df = pd.DataFrame({
    'Sınıf': ['0', '1', '2'],
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1
})
print("\nTest Set Model Model Performance Metrics:")
print(metrics_df.to_string(index=False))

# Confusion Matrix görselleştirme
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Time Series Model - Test Set Confusion Matrix')
plt.ylabel('Gerçek Değerler')
plt.xlabel('Tahmin Edilen Değerler')
plt.show()


precision_val = precision_score(y_val, y_pred_val, labels=[0, 1, 2], average=None)
recall_val = recall_score(y_val, y_pred_val, labels=[0, 1, 2], average=None)
f1_val = f1_score(y_val, y_pred_val, labels=[0, 1, 2], average=None)

metrics_df_val = pd.DataFrame({
    'Sınıf': ['0', '1', '2'],
    'Precision': precision_val,
    'Recall': recall_val,
    'F1-score': f1_val
})
print("\nValidation Set Model Performance Metrics:")
print(metrics_df_val.to_string(index=False))

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_val, y_pred_val)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Time Series Model - Validation Set Confusion Matrix')
plt.ylabel('Gerçek Değerler')
plt.xlabel('Tahmin Edilen Değerler')
plt.show()
Test Set Model Model Performance Metrics:
Sınıf  Precision   Recall  F1-score
    0   0.417528 0.430337  0.423836
    1   0.799876 0.701390  0.747403
    2   0.583765 0.700098  0.636661
No description has been provided for this image
Validation Set Model Performance Metrics:
Sınıf  Precision   Recall  F1-score
    0   0.426326 0.417709  0.421974
    1   0.735064 0.684964  0.709130
    2   0.666570 0.731248  0.697412
No description has been provided for this image
In [33]:
# Özellik önemliliklerini görüntüle
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)
print("\nEn önemli 10 özellik:")
print(feature_importance.head(10))
En önemli 10 özellik:
              feature  importance
101       P_away_norm    0.045956
99        P_home_norm    0.042057
3                   1    0.039627
97             P_away    0.035815
4                   2    0.033244
95             P_home    0.031039
102  last_5_min_avg_1    0.030210
104  last_5_min_avg_2    0.029256
100       P_draw_norm    0.025206
96             P_draw    0.021635
In [34]:
# Tahminleri ana veri setine ekleyelim
data.loc[test_data.index, 'predictions'] = y_pred
data.loc[validation_data.index, 'predictions'] = y_pred_val
# Test verisindeki tahminleri analiz edelim
test_analysis = test_data.copy()
test_analysis['predictions'] = y_pred

# Tahmin-sonuç kombinasyonlarını oluştur
test_analysis['prediction_result'] = test_analysis['predictions'].astype(str) + '_tahmin_' + test_analysis['result'].astype(str)

# Her kombinasyon için analiz
combination_analysis = test_analysis.groupby(['minute', 'halftime']).apply(
    lambda x: pd.Series({
        '0_tahmin_0': len(x[x['prediction_result'] == '0_tahmin_0']),
        '0_tahmin_1': len(x[x['prediction_result'] == '0_tahmin_1']),
        '0_tahmin_2': len(x[x['prediction_result'] == '0_tahmin_2']),
        '1_tahmin_0': len(x[x['prediction_result'] == '1_tahmin_0']),
        '1_tahmin_1': len(x[x['prediction_result'] == '1_tahmin_1']),
        '1_tahmin_2': len(x[x['prediction_result'] == '1_tahmin_2']),
        '2_tahmin_0': len(x[x['prediction_result'] == '2_tahmin_0']),
        '2_tahmin_1': len(x[x['prediction_result'] == '2_tahmin_1']),
        '2_tahmin_2': len(x[x['prediction_result'] == '2_tahmin_2'])
    })
).fillna(0)

# Toplam ve doğru tahminleri hesapla
test_analysis['correct'] = test_analysis['predictions'] == test_analysis['result']
summary_analysis = test_analysis.groupby(['minute', 'halftime']).agg({
    'fixture_id': 'count',  # toplam tahmin sayısı
    'correct': 'sum'  # doğru tahmin sayısı
}).sort_index(level=['halftime', 'minute'])

summary_analysis.columns = ['Toplam_Tahmin', 'Doğru_Tahmin']
summary_analysis['Başarı_Oranı'] = (summary_analysis['Doğru_Tahmin'] / summary_analysis['Toplam_Tahmin']).round(3)
summary_analysis['Hata_Oranı'] = (1 - summary_analysis['Başarı_Oranı']).round(3)

# Dakika bazında ortalama başarı oranları
minute_success = test_analysis.groupby('minute')['correct'].agg(['mean', 'count']).round(3)
minute_success.columns = ['Ortalama_Başarı', 'Toplam_Maç']

# Sonuçları görsel tablolar halinde göster
plt.figure(figsize=(20, 40))

# İlk tablo - Kombinasyon Analizi
plt.subplot(1, 2, 1)
combination_analysis = combination_analysis.sort_index(level=['halftime', 'minute'])  # Y eksenini halftime ve minute'a göre sırala
sns.heatmap(combination_analysis, annot=True, fmt='d', cmap='YlOrRd', 
            annot_kws={'size': 10, 'weight': 'bold'}, cbar=False)  # Renk barını kaldırdık
plt.title('Tahmin-Sonuç Kombinasyonlarına Göre Dağılım\n(1: Ev Sahibi Kazanır, 0: Berabere, 2: Deplasman Kazanır)', 
          fontsize=14, pad=20)
plt.xlabel('Tahmin-Sonuç Kombinasyonları', fontsize=12)
plt.ylabel('(Dakika, Yarı)', fontsize=12)
plt.xticks(rotation=45)

# İkinci tablo - Özet Analiz
plt.subplot(1, 2, 2)
sns.heatmap(summary_analysis, annot=True, fmt='.3f', cmap='YlOrRd',
            annot_kws={'size': 10, 'weight': 'bold'}, cbar=False)  # Renk barını kaldırdık
plt.title('Dakika ve Yarıya Göre Tahmin Performansı', fontsize=14, pad=20)
plt.xlabel('Performans Metrikleri', fontsize=12)
plt.ylabel('(Dakika, Yarı)', fontsize=12)

plt.tight_layout(pad=3.0)
plt.show()
/var/folders/ys/74vbprgx5wb7ggv44nbd6f240000gn/T/ipykernel_4648/2813062140.py:12: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
  combination_analysis = test_analysis.groupby(['minute', 'halftime']).apply(
No description has been provided for this image
In [35]:
plt.figure(figsize=(12, 6))
filtered_data = data[data['fixture_id'] == 19134533.0]
plt.plot(filtered_data['current_time'], (filtered_data['win_probability_1'] - (1/filtered_data['1'])), label='Win Probability 1', color='blue')
plt.plot(filtered_data['current_time'], (filtered_data['win_probability_2'] - (1/filtered_data['2'])), label='Win Probability 2', color='orange')
plt.plot(filtered_data['current_time'], (filtered_data['win_probability_X'] - (1/filtered_data['X'])), label='Win Probability X', color='green')
plt.title('Win Probabilities Over Time')
plt.xlabel('Current Time')
plt.ylabel('Win Probability')
plt.legend()
plt.grid()
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
filtered_data = data[data['fixture_id'] == 19134533.0]
plt.plot(filtered_data['current_time'], filtered_data['1'], label='Win Bet 1', color='blue')
plt.plot(filtered_data['current_time'], filtered_data['2'], label='Win Bet 2', color='orange')
plt.plot(filtered_data['current_time'], filtered_data['X'], label='Win Bet X', color='green')
plt.title('Win Probabilities Over Time')
plt.xlabel('Current Time')
plt.ylabel('Win Probability')
plt.legend()
plt.grid()
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [36]:
# İddia oynama ve puan hesaplama
data['points'] = 0.0  # Puanları tutmak için yeni bir sütun ekleyelim, float olarak başlatıyoruz
data['bet_made'] = False  # İddia yapılıp yapılmadığını kontrol etmek için

# Kazanma olasılığı için eşik değerleri ve row'lar için olasılık aralıkları
thresholds = [round(0.33 + i * 0.01, 2) for i in range(13)]  
row_probabilities = [round(1.8 + i * 0.1, 1) for i in range(8)]

#thresholds = [0.7]
#row_probabilities = [1.9]

# Fixture ID'leri için bir set oluştur
played_fixtures = set()

# İddia oynama ve puan hesaplama
total_bets = 0  # İddia oynanan maç sayısı
total_points = 0.0  # Toplam puan, float olarak başlatıyoruz
correct_predictions = 0  # Doğru tahmin sayısı

# Fixture ID'lerine göre toplam maç sayısını belirle
total_matches = data['fixture_id'].nunique()  # Toplam fixture_id sayısını al

# DataFrame'i current_time'a göre sıralama
data = data.sort_values(by='current_time')

# Sonuçları saklamak için bir liste
results = []
bet_made_true_rows = []  # Bet yapılmış satırları saklamak için yeni bir liste

for threshold in thresholds:
    for row_prob in row_probabilities:
        # Her kombinasyon için toplam puanları sıfırla
        total_points_combination = 0.0
        total_bets_combination = 0  # İddia oynanan maç sayısını sıfırla
        correct_predictions_combination = 0  # Doğru tahmin sayısını sıfırla
        
        # Fixture ID'lerine göre gruplama ve sıralama
        for fixture_id, fixture_data in data.groupby('fixture_id'):
            fixture_data = fixture_data.sort_values(by='current_time')  # Her fixture_id için current_time'a göre sıralama
            
            # İddia oynama kararı için bir bayrak
            bet_made_for_fixture = False
            
            for index, row in fixture_data.iterrows():
                if (not row['bet_made'] and 
                    row['current_time'] >=  pd.Timestamp(validation_start_date) and row['current_time'] < pd.Timestamp(test_start_date) and
                    #row['current_time'] <= pd.Timestamp.now() and 
                    not bet_made_for_fixture):  # İddia yapılmamış ve geçerli zaman, sadece bir iddia oynanacak
                
                    # Kazanma olasılıklarını kontrol et
                    win_prob_1 = row['win_probability_1']
                    win_prob_2 = row['win_probability_2']
                    win_prob_X = row['win_probability_X']
                    
                    # İddia oynama kararı
                    prediction = None
                    max_prob = max(win_prob_1, win_prob_2, win_prob_X)  # En yüksek olasılığı bul
                    if win_prob_1 == max_prob and win_prob_1 > threshold and row['1'] > row_prob and win_prob_1 <= (1 / row['1']):
                        prediction = 1  # Ev sahibi kazanır
                    elif win_prob_2 == max_prob and win_prob_2 > threshold and row['2'] > row_prob and win_prob_2 <= (1 / row['2']):
                        prediction = 2  # Deplasman
                    elif win_prob_X == max_prob and win_prob_X > threshold and row['X'] > row_prob and win_prob_X <= (1 / row['X']):
                        prediction = 0  # Beraberlik
                    
                    if prediction is not None:
                        # Bet yapılacak satırı kaydet
                        bet_made_true_rows.append(row)  # Bet yapılacak satırı ekle
                        # İddia yapıldı
                        data.at[index, 'bet_made'] = True  # Değişiklik burada yapılıyor
                        played_fixtures.add(row['fixture_id'])  # Fixture ID'yi ekle
                        total_bets_combination += 1  # İddia oynanan maç sayısını artır
                        bet_made_for_fixture = True  # Bu fixture için iddia yapıldı
                        
                        # Gerçek sonuç ile tahmini karşılaştır
                        if row['result'] == prediction:
                            # Kazanma durumu, puanı doğrudan ilgili kolondan al
                            if prediction == 0:
                                data.at[index, 'points'] += row['X'] - 1  # Beraberlik
                            elif prediction == 1:
                                data.at[index, 'points'] += row['1'] - 1  # Ev
                            elif prediction == 2:
                                data.at[index, 'points'] += row['2'] - 1  # Deplasman kazanır
                            total_points_combination += data.at[index, 'points']  # Toplam puanı güncelle
                            correct_predictions_combination += 1  # Doğru tahmin sayısını artır
                        else:
                            # Kaybetme durumu
                            data.at[index, 'points'] -= 1
                            total_points_combination += data.at[index, 'points']  # Toplam puanı güncelle

        # Her kombinasyon için sonuçları sakla
        results.append({
            'Threshold': threshold,
            'Row Probability': row_prob,
            'Total Bets': total_bets_combination,
            'Total Points': total_points_combination,
            'Correct Prediction Rate': correct_predictions_combination / total_bets_combination if total_bets_combination > 0 else 0
        })

        # Her kombinasyon için bet_made'leri false'a çek ve points'i sıfırla
        data['bet_made'] = False
        data['points'] = 0.0  # points'i sıfırla, float olarak

        # Her kombinasyon için sonuçları print et
        print(f"Threshold: {threshold}, Row Probability: {row_prob}, Total Bets: {total_bets_combination}, Total Points: {total_points_combination}, Correct Prediction Rate: {correct_predictions_combination / total_bets_combination if total_bets_combination > 0 else 0}")

# DataFrame olarak sonuçları çıkar ve toplam puana göre sırala
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Total Points', ascending=False)  # Total Points'e göre büyükten küçüğe sırala
results_df
Threshold: 0.33, Row Probability: 1.8, Total Bets: 67, Total Points: -6.550000000000001, Correct Prediction Rate: 0.417910447761194
Threshold: 0.33, Row Probability: 1.9, Total Bets: 59, Total Points: -3.290000000000001, Correct Prediction Rate: 0.423728813559322
Threshold: 0.33, Row Probability: 2.0, Total Bets: 55, Total Points: -3.090000000000001, Correct Prediction Rate: 0.41818181818181815
Threshold: 0.33, Row Probability: 2.1, Total Bets: 43, Total Points: -2.9899999999999998, Correct Prediction Rate: 0.3953488372093023
Threshold: 0.33, Row Probability: 2.2, Total Bets: 38, Total Points: 2.3599999999999994, Correct Prediction Rate: 0.4473684210526316
Threshold: 0.33, Row Probability: 2.3, Total Bets: 32, Total Points: 4.410000000000004, Correct Prediction Rate: 0.46875
Threshold: 0.33, Row Probability: 2.4, Total Bets: 21, Total Points: 1.9900000000000002, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.33, Row Probability: 2.5, Total Bets: 12, Total Points: -3.8899999999999997, Correct Prediction Rate: 0.25
Threshold: 0.34, Row Probability: 1.8, Total Bets: 67, Total Points: -6.550000000000001, Correct Prediction Rate: 0.417910447761194
Threshold: 0.34, Row Probability: 1.9, Total Bets: 59, Total Points: -3.290000000000001, Correct Prediction Rate: 0.423728813559322
Threshold: 0.34, Row Probability: 2.0, Total Bets: 55, Total Points: -3.090000000000001, Correct Prediction Rate: 0.41818181818181815
Threshold: 0.34, Row Probability: 2.1, Total Bets: 43, Total Points: -2.9899999999999998, Correct Prediction Rate: 0.3953488372093023
Threshold: 0.34, Row Probability: 2.2, Total Bets: 38, Total Points: 2.3599999999999994, Correct Prediction Rate: 0.4473684210526316
Threshold: 0.34, Row Probability: 2.3, Total Bets: 32, Total Points: 4.410000000000004, Correct Prediction Rate: 0.46875
Threshold: 0.34, Row Probability: 2.4, Total Bets: 21, Total Points: 1.9900000000000002, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.34, Row Probability: 2.5, Total Bets: 12, Total Points: -3.8899999999999997, Correct Prediction Rate: 0.25
Threshold: 0.35, Row Probability: 1.8, Total Bets: 66, Total Points: -3.3000000000000007, Correct Prediction Rate: 0.4393939393939394
Threshold: 0.35, Row Probability: 1.9, Total Bets: 58, Total Points: -0.040000000000000924, Correct Prediction Rate: 0.4482758620689655
Threshold: 0.35, Row Probability: 2.0, Total Bets: 54, Total Points: 0.15999999999999925, Correct Prediction Rate: 0.4444444444444444
Threshold: 0.35, Row Probability: 2.1, Total Bets: 42, Total Points: 0.25999999999999934, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.35, Row Probability: 2.2, Total Bets: 38, Total Points: 4.609999999999999, Correct Prediction Rate: 0.47368421052631576
Threshold: 0.35, Row Probability: 2.3, Total Bets: 31, Total Points: 5.410000000000004, Correct Prediction Rate: 0.4838709677419355
Threshold: 0.35, Row Probability: 2.4, Total Bets: 19, Total Points: 1.12, Correct Prediction Rate: 0.42105263157894735
Threshold: 0.35, Row Probability: 2.5, Total Bets: 10, Total Points: -4.76, Correct Prediction Rate: 0.2
Threshold: 0.36, Row Probability: 1.8, Total Bets: 65, Total Points: -7.100000000000001, Correct Prediction Rate: 0.4153846153846154
Threshold: 0.36, Row Probability: 1.9, Total Bets: 57, Total Points: -3.8400000000000007, Correct Prediction Rate: 0.42105263157894735
Threshold: 0.36, Row Probability: 2.0, Total Bets: 53, Total Points: -3.640000000000001, Correct Prediction Rate: 0.41509433962264153
Threshold: 0.36, Row Probability: 2.1, Total Bets: 41, Total Points: -3.5399999999999996, Correct Prediction Rate: 0.3902439024390244
Threshold: 0.36, Row Probability: 2.2, Total Bets: 35, Total Points: 2.8099999999999996, Correct Prediction Rate: 0.45714285714285713
Threshold: 0.36, Row Probability: 2.3, Total Bets: 28, Total Points: 3.54, Correct Prediction Rate: 0.4642857142857143
Threshold: 0.36, Row Probability: 2.4, Total Bets: 17, Total Points: 0.6200000000000001, Correct Prediction Rate: 0.4117647058823529
Threshold: 0.36, Row Probability: 2.5, Total Bets: 10, Total Points: -4.76, Correct Prediction Rate: 0.2
Threshold: 0.37, Row Probability: 1.8, Total Bets: 64, Total Points: -6.020000000000001, Correct Prediction Rate: 0.421875
Threshold: 0.37, Row Probability: 1.9, Total Bets: 56, Total Points: -2.7600000000000007, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.37, Row Probability: 2.0, Total Bets: 52, Total Points: -2.510000000000001, Correct Prediction Rate: 0.4230769230769231
Threshold: 0.37, Row Probability: 2.1, Total Bets: 40, Total Points: -2.4100000000000006, Correct Prediction Rate: 0.4
Threshold: 0.37, Row Probability: 2.2, Total Bets: 34, Total Points: 3.9399999999999986, Correct Prediction Rate: 0.47058823529411764
Threshold: 0.37, Row Probability: 2.3, Total Bets: 27, Total Points: 4.670000000000002, Correct Prediction Rate: 0.48148148148148145
Threshold: 0.37, Row Probability: 2.4, Total Bets: 15, Total Points: 2.62, Correct Prediction Rate: 0.4666666666666667
Threshold: 0.37, Row Probability: 2.5, Total Bets: 8, Total Points: -2.76, Correct Prediction Rate: 0.25
Threshold: 0.38, Row Probability: 1.8, Total Bets: 62, Total Points: -7.090000000000002, Correct Prediction Rate: 0.41935483870967744
Threshold: 0.38, Row Probability: 1.9, Total Bets: 54, Total Points: 0.669999999999999, Correct Prediction Rate: 0.46296296296296297
Threshold: 0.38, Row Probability: 2.0, Total Bets: 49, Total Points: -0.08000000000000274, Correct Prediction Rate: 0.4489795918367347
Threshold: 0.38, Row Probability: 2.1, Total Bets: 37, Total Points: -1.9299999999999997, Correct Prediction Rate: 0.40540540540540543
Threshold: 0.38, Row Probability: 2.2, Total Bets: 31, Total Points: 4.419999999999998, Correct Prediction Rate: 0.4838709677419355
Threshold: 0.38, Row Probability: 2.3, Total Bets: 23, Total Points: 3.6500000000000004, Correct Prediction Rate: 0.4782608695652174
Threshold: 0.38, Row Probability: 2.4, Total Bets: 13, Total Points: 2.0, Correct Prediction Rate: 0.46153846153846156
Threshold: 0.38, Row Probability: 2.5, Total Bets: 2, Total Points: -2.0, Correct Prediction Rate: 0.0
Threshold: 0.39, Row Probability: 1.8, Total Bets: 57, Total Points: -9.34, Correct Prediction Rate: 0.40350877192982454
Threshold: 0.39, Row Probability: 1.9, Total Bets: 48, Total Points: -0.5800000000000005, Correct Prediction Rate: 0.4583333333333333
Threshold: 0.39, Row Probability: 2.0, Total Bets: 43, Total Points: 0.8699999999999983, Correct Prediction Rate: 0.46511627906976744
Threshold: 0.39, Row Probability: 2.1, Total Bets: 30, Total Points: -2.1800000000000006, Correct Prediction Rate: 0.4
Threshold: 0.39, Row Probability: 2.2, Total Bets: 25, Total Points: 3.1399999999999997, Correct Prediction Rate: 0.48
Threshold: 0.39, Row Probability: 2.3, Total Bets: 18, Total Points: 3.75, Correct Prediction Rate: 0.5
Threshold: 0.39, Row Probability: 2.4, Total Bets: 7, Total Points: 3.0, Correct Prediction Rate: 0.5714285714285714
Threshold: 0.39, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.4, Row Probability: 1.8, Total Bets: 52, Total Points: -9.41, Correct Prediction Rate: 0.40384615384615385
Threshold: 0.4, Row Probability: 1.9, Total Bets: 43, Total Points: -0.9000000000000006, Correct Prediction Rate: 0.46511627906976744
Threshold: 0.4, Row Probability: 2.0, Total Bets: 36, Total Points: -1.4000000000000008, Correct Prediction Rate: 0.4444444444444444
Threshold: 0.4, Row Probability: 2.1, Total Bets: 24, Total Points: -3.5500000000000003, Correct Prediction Rate: 0.375
Threshold: 0.4, Row Probability: 2.2, Total Bets: 17, Total Points: 1.5199999999999987, Correct Prediction Rate: 0.47058823529411764
Threshold: 0.4, Row Probability: 2.3, Total Bets: 11, Total Points: -1.4899999999999998, Correct Prediction Rate: 0.36363636363636365
Threshold: 0.4, Row Probability: 2.4, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.4, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.41, Row Probability: 1.8, Total Bets: 51, Total Points: -10.56, Correct Prediction Rate: 0.39215686274509803
Threshold: 0.41, Row Probability: 1.9, Total Bets: 40, Total Points: -4.249999999999999, Correct Prediction Rate: 0.425
Threshold: 0.41, Row Probability: 2.0, Total Bets: 34, Total Points: -3.750000000000001, Correct Prediction Rate: 0.4117647058823529
Threshold: 0.41, Row Probability: 2.1, Total Bets: 21, Total Points: -2.85, Correct Prediction Rate: 0.38095238095238093
Threshold: 0.41, Row Probability: 2.2, Total Bets: 14, Total Points: -0.15000000000000124, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.41, Row Probability: 2.3, Total Bets: 8, Total Points: -3.23, Correct Prediction Rate: 0.25
Threshold: 0.41, Row Probability: 2.4, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.41, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.42, Row Probability: 1.8, Total Bets: 49, Total Points: -11.06, Correct Prediction Rate: 0.3877551020408163
Threshold: 0.42, Row Probability: 1.9, Total Bets: 35, Total Points: -5.550000000000001, Correct Prediction Rate: 0.4
Threshold: 0.42, Row Probability: 2.0, Total Bets: 30, Total Points: -4.1, Correct Prediction Rate: 0.4
Threshold: 0.42, Row Probability: 2.1, Total Bets: 18, Total Points: -4.55, Correct Prediction Rate: 0.3333333333333333
Threshold: 0.42, Row Probability: 2.2, Total Bets: 11, Total Points: -1.8500000000000005, Correct Prediction Rate: 0.36363636363636365
Threshold: 0.42, Row Probability: 2.3, Total Bets: 1, Total Points: -1.0, Correct Prediction Rate: 0.0
Threshold: 0.42, Row Probability: 2.4, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.42, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.43, Row Probability: 1.8, Total Bets: 44, Total Points: -12.73, Correct Prediction Rate: 0.36363636363636365
Threshold: 0.43, Row Probability: 1.9, Total Bets: 31, Total Points: -8.149999999999999, Correct Prediction Rate: 0.3548387096774194
Threshold: 0.43, Row Probability: 2.0, Total Bets: 26, Total Points: -6.7, Correct Prediction Rate: 0.34615384615384615
Threshold: 0.43, Row Probability: 2.1, Total Bets: 13, Total Points: -4.05, Correct Prediction Rate: 0.3076923076923077
Threshold: 0.43, Row Probability: 2.2, Total Bets: 7, Total Points: -0.15000000000000036, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.43, Row Probability: 2.3, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.43, Row Probability: 2.4, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.43, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.44, Row Probability: 1.8, Total Bets: 41, Total Points: -11.729999999999999, Correct Prediction Rate: 0.36585365853658536
Threshold: 0.44, Row Probability: 1.9, Total Bets: 26, Total Points: -7.449999999999998, Correct Prediction Rate: 0.34615384615384615
Threshold: 0.44, Row Probability: 2.0, Total Bets: 21, Total Points: -2.0, Correct Prediction Rate: 0.42857142857142855
Threshold: 0.44, Row Probability: 2.1, Total Bets: 8, Total Points: -1.3999999999999995, Correct Prediction Rate: 0.375
Threshold: 0.44, Row Probability: 2.2, Total Bets: 1, Total Points: -1.0, Correct Prediction Rate: 0.0
Threshold: 0.44, Row Probability: 2.3, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.44, Row Probability: 2.4, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.44, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.45, Row Probability: 1.8, Total Bets: 35, Total Points: -11.629999999999999, Correct Prediction Rate: 0.34285714285714286
Threshold: 0.45, Row Probability: 1.9, Total Bets: 23, Total Points: -6.8500000000000005, Correct Prediction Rate: 0.34782608695652173
Threshold: 0.45, Row Probability: 2.0, Total Bets: 16, Total Points: -5.65, Correct Prediction Rate: 0.3125
Threshold: 0.45, Row Probability: 2.1, Total Bets: 1, Total Points: -1.0, Correct Prediction Rate: 0.0
Threshold: 0.45, Row Probability: 2.2, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.45, Row Probability: 2.3, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.45, Row Probability: 2.4, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Threshold: 0.45, Row Probability: 2.5, Total Bets: 0, Total Points: 0.0, Correct Prediction Rate: 0
Out[36]:
Threshold Row Probability Total Bets Total Points Correct Prediction Rate
21 0.35 2.3 31 5.41 0.483871
37 0.37 2.3 27 4.67 0.481481
20 0.35 2.2 38 4.61 0.473684
44 0.38 2.2 31 4.42 0.483871
13 0.34 2.3 32 4.41 0.468750
... ... ... ... ... ...
64 0.41 1.8 51 -10.56 0.392157
72 0.42 1.8 49 -11.06 0.387755
96 0.45 1.8 35 -11.63 0.342857
88 0.44 1.8 41 -11.73 0.365854
80 0.43 1.8 44 -12.73 0.363636

104 rows × 5 columns

In [37]:
results_df['Profit Margin'] = ((results_df['Total Bets'] + results_df['Total Points']) / results_df['Total Bets']) - 1
results_df = results_df.sort_values(by='Profit Margin', ascending=False)  # Yeni kolona göre büyükten küçüğe sırala

results_df[results_df['Total Bets'] >= 10].head(20)

results_df.head(20)
Out[37]:
Threshold Row Probability Total Bets Total Points Correct Prediction Rate Profit Margin
54 0.39 2.4 7 3.00 0.571429 0.428571
53 0.39 2.3 18 3.75 0.500000 0.208333
38 0.37 2.4 15 2.62 0.466667 0.174667
21 0.35 2.3 31 5.41 0.483871 0.174516
37 0.37 2.3 27 4.67 0.481481 0.172963
45 0.38 2.3 23 3.65 0.478261 0.158696
46 0.38 2.4 13 2.00 0.461538 0.153846
44 0.38 2.2 31 4.42 0.483871 0.142581
13 0.34 2.3 32 4.41 0.468750 0.137813
5 0.33 2.3 32 4.41 0.468750 0.137813
29 0.36 2.3 28 3.54 0.464286 0.126429
52 0.39 2.2 25 3.14 0.480000 0.125600
20 0.35 2.2 38 4.61 0.473684 0.121316
36 0.37 2.2 34 3.94 0.470588 0.115882
6 0.33 2.4 21 1.99 0.428571 0.094762
14 0.34 2.4 21 1.99 0.428571 0.094762
60 0.40 2.2 17 1.52 0.470588 0.089412
28 0.36 2.2 35 2.81 0.457143 0.080286
12 0.34 2.2 38 2.36 0.447368 0.062105
4 0.33 2.2 38 2.36 0.447368 0.062105
In [38]:
# Total Bets'i 0 olmayanları filtrele
filtered_results_df = results_df[results_df['Total Bets'] > 5]

# Row Probability'ye göre gruplama ve ortalamaları ile standart sapmaları hesaplama
grouped_results = filtered_results_df.groupby('Row Probability').agg({
    'Profit Margin': ['mean', 'std'],
    'Correct Prediction Rate': ['mean', 'std']
}).reset_index()

# Kolon adlarını düzelterek daha okunabilir hale getirme
grouped_results.columns = ['Row Probability', 'Profit Margin Mean', 'Profit Margin Std', 
                           'Correct Prediction Rate Mean', 'Correct Prediction Rate Std']

# Sonuçları bir tablo olarak gösterme
grouped_results
Out[38]:
Row Probability Profit Margin Mean Profit Margin Std Correct Prediction Rate Mean Correct Prediction Rate Std
0 1.8 -0.172959 0.089427 0.399342 0.027794
1 1.9 -0.104736 0.110934 0.415815 0.042188
2 2.0 -0.092281 0.106303 0.413578 0.041957
3 2.1 -0.118937 0.090913 0.382241 0.032721
4 2.2 0.054451 0.090912 0.450126 0.034493
5 2.3 0.064151 0.202254 0.439893 0.081305
6 2.4 0.148861 0.132601 0.455656 0.055000
7 2.5 -0.389067 0.079813 0.230000 0.027386

İlk bakışta 2.4 doğru tahmin oranı ve profit margin'ine göre seçilebilecek en iyi değer olsa bile standart sapması çok yüksek değere sahiptir ve bu riskli bir seçim doğurabilir. Doğru tahmin oranı ve kar marjını dengeleyebilmek için 2.2 aralarındaki en iyi seçimdir.

In [39]:
# Total Bets'i 0 olmayanları filtrele
filtered_results_df = results_df[results_df['Total Bets'] > 5]

# Threshold'a göre gruplama ve ortalamaları ile standart sapmaları hesaplama
grouped_results = filtered_results_df.groupby('Threshold').agg({
    'Profit Margin': ['mean', 'std'],
    'Correct Prediction Rate': ['mean', 'std']
}).reset_index()

# Kolon adlarını düzelterek daha okunabilir hale getirme
grouped_results.columns = ['Threshold', 'Profit Margin Mean', 'Profit Margin Std', 
                           'Correct Prediction Rate Mean', 'Correct Prediction Rate Std']

# Sonuçları bir tablo olarak gösterme
grouped_results
Out[39]:
Threshold Profit Margin Mean Profit Margin Std Correct Prediction Rate Mean Correct Prediction Rate Std
0 0.33 -0.038591 0.144250 0.406232 0.066775
1 0.34 -0.038591 0.144250 0.406232 0.066775
2 0.35 -0.020345 0.198012 0.417412 0.090361
3 0.36 -0.070554 0.184837 0.396871 0.083205
4 0.37 -0.016669 0.172334 0.417782 0.073547
5 0.38 0.042769 0.109797 0.451482 0.029327
6 0.39 0.076304 0.197814 0.468341 0.058807
7 0.40 -0.072457 0.101409 0.420439 0.046160
8 0.41 -0.162297 0.134039 0.381408 0.066978
9 0.42 -0.188382 0.048764 0.376945 0.028544
10 0.43 -0.228576 0.117801 0.360179 0.043788
11 0.44 -0.210719 0.093170 0.378895 0.035237
12 0.45 -0.327746 0.027928 0.334394 0.019123

Bu tabloya baktığımızda da 0.39 ve 0.38'in pozitif kar marjı ve çok daha yüksek doğruluk oranları nedeniyle seçilmesi doğru olacaktır. Ancak bu değerler 2.4'ün en iyi olduğu durum için geçerlidir. Row probability ve threshold arasında hesaplama nedeniyle ters bir orantı mevcuttur. Bu yüzden biz 2.2 row probability seçeceğimiz için 0.39'dan daha büyük bir değer seçmek daha doğru olacaktır:

2.4 --- 0.39

2.2 --- ≈0.42

In [40]:
# İddia oynama ve puan hesaplama
data['points'] = 0.0  # Puanları tutmak için yeni bir sütun ekleyelim, float olarak başlatıyoruz
data['bet_made'] = False  # İddia yapılıp yapılmadığını kontrol etmek için

# Kazanma olasılığı için eşik değerleri ve row'lar için olasılık aralıkları
#thresholds_test = [round(0.35+ i * 0.01, 2) for i in range(11)]  
#row_probabilities_test = [round(2 + i * 0.1, 1) for i in range(6)]
### train setine göre en iyi threshold ve row probability 0.40 ve 2.0
thresholds_test = [0.39,0.42]
row_probabilities_test = [2.2]

# Fixture ID'leri için bir set oluştur
played_fixtures_test = set()

# İddia oynama ve puan hesaplamalar
total_bets_test = 0  # İddia oynanan maç sayısı
total_points_test = 0.0  # Toplam puan, float olarak başlatıyoruz
correct_predictions_test = 0  # Doğru tahmin sayısı

# Fixture ID'lerine göre toplam maç sayısını belirle
total_matches_test = data['fixture_id'].nunique()  # Toplam fixture_id sayısını al

# DataFrame'i current_time'a göre sıralama
data = data.sort_values(by='current_time')

# Sonuçları saklamak için bir liste
results_test = []
bet_made_true_rows_test = []  # Bet yapılmış satırları saklamak için yeni bir liste

for threshold in thresholds_test:
    for row_prob in row_probabilities_test:
        # Her kombinasyon için toplam puanları sıfırla
        total_points_combination_test = 0.0
        total_bets_combination_test = 0  # İddia oynanan maç sayısını sıfırla
        correct_predictions_combination_test = 0  # Doğru tahmin sayısını sıfırla
        
        # Fixture ID'lerine göre gruplama ve sıralama
        for fixture_id, fixture_data in data.groupby('fixture_id'):
            fixture_data = fixture_data.sort_values(by='current_time')  # Her fixture_id için current_time'a göre sıralama
            
            # İddia oynama kararı için bir bayrak
            bet_made_for_fixture_test = False
            
            for index, row in fixture_data.iterrows():
                if (not row['bet_made'] and 
                    row['current_time'] >= pd.Timestamp(test_start_date) and 
                    row['current_time'] <= pd.Timestamp.now() and 
                    not bet_made_for_fixture_test):  # İddia yapılmamış ve geçerli zaman, sadece bir iddia oynanacak
                
                    # Kazanma olasılıklarını kontrol et
                    win_prob_1 = row['win_probability_1']
                    win_prob_2 = row['win_probability_2']
                    win_prob_X = row['win_probability_X']
                    
                    # İddia oynama kararı
                    prediction = None
                    max_prob = max(win_prob_1, win_prob_2, win_prob_X)  # En yüksek olasılığı bul
                    if win_prob_1 == max_prob and win_prob_1 > threshold and row['1'] > row_prob and win_prob_1 <= (1 / row['1']):
                        prediction = 1  # Ev sahibi kazanır
                    elif win_prob_2 == max_prob and win_prob_2 > threshold and row['2'] > row_prob and win_prob_2 <= (1 / row['2']):
                        prediction = 2  # Deplasman
                    elif win_prob_X == max_prob and win_prob_X > threshold and row['X'] > row_prob and win_prob_X <= (1 / row['X']):
                        prediction = 0  # Beraberlik
                    
                    if prediction is not None:
                        # Bet yapılacak satırı kaydet
                        bet_made_true_rows_test.append(row)  # Bet yapılacak satırı ekle
                        # İddia yapıldı
                        data.at[index, 'bet_made'] = True  # Değişiklik burada yapılıyor
                        played_fixtures_test.add(row['fixture_id'])  # Fixture ID'yi ekle
                        total_bets_combination_test += 1  # İddia oynanan maç sayısını artır
                        bet_made_for_fixture_test = True  # Bu fixture için iddia yapıldı
                        
                        # Gerçek sonuç ile tahmini karşılaştır
                        if row['result'] == prediction:
                            # Kazanma durumu, puanı doğrudan ilgili kolondan al
                            if prediction == 0:
                                data.at[index, 'points'] += row['X'] - 1  # Beraberlik
                            elif prediction == 1:
                                data.at[index, 'points'] += row['1'] - 1  # Ev
                            elif prediction == 2:
                                data.at[index, 'points'] += row['2'] - 1  # Deplasman kazanır
                            total_points_combination_test += data.at[index, 'points']  # Toplam puanı güncelle
                            correct_predictions_combination_test += 1  # Doğru tahmin sayısını artır
                        else:
                            # Kaybetme durumu
                            data.at[index, 'points'] -= 1
                            total_points_combination_test += data.at[index, 'points']  # Toplam puanı güncelle

        # Her kombinasyon için sonuçları sakla
        results_test.append({
            'Threshold': threshold,
            'Row Probability': row_prob,
            'Total Bets': total_bets_combination_test,
            'Total Points': total_points_combination_test,
            'Correct Prediction Rate': correct_predictions_combination_test / total_bets_combination_test if total_bets_combination_test > 0 else 0
        })

        # Her kombinasyon için bet_made'leri false'a çek ve points'i sıfırla
        data['bet_made'] = False
        data['points'] = 0.0  # points'i sıfırla, float olarak

        # Her kombinasyon için sonuçları print et
        print(f"Threshold: {threshold}, Row Probability: {row_prob}, Total Bets: {total_bets_combination_test}, Total Points: {total_points_combination_test}, Correct Prediction Rate: {correct_predictions_combination_test / total_bets_combination_test if total_bets_combination_test > 0 else 0}")

# DataFrame olarak sonuçları çıkar ve toplam puana göre sırala
results_df_test = pd.DataFrame(results_test)
results_df_test = results_df_test.sort_values(by='Total Points', ascending=False)  # Total Points'e göre büyükten küçüğe sırala
results_df_test
Threshold: 0.39, Row Probability: 2.2, Total Bets: 42, Total Points: 2.149999999999999, Correct Prediction Rate: 0.4523809523809524
Threshold: 0.42, Row Probability: 2.2, Total Bets: 20, Total Points: 2.7699999999999996, Correct Prediction Rate: 0.5
Out[40]:
Threshold Row Probability Total Bets Total Points Correct Prediction Rate
1 0.42 2.2 20 2.77 0.500000
0 0.39 2.2 42 2.15 0.452381
In [41]:
results_df_test['Profit Margin'] = ((results_df_test['Total Bets'] + results_df_test['Total Points']) / results_df_test['Total Bets']) - 1
results_df_test = results_df_test.sort_values(by='Profit Margin', ascending=False)  # Yeni kolona göre büyükten küçüğe sırala

results_df_test.head(20)
Out[41]:
Threshold Row Probability Total Bets Total Points Correct Prediction Rate Profit Margin
1 0.42 2.2 20 2.77 0.500000 0.13850
0 0.39 2.2 42 2.15 0.452381 0.05119