%%capture
!pip install lightgbm

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error
from sklearn.inspection import permutation_importance
import scipy.stats as stats

# Load the consolidated PEP dataset from the local CSV export.
csv_path = r"C:\\Users\\Abraham\\OneDrive\\Desktop\\NO ENTRAR - Abraham - Personal\\PERSONAL - NO ENTRAR\\Proyecto SP500\\PEPConsolidated_202502.csv"
df = pd.read_csv(csv_path)

# Parse the "Date" strings (month/day/year) into real timestamps.
df["Date"] = pd.to_datetime(df["Date"], format='%m/%d/%Y')

# Put the rows in chronological order and renumber them from zero.
df = df.sort_values('Date').reset_index(drop=True)

%%capture

# Monthly frame: keep only the rows that carry a monthly close. The explicit
# copy makes df_M independent of df, so the column assignments below never
# act on (or warn about) a slice of df.
df_M = df.dropna(subset=['Close_M']).reset_index(drop=True).copy()

# MACD line = EMA(12) - EMA(26) of the monthly close; signal = EMA(9) of MACD.
ema_12_M = df_M['Close_M'].ewm(span=12, adjust=False).mean()
ema_26_M = df_M['Close_M'].ewm(span=26, adjust=False).mean()
macd_line_M = ema_12_M - ema_26_M
signal_line_M = macd_line_M.ewm(span=9, adjust=False).mean()

# Only these flags and the trend signal are kept on df_M; the intermediate
# MACD series stay in local variables instead of temporary columns.
df_M['MACD Above Zero_M'] = np.where(macd_line_M > 0, 1, 0)
df_M['MACD Below Zero_M'] = np.where(macd_line_M < 0, 1, 0)

prev_macd_M = macd_line_M.shift(1)
prev_signal_M = signal_line_M.shift(1)
conditions_M = [
    # Bullish cross: MACD moves above the signal line on this row.
    (macd_line_M > signal_line_M) & (prev_macd_M <= prev_signal_M),
    # Bearish cross: MACD moves below the signal line on this row.
    (macd_line_M < signal_line_M) & (prev_macd_M >= prev_signal_M),
]
# Integer choices share a dtype with the integer default; mixing the strings
# '1'/'-1' with default=0 has no common dtype and fails on recent NumPy.
choices_M = [1, -1]
df_M['Trend Signal_M'] = np.select(conditions_M, choices_M, default=0)





#RSI_M
def calculate_rsi(df_M, window=14, price_col='Close_M', out_col='RSI_M'):
    """Add a simple-moving-average RSI column to ``df_M`` and return it.

    The frame is modified in place (the new column is written onto the
    object that was passed in) and the same object is returned.

    Parameters
    ----------
    df_M : pandas.DataFrame
        Frame containing the price column, one row per period.
    window : int, default 14
        Number of periods in the rolling average of gains and losses.
        Shorter leading windows are still averaged (``min_periods=1``).
    price_col : str, default 'Close_M'
        Column the period-over-period changes are taken from.
    out_col : str, default 'RSI_M'
        Name of the column that receives the RSI values.

    Returns
    -------
    pandas.DataFrame
        ``df_M`` itself, with ``out_col`` added or overwritten.

    Notes
    -----
    The RSI is NaN where both the average gain and the average loss are zero
    (always the first row, and any fully flat window), and 100 where only the
    average loss is zero.
    """
    # Period-over-period change of the price; the first row has no
    # predecessor, so its change is NaN and is counted as 0 below.
    delta = df_M[price_col].diff()
    gains = delta.clip(lower=0).fillna(0)
    losses = (-delta).clip(lower=0).fillna(0)

    avg_gain = gains.rolling(window=window, min_periods=1).mean()
    avg_loss = losses.rolling(window=window, min_periods=1).mean()

    # Relative strength; a zero average loss gives inf (RSI 100) or NaN (0/0).
    rs = avg_gain / avg_loss
    df_M[out_col] = 100 - (100 / (1 + rs))
    return df_M
df_M = calculate_rsi(df_M)

# Lagged percentage changes for the monthly (_M) and quarterly (_Q) series.
# Monthly changes are lagged one row and quarterly changes three rows, so a
# row only carries changes from earlier rows.
for field in ('High', 'Close', 'Low', 'Volume'):
    for suffix, lag in (('M', 1), ('Q', 3)):
        source = f'{field}_{suffix}'
        # Forward-fill explicitly and disable the implicit fill: this is what
        # pct_change() did by default, without relying on the deprecated
        # default fill_method.
        pct = df_M[source].ffill().pct_change(fill_method=None) * 100
        pct = pct.shift(lag)
        if suffix == 'Q':
            # A 0% change is blanked so the forward-fill that follows this
            # section can carry the last non-zero change instead.
            # NOTE(review): a genuine 0% quarterly change is blanked as well.
            pct = pct.replace(0, np.nan)
        # Assign the result back instead of calling replace(inplace=True) on
        # df_M[col]: the chained in-place form does not update df_M when
        # pandas Copy-on-Write is active.
        df_M[f'{source}%'] = pct

# Lag the indicator columns by one row as well.
for indicator in ('MACD Above Zero_M', 'Trend Signal_M', 'RSI_M'):
    df_M[indicator] = df_M[indicator].shift(1)


# Carry the last known value forward over the gaps left by the lags above.
# .ffill() replaces fillna(method='ffill'), which is deprecated in pandas.
df_M = df_M.ffill()

# Attach the monthly features to the full frame by date. A column-based merge
# returns a fresh 0..n-1 index, so no extra reset_index is needed around it.
monthly_features = ['Date', 'MACD Above Zero_M', 'Trend Signal_M', 'RSI_M',
                    'High_M%', 'High_Q%', 'Close_M%', 'Close_Q%',
                    'Low_M%', 'Low_Q%', 'Volume_M%', 'Volume_Q%']
df = pd.merge(df, df_M[monthly_features], on='Date', how='left')

# Rows without a monthly match get a neutral trend signal (0). Assigning the
# result back works with and without Copy-on-Write, unlike an in-place
# replace on df['Trend Signal_M'].
df['Trend Signal_M'] = df['Trend Signal_M'].fillna(0)

# Forward-fill every column except High_D, then keep only the rows that have
# a High_D value of their own.
columna_excluida = df['High_D']
df = df.drop(columns=['High_D']).ffill()
df['High_D'] = columna_excluida
df = df.dropna(subset=['High_D'])

# Daily features: the percentage change of each daily series, lagged one row.
daily_fields = ['High_D', 'Close_D', 'Low_D', 'Volume_D']
for field in daily_fields:
    daily_pct = df[field].pct_change() * 100
    df[field + '%'] = daily_pct.shift(1)

# The raw daily levels are lagged one row too (done after the percentage
# changes, which need the unshifted values).
for field in daily_fields:
    df[field] = df[field].shift(1)


# Date window used for modelling (both ends inclusive).
Start_date = '2000-06-30'
End_date = '2025-02-09'
df = df[df['Date'].between(Start_date, End_date)].reset_index(drop=True)

# Walk the rows once, tracking the highest Close_D seen so far and the date
# on which it was first reached.
max_close_so_far = df['Close_D'].iloc[0]
last_max_close_date = df['Date'].iloc[0]
days_since_peak = []
drawdown_from_peak = []
for day, close, low in zip(df['Date'], df['Close_D'].to_numpy(), df['Low_D'].to_numpy()):
    if close > max_close_so_far:
        max_close_so_far = close
        last_max_close_date = day
    # NOTE: despite the column name this is a number of days, not months.
    days_since_peak.append(float((day - last_max_close_date).days))
    drawdown_from_peak.append(1 - (low / max_close_so_far))

df['Months Since Last Max Close'] = days_since_peak
df['Current Drawdown % from Past Close'] = drawdown_from_peak




# Target variable: the drawdown from this row's Close_D down to the lowest
# Low_D found on any later row, expressed as a fraction (1 - min_low / close).
# Assumes df has a 0..n-1 index at this point.
#
# A reversed cumulative minimum gives min(Low_D[i:]) for every row in a
# single pass, instead of re-scanning the tail of the frame once per row.
# cummin leaves NaN where Low_D itself is NaN, so those positions inherit the
# minimum of the later rows (ffill in reversed order), matching the
# NaN-skipping behaviour of Series.min().
min_low_from_here = df['Low_D'][::-1].cummin().ffill()[::-1]

# Shift up one row so each row only sees strictly later lows; the last row
# has no later row and keeps a NaN target.
future_min_low = min_low_from_here.shift(-1)
df['Future Drawdown % from Close'] = 1 - (future_min_low / df['Close_D'])
# NOTE(review): the minimum runs to the end of the frame, so the targets of
# early rows depend on prices from the most recent dates -- confirm this is
# intended before splitting the data by date.

# Price divided by normalized diluted EPS.
df['Price_To_Equity_Daily'] = df['Close_D']/df['normalized_epsdiluted']

# Keep only the features judged relevant in the exploratory analysis, plus
# the date and the target. The order of this list is the column order of the
# modelling frame.
selected_columns = [
    "Date",
    "Revenue_Growth%_12M",
    "Net_Income_Growth%_3M",
    "Net_Income_Growth%_6M",
    "Net_Income_Growth%_12M",
    "grossProfitRatio",
    "debt_to_equity",
    "return_on_equity",
    "Current_Ratio",
    "Cash_To_Debt_Ratio",
    "Unemployment_Rate",
    "CAPE",
    "Price_To_Equity_Daily",
    "Months Since Last Max Close",
    "Current Drawdown % from Past Close",
    "Future Drawdown % from Close",
    "Operating_Income_Ratio",
    "RSI_M",
    "Close_M%",
    "Revenue_Growth%_6M",
]

# Restrict the frame to those columns.
df = df.loc[:, selected_columns]

%%capture
# Chronological train/test split (not random): the model is evaluated on the
# most recent data.

# Rows without a target (at least the last row, which has no later prices)
# are removed first; a blanket NaN -> 0 would give them a made-up target of 0
# that then counts towards the test error.
df = df.dropna(subset=['Future Drawdown % from Close'])
# Remaining gaps in the features are filled with 0.
df = df.fillna(0)

# Everything before 2021 is training data, the rest is test data. Building
# each frame in one chained expression avoids in-place edits on slices of df.
train_df = df[df['Date'] < '2021-01-01'].drop(columns=['Date']).reset_index(drop=True)
test_df = df[df['Date'] >= '2021-01-01'].drop(columns=['Date']).reset_index(drop=True)

# Show the test data to check its last values against the source.
test_df

# Exploratory plots: one subplot per column of train_df, drawn in batches.
batch_size = 20


def _plot_columns_in_batches(frame, draw_column, batch_size=20):
    """Draw one subplot per column of ``frame``, ``batch_size`` per figure.

    ``draw_column(col, ax)`` is called once per column with the column name
    and the axes to draw on; each figure is laid out, shown and closed.
    """
    column_names = frame.columns
    for start in range(0, len(column_names), batch_size):
        chunk = column_names[start:start + batch_size]
        # squeeze=False always returns a 2-D axes array, so a one-column
        # batch needs no special case.
        fig, axes = plt.subplots(nrows=len(chunk), ncols=1,
                                 figsize=(12, len(chunk) * 3), squeeze=False)
        for ax, col in zip(axes[:, 0], chunk):
            draw_column(col, ax)
        plt.tight_layout()
        plt.show()
        plt.close()


def _draw_boxplot(col, ax):
    """Box plot of one training column."""
    train_df[col].plot(kind='box', ax=ax)
    ax.set_title(col)


def _draw_histogram(col, ax):
    """30-bin histogram of one training column."""
    ax.hist(train_df[col], bins=30, color='skyblue', edgecolor='black')
    ax.set_title(col)


def _draw_scatter_vs_target(col, ax):
    """Scatter of one training column against the target."""
    train_df.plot(kind='scatter', x=col, y='Future Drawdown % from Close', ax=ax)


# Box plots
_plot_columns_in_batches(train_df, _draw_boxplot, batch_size)

# Histograms
_plot_columns_in_batches(train_df, _draw_histogram, batch_size)

# Scatter plots against the target
_plot_columns_in_batches(train_df, _draw_scatter_vs_target, batch_size)

# Split the training frame into the feature matrix and the target vector.
X = train_df.drop(columns=['Future Drawdown % from Close'])
y = train_df['Future Drawdown % from Close']

# Min-max normalisation of the features (needed by the distance-based KNN).
# MinMaxScaler is used instead of (X - min) / (max - min) because the manual
# formula divides by zero and yields NaN for any constant column, whereas the
# scaler maps such a column to 0.
X_Normalized = pd.DataFrame(
    MinMaxScaler().fit_transform(X),
    columns=X.columns,
    index=X.index,
)
X_Normalized

# Test features / target, split the same way as the training frame.
y_test = test_df['Future Drawdown % from Close']
X_test = test_df.drop(columns=['Future Drawdown % from Close'])

# Learn the min-max ranges on the training features only, then apply the
# same transformation to both sets.
scaler = MinMaxScaler().fit(X)
X_normalized = scaler.transform(X)
X_test_normalized = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames with the original column names.
X_normalized_df = pd.DataFrame(data=X_normalized, columns=X.columns)
X_test_normalized_df = pd.DataFrame(data=X_test_normalized, columns=X_test.columns)

#Usar GridSearch para encontrar la cantidad óptima de vecinos para KNN
"""
# Machine Learning Models
#Model 3: K-Nearest-Neighborhood

#Cross Validation to select the quantity of neighbors
# Define the parameter grid
param_grid = {'n_neighbors': list(range(1, 1000))}

# Create a KNeighborsClassifier instance
knn = KNeighborsRegressor()

# Create a GridSearchCV instance
grid_search = GridSearchCV(knn, param_grid, cv=5, verbose=2, n_jobs=8)

# Fit the grid search to the data
grid_search.fit(X_Normalized, y)

# Best parameters foundIn cross-validation, especially with k-NN, it's also common to vary the number of neighbors (the k value) to find the best model. You can do this using a loop or more systematically using GridSearchCV or RandomizedSearchCV, which not only tries different values of k but also performs cross-validation at the same time.

#Using GridSearchCV to find the best k
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")
"""

Fitting 5 folds for each of 999 candidates, totalling 4995 fits
Best parameters: {'n_neighbors': 395}
Best cross-validation score: -0.3178203738438885

# KNN regressor with the neighbour count found by the grid search (395).
best_k = 395
knn = KNeighborsRegressor(n_neighbors=best_k)

# Fit on the normalised training features.
knn.fit(X=X_Normalized, y=y)

KNeighborsRegressor(n_neighbors=395)

KNeighborsRegressor(n_neighbors=395)

# 5-fold cross-validation of the KNN model.
# All three metrics are computed on the same matrix (X_Normalized, the one
# the model was fitted on) rather than mixing X_Normalized and X_normalized.
# scikit-learn returns these scores negated ("neg_*"), so the sign is flipped
# to report real errors, as the LightGBM section of this notebook does.
mae_scores = -cross_val_score(knn, X_Normalized, y, cv=5, scoring='neg_mean_absolute_error')
rmse_scores = -cross_val_score(knn, X_Normalized, y, cv=5, scoring='neg_root_mean_squared_error')
mse_scores = -cross_val_score(knn, X_Normalized, y, cv=5, scoring='neg_mean_squared_error')

print(f"CV MAE Scores: {mae_scores}")
print(f"Average CV MAE Score: {mae_scores.mean()}")

print(f"CV RMSE Scores: {rmse_scores}")
print(f"Average CV RMSE Score: {rmse_scores.mean()}")

print(f"CV MSE Scores: {mse_scores}")
print(f"Average CV MSE Score: {mse_scores.mean()}")

CV MAE Scores: [-0.0847544  -0.11860772 -0.05519311 -0.08362805 -0.05659351]
Average CV MAE Score: -0.07975535558916422
CV RMSE Scores: [-0.1036206  -0.1437206  -0.0703934  -0.09633739 -0.06858211]
Average CV RMSE Score: -0.09653082125505856
CV MSE Scores: [-0.01073723 -0.02065561 -0.00495523 -0.00928089 -0.00470351]
Average CV MSE Score: -0.010066493978992942

# Predict on the training features (y_pred) and on the test features (y_pred2).
y_pred, y_pred2 = (
    knn.predict(features) for features in (X_Normalized, X_test_normalized_df)
)

# Evaluate the model on the test set.
mse = mean_squared_error(y_test, y_pred2)
mae = mean_absolute_error(y_test, y_pred2)

for label, value in (('Mean Squared Error', mse), ('Mean Absolute Error', mae)):
    print(f'{label}: {value}')

Mean Squared Error: 0.0077260913890593395
Mean Absolute Error: 0.06975593491304816

# Residuals of the KNN model on the test set.
residuos = y_test - y_pred2
residuos_abs = np.abs(residuos)

# Plot 1: histogram of the residuals.
plt.figure(figsize=(10, 6))
plt.hist(residuos, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histograma de los Residuos')
plt.xlabel('Residuos')
plt.ylabel('Frecuencia')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
plt.close()  # release the figure, like the later plots do

# Plot 2: QQ plot of the residuals against a normal distribution.
plt.figure(figsize=(10, 6))
stats.probplot(residuos, dist="norm", plot=plt)
plt.title('QQ Plot de los Residuos')
plt.grid(True)
plt.show()
plt.close()

# Plot 3: real vs predicted values on the training data.
# The target is a drawdown fraction, not a number of months, so the labels
# say "Drawdown" (matching the LightGBM test plot further down).
plt.figure(figsize=(12, 7))
plt.bar(train_df.index - 0.2, y, width=0.4, label='Drawdown real')
plt.bar(train_df.index + 0.2, y_pred, width=0.4, label='Drawdown predicción')
plt.title('Modelo 1-KNN: Drawdown_train')
plt.xlabel('ID de fila')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()

# Plot 4: real vs predicted values on the test data.
plt.figure(figsize=(12, 7))
plt.bar(test_df.index - 0.2, y_test, width=0.4, label='Drawdown real')
plt.bar(test_df.index + 0.2, y_pred2, width=0.4, label='Drawdown predicción')
plt.title('Modelo 1-KNN: Drawdown_test')
plt.xlabel('ID de fila')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()

# Plot 5: absolute residuals against the predicted values.
plt.figure(figsize=(10, 6))
plt.scatter(y_pred2, residuos_abs, alpha=0.5, color= "purple")
plt.title('Modelo 1-KNN: Valor absoluto de los residuos VS valor predecido')
plt.xlabel('Valor predecido')
plt.ylabel('Valor absoluto de los residuos')
plt.grid(True)
plt.show()
plt.close()

%%capture
#Grid search para encontrar mejores hiperparametros para LGBM
"""
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 10, 20, 30],  
    'num_leaves': [31, 50, 100],  
    'min_child_samples': [20, 50, 100], 
    'learning_rate': [0.01, 0.05, 0.1], 
    'colsample_bytree': [0.6, 0.8, 1.0]  
}

lgbm = LGBMRegressor()

grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5,n_jobs=-1,verbose=2, scoring='neg_mean_absolute_error')

grid_search.fit(X_normalized, y)

print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)
"""

# Hyper-parameters selected by the grid search.
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.05,
    max_depth=20,
    min_child_samples=20,
    n_estimators=100,
    num_leaves=100,
)

# Build the LightGBM regressor with those parameters and train it on the
# scaled training features.
lgbm = LGBMRegressor(**best_params)
lgbm.fit(X=X_normalized, y=y)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2479
[LightGBM] [Info] Number of data points in the train set: 5159, number of used features: 18
[LightGBM] [Info] Start training from score 0.137188

LGBMRegressor(colsample_bytree=0.8, learning_rate=0.05, max_depth=20,
              num_leaves=100)

LGBMRegressor(colsample_bytree=0.8, learning_rate=0.05, max_depth=20,
              num_leaves=100)

# 5-fold cross-validation of the LightGBM model, one run per metric.
cv_scores, cv_scores2, cv_scores3 = (
    cross_val_score(lgbm, X_normalized, y, cv=5, scoring=metric)
    for metric in (
        'neg_mean_absolute_error',
        'neg_root_mean_squared_error',
        'neg_mean_squared_error',
    )
)

# The scorers return negated errors; flip the sign to get the errors.
mae_scores, rmse_scores, mse_scores = -cv_scores, -cv_scores2, -cv_scores3

for label, scores in (('MAE', mae_scores), ('RMSE', rmse_scores), ('MSE', mse_scores)):
    print(f"CV {label} Scores: {scores}")
    print(f"Average CV {label} Score: {scores.mean()}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2145
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.129103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2170
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.111022
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000043 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2148
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.154348
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2152
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.150336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 4128, number of used features: 18
[LightGBM] [Info] Start training from score 0.141130
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2145
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.129103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2170
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.111022
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2148
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.154348
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000143 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2152
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.150336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 4128, number of used features: 18
[LightGBM] [Info] Start training from score 0.141130
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2145
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.129103
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2170
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.111022
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2148
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.154348
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000110 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2152
[LightGBM] [Info] Number of data points in the train set: 4127, number of used features: 18
[LightGBM] [Info] Start training from score 0.150336
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2154
[LightGBM] [Info] Number of data points in the train set: 4128, number of used features: 18
[LightGBM] [Info] Start training from score 0.141130
CV MAE Scores: [0.09306594 0.11124584 0.0735792  0.0576992  0.0393589 ]
Average CV MAE Score: 0.0749898134597873
CV RMSE Scores: [0.11199591 0.13535666 0.09367718 0.06980918 0.0467792 ]
Average CV RMSE Score: 0.09152362492197248
CV MSE Scores: [0.01254308 0.01832143 0.00877541 0.00487332 0.00218829]
Average CV MSE Score: 0.009340307405534164

# Predict on the test set (y_pred) and on the training set (y_pred3).
# The model was fitted on the plain scaled array, so the plain scaled test
# array is used here too, rather than mixing an array at fit time with a
# DataFrame at predict time.
y_pred = lgbm.predict(X_test_normalized)
y_pred3 = lgbm.predict(X_normalized)

# Evaluate the model on the test set.
rmse = root_mean_squared_error(y_test,y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# The first line reports the RMSE; it used to be labelled
# "Mean Squared Error" as well, which made the two lines indistinguishable.
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')

Mean Squared Error: 0.10000052071712097
Mean Squared Error: 0.01000010414369534
Mean Absolute Error: 0.08614471247930844

# Side-by-side table of the real and the predicted test values.
results_test2 = y_test.to_frame(name='Actual').assign(Predicted=y_pred)

results_test2

# Residuals of the LightGBM model on the test set.
residuos = y_test - y_pred
residuos_abs = np.abs(residuos)

# Plot 1: histogram of the residuals.
plt.figure(figsize=(10, 6))
plt.hist(residuos, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histograma de los Residuos')
plt.xlabel('Residuos')
plt.ylabel('Frecuencia')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
plt.close()  # release the figure, like the later plots do

# Plot 2: QQ plot of the residuals against a normal distribution.
plt.figure(figsize=(10, 6))
stats.probplot(residuos, dist="norm", plot=plt)
plt.title('QQ Plot de los Residuos')
plt.grid(True)
plt.show()
plt.close()

# Plot 3: real vs predicted values on the training data.
# The target is a drawdown fraction, not a number of months, so the labels
# match the ones used on the test plot below.
plt.figure(figsize=(12, 7))
plt.bar(train_df.index - 0.2, y, width=0.4, label='Drawdown real')
plt.bar(train_df.index + 0.2, y_pred3, width=0.4, label='Drawdown predicción')
plt.title('Modelo 4-LB: Drawdown_train')
plt.xlabel('ID de fila')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()

# Plot 4: real vs predicted values on the test data.
plt.figure(figsize=(12, 7))
plt.bar(test_df.index - 0.2, y_test, width=0.4, label='Drawdown real')
plt.bar(test_df.index + 0.2, y_pred, width=0.4, label='Drawdown predicción')
plt.title('Modelo 4-LB: Drawdown_Test')
plt.xlabel('ID de fila')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()

# Plot 5: absolute residuals against the predicted values.
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuos_abs, alpha=0.5, color="black")
plt.title('Modelo 4-LightGBM: Residuos')
plt.xlabel('Valor predecido')
plt.ylabel('Valor absoluto de los residuos')
plt.grid(True)
plt.show()
plt.close()

%%capture
# The cell output is hidden with %%capture to protect the model's contents.

# Feature importances reported by the fitted LightGBM model, paired with the
# training feature names.
feature_names = X.columns
feature_importances = lgbm.feature_importances_

# Tabulate them, most important feature first.
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
)
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)

# Horizontal bar chart of the importances.
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis', ax=ax)
ax.set_title('Importancia de las Características - LightGBM')
ax.set_xlabel('Importancia')
ax.set_ylabel('Características')
plt.tight_layout()
plt.show()
plt.close()

%%capture
# The cell output is hidden with %%capture to protect the model's contents.

# Express each importance as a percentage of the total importance.
importance_df['Importance (%)'] = (importance_df['Importance'] / importance_df['Importance'].sum()) * 100
print(importance_df)

# Horizontal bar chart of the percentage importances.
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance (%)', y='Feature', data=importance_df, palette='viridis')
plt.title('Importancia de las Características (Porcentaje) - LightGBM')
plt.xlabel('Importancia (%)')
plt.ylabel('Características')
plt.tight_layout()
# Show and release the figure, as every other plot in this notebook does.
plt.show()
plt.close()

	Revenue_Growth%_12M	Net_Income_Growth%_3M	Net_Income_Growth%_6M	Net_Income_Growth%_12M	grossProfitRatio	debt_to_equity	return_on_equity	Current_Ratio	Cash_To_Debt_Ratio	Unemployment_Rate	CAPE	Price_To_Equity_Daily	Months Since Last Max Close	Current Drawdown % from Past Close	Future Drawdown % from Close	Operating_Income_Ratio	RSI_M	Close_M%	Revenue_Growth%_6M
0	0.052537	0.391859	0.712257	0.090952	0.549168	3.278485	16.851784	0.925064	0.104575	6.7	34.512432	80.112897	0.0	0.011194	0.133041	0.166436	61.514668	2.599870	0.303292
1	0.052537	0.391859	0.712257	0.090952	0.549168	3.278485	16.851784	0.925064	0.104575	6.7	34.512432	77.935865	1.0	0.042886	0.108824	0.166436	61.514668	2.599870	0.303292
2	0.052537	0.391859	0.712257	0.090952	0.549168	3.278485	16.851784	0.925064	0.104575	6.7	34.512432	78.168159	2.0	0.036413	0.111472	0.166436	61.514668	2.599870	0.303292
3	0.052537	0.391859	0.712257	0.090952	0.549168	3.278485	16.851784	0.925064	0.104575	6.7	34.512432	77.211979	3.0	0.045246	0.100469	0.166436	61.514668	2.599870	0.303292
4	0.052537	0.391859	0.712257	0.090952	0.549168	3.278485	16.851784	0.925064	0.104575	6.7	34.512432	76.963473	4.0	0.048213	0.097564	0.166436	61.514668	2.599870	0.303292
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1025	-0.005714	-0.049627	0.434868	-0.052393	0.554183	2.296047	14.945167	0.885766	0.108966	4.0	37.967247	70.746480	630.0	0.192009	0.053023	0.166045	41.708023	0.118635	0.277753
1026	-0.002370	-0.480205	-0.506001	0.169739	0.525590	2.473612	8.381487	0.818937	0.139873	4.0	37.967247	135.378382	631.0	0.199751	0.050376	0.080982	41.708023	0.118635	0.234790
1027	-0.002370	-0.480205	-0.506001	0.169739	0.525590	2.473612	8.381487	0.818937	0.139873	4.0	37.967247	129.270275	632.0	0.231099	0.005506	0.080982	41.708023	0.118635	0.234790
1028	-0.002370	-0.480205	-0.506001	0.169739	0.525590	2.473612	8.381487	0.818937	0.139873	4.0	37.967247	131.225229	633.0	0.232712	0.011671	0.080982	41.708023	0.118635	0.234790
1029	-0.002370	-0.480205	-0.506001	0.169739	0.525590	2.473612	8.381487	0.818937	0.139873	4.0	37.967247	130.945951	634.0	0.225937	0.000000	0.080982	41.708023	0.118635	0.234790

	Actual	Predicted
0	0.133041	0.119034
1	0.108824	0.096256
2	0.111472	0.101791
3	0.100469	0.091216
4	0.097564	0.091023
...	...	...
1025	0.053023	0.077364
1026	0.050376	0.083758
1027	0.005506	0.080581
1028	0.011671	0.080581
1029	0.000000	0.080581

Modelo	MAE CV	RMSE CV	MSE CV	MAE Test	RMSE Test	MSE Test
KNN	0.1689	0.2073	0.0446	0.1222	0.1356	0.0184
Decision Tree	0.1450	0.1829	0.0359	0.1749	0.2249	0.0506
Random Forest	0.1543	0.1767	0.0315	0.1636	0.1736	0.0301
LightGBM	0.1491	0.1711	0.0323	0.1291	0.1408	0.0198

Bolsa 4.0: Predicción de acciones con IA

PEPSI

Tabla de contenidos

Sección 1: Preparación de datos

Sección 2: Procesamiento y Transformación de datos

Sección 3: Análisis exploratorio

Sección 4: Ingeniería de variables

Sección 5: Modelos de Machine Learning

Modelo 1: KNN

Modelo 4: LightGBM

Sección 6: Conclusión