Revolutionizing Investing: S&P 500 Predictions with AI
Data Science Project - Machine Learning
Abraham Cedeño
2024-01-21
Table of Contents
- Section 1: Data Preparation
- Section 2: Data Processing and Transformation
- Section 3: Exploratory Analysis
- Section 4: Feature Engineering
- Section 5: Machine Learning Models
- Section 6: Conclusion
Note to self:
After carefully analyzing the results, I opted for LightGBM even though it does not show the best results on the test data (which carries some bias, since the future drawdown of the most recent samples cannot yet be known), because it is the model that shows the greatest robustness in cross-validation. The optimal strategy is to subtract the average cross-validation MAE from the prediction so as not to be overly conservative, since the model biases its values toward the pessimistic side.
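A minimal sketch of that adjustment, assuming the fitted LGBMRegressor (`lgbm`) and the positive cross-validation MAE scores (`mae_scores`) from the LightGBM cell in Section 5 are available; `X_new` is an illustrative name for the normalized features of the dates to score, not a variable from the notebook:
import numpy as np
# Illustrative only: `X_new` stands for the normalized features of the observations to score.
avg_cv_mae = mae_scores.mean()                    # average cross-validation MAE
raw_pred = lgbm.predict(X_new)                    # raw (pessimistic) drawdown forecast
adjusted_pred = raw_pred - avg_cv_mae             # subtract the average CV MAE
adjusted_pred = np.clip(adjusted_pred, 0, None)   # a drawdown cannot be negative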
Section 1: Data Preparation
%%capture
!pip install lightgbm
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
import scipy.stats as stats
from lightgbm import LGBMRegressor
from sklearn.inspection import permutation_importance
# Import the data
df= pd.read_csv(r"C:\\Users\\Abraham\\OneDrive\\Desktop\\NO ENTRAR - Abraham - Personal\\PERSONAL - NO ENTRAR\\Proyecto SP500\\SP500Consolidated_202502_Daily.csv")
# Convert the "Date" column to datetime
df["Date"] = pd.to_datetime(df["Date"],format='%m/%d/%Y')
# Sort the data by date
df.sort_values('Date',inplace=True)
df.reset_index(drop=True,inplace=True)
# Drop some columns
df.drop(['Primary_Energy_Consumption_Change_A','Primary_Energy_Consumption_Change_Q','Year','Month','Day','High_M_NASDAQ','Close_M_NASDAQ','Low_M_NASDAQ'],axis=1,inplace=True)
Section 2: Data Processing and Transformation
%%capture
# Create a monthly data frame
df_M = df.dropna(subset=['Close_M'])
df_M.reset_index(drop=True, inplace=True)
def calculate_rsi(df_M, window=14):
    delta = df_M['Close_M'].diff()
    gains = (delta.where(delta > 0, 0)).fillna(0)
    losses = (-delta.where(delta < 0, 0)).fillna(0)
    avg_gain = gains.rolling(window=window, min_periods=1).mean()
    avg_loss = losses.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    df_M['RSI_M'] = rsi
    return df_M
df_M = calculate_rsi(df_M)
df_M['High_M%'] = df_M['High_M'].pct_change()*100
df_M['High_Q%'] = df_M['High_Q'].pct_change()*100
df_M['Close_M%'] = df_M['Close_M'].pct_change()*100
df_M['Close_Q%'] = df_M['Close_Q'].pct_change()*100
df_M['Low_M%'] = df_M['Low_M'].pct_change()*100
df_M['Low_Q%'] = df_M['Low_Q'].pct_change()*100
df_M['Volume_M%'] = df_M['Volume_M'].pct_change()*100
df_M['Volume_Q%'] = df_M['Volume_Q'].pct_change()*100
df_M['High_M%'] = df_M['High_M%'].shift(+1)
df_M['High_Q%'] = df_M['High_Q%'].shift(+3)
df_M['Close_M%'] = df_M['Close_M%'].shift(+1)
df_M['Close_Q%'] = df_M['Close_Q%'].shift(+3)
df_M['Low_M%'] = df_M['Low_M%'].shift(+1)
df_M['Low_Q%'] = df_M['Low_Q%'].shift(+3)
df_M['Volume_M%'] = df_M['Volume_M%'].shift(+1)
df_M['Volume_Q%'] = df_M['Volume_Q%'].shift(+3)
df_M['High_Q%'].replace(0, np.nan, inplace=True)
df_M['Close_Q%'].replace(0, np.nan, inplace=True)
df_M['Low_Q%'].replace(0, np.nan, inplace=True)
df_M['Volume_Q%'].replace(0, np.nan, inplace=True)
df_M['RSI_M'] = df_M['RSI_M'].shift(+1)
df_M['Gross_Domestic_Product_Growth_Q'] = df_M['Gross_Domestic_Product_Growth_Q'].shift(+3)
df_M['Unemployment_Rate'] = df_M['Unemployment_Rate'].shift(+1)
df_M['CPI_12'] = df_M['CPI_12'].shift(+1)
df_M['CPI_3'] = df_M['CPI_3'].shift(+1)
df_M['CPI_1'] = df_M['CPI_1'].shift(+1)
df_M['DFF_Absolute'] = df_M['DFF_Absolute'].shift(+1)
df_M['DFF_Relative'] = df_M['DFF_Relative'].shift(+1)
df_M['Exports_Change'] = df_M['Exports_Change'].shift(+3)
df_M['Imports_Change'] = df_M['Imports_Change'].shift(+3)
df_M['Balance_Change'] = df_M['Balance_Change'].shift(+3)
df_M['Consumer_Sentiment'] = df_M['Consumer_Sentiment'].shift(+2)
df_M['Consumer_Sentiment_Growth'] = df_M['Consumer_Sentiment_Growth'].shift(+2)
df_M['Personal_Income_Change_Q'] = df_M['Personal_Income_Change_Q'].shift(+3) # A 3-month shift is okay for training, but it has to change depending on the release schedule when operating live
df_M['Personal_Outlays_Change_Q'] = df_M['Personal_Outlays_Change_Q'].shift(+3)
df_M['Personal_Saving_Change_Q'] = df_M['Personal_Saving_Change_Q'].shift(+3)
df_M['Personal_Income_Change_A'] = df_M['Personal_Income_Change_A'].shift(+3)
df_M['Personal_Outlays_Change_A'] = df_M['Personal_Outlays_Change_A'].shift(+3)
df_M['Personal_Saving_Change_A'] = df_M['Personal_Saving_Change_A'].shift(+3)
df_M['Personal_Income_Expenditure_Change_Q'] = df_M['Personal_Income_Expenditure_Change_Q'].shift(+3)
df_M['Personal_Income_Expenditure_Change_A'] = df_M['Personal_Income_Expenditure_Change_A'].shift(+3)
df_M['Payout_Ratio'] = df_M['Payout_Ratio'].shift(+12)
df_M['Payout_Ratio_Growth'] = df_M['Payout_Ratio_Growth'].shift(+12)
df_M['Earnings_Growth'] = df_M['Earnings_Growth'].shift(+12)
df_M['Earnings_Yield_Growth'] = df_M['Earnings_Yield_Growth'].shift(+12)
df_M['Dividends_Growth'] = df_M['Dividends_Growth'].shift(+12)
df_M['Dividends_Yield_Growth'] = df_M['Dividends_Yield_Growth'].shift(+12)
df_M['GDP_per_Capita_Growth_Q'] = df_M['GDP_per_Capita_Growth_Q'].shift(+3)
df_M['GDP_per_Capita_Growth_A'] = df_M['GDP_per_Capita_Growth_A'].shift(+12)
df_M['GDP_Residential_Growth_Q'] = df_M['GDP_Residential_Growth_Q'].shift(+3)
df_M['GDP_Fixed_Investment_Q'] = df_M['GDP_Fixed_Investment_Q'].shift(+3)
df_M['GDP_Gross_Private_Domestic_Investment_Q'] = df_M['GDP_Gross_Private_Domestic_Investment_Q'].shift(+3)
df_M['GDP_Government_Consumption_Expenditure_And_Gross_Investment_Q'] = df_M['GDP_Government_Consumption_Expenditure_And_Gross_Investment_Q'].shift(+3)
df_M['GDP_Residential_Growth_A'] = df_M['GDP_Residential_Growth_A'].shift(+12)
df_M['GDP_Fixed_Investment_A'] = df_M['GDP_Fixed_Investment_A'].shift(+12)
df_M['GDP_Gross_Private_Domestic_Investment_A'] = df_M['GDP_Gross_Private_Domestic_Investment_A'].shift(+12)
df_M['GDP_Government_Consumption_Expenditure_And_Gross_Investment_A'] = df_M['GDP_Government_Consumption_Expenditure_And_Gross_Investment_A'].shift(+12)
df_M['Exports_Goods_Services_Growth'] = df_M['Exports_Goods_Services_Growth'].shift(+3)
df_M['Imports_Goods_Services_Growth'] = df_M['Imports_Goods_Services_Growth'].shift(+3)
df_M['Primary_Income_Receipts_Growth'] = df_M['Primary_Income_Receipts_Growth'].shift(+3)
df_M['Primary_Income_Payments_Growth'] = df_M['Primary_Income_Payments_Growth'].shift(+3)
df_M['Secondary_Income_Growth'] = df_M['Secondary_Income_Growth'].shift(+3)
df_M['Net_Acquisition_Growth'] = df_M['Net_Acquisition_Growth'].shift(+3)
df_M['Direct_Investment_Asset_Growth'] = df_M['Direct_Investment_Asset_Growth'].shift(+3)
df_M['Direct_Investment_Liabilities_Growth'] = df_M['Direct_Investment_Liabilities_Growth'].shift(+3)
df_M['Portfolio_Investment_Assets_Growth'] = df_M['Portfolio_Investment_Assets_Growth'].shift(+3)
df_M['Portfolio_Investment_Liabilities_Growth'] = df_M['Portfolio_Investment_Liabilities_Growth'].shift(+3)
df_M['Other_Investment_Assets_Growth'] = df_M['Other_Investment_Assets_Growth'].shift(+3)
df_M['Other_Investment_Liabilities_Growth'] = df_M['Other_Investment_Liabilities_Growth'].shift(+3)
df_M['Reserve_Assets_Growth'] = df_M['Reserve_Assets_Growth'].shift(+3)
df_M['Balance_On_Primary_Income_Growth'] = df_M['Balance_On_Primary_Income_Growth'].shift(+3)
df_M['Balance_On_Secondary_Income_Growth'] = df_M['Balance_On_Secondary_Income_Growth'].shift(+3)
df_M['TreasuryYield_Absolute'] = df_M['TreasuryYield_Absolute'].shift(+1)
df_M['TreasuryYield_Relative'] = df_M['TreasuryYield_Relative'].shift(+1)
df_M['Business_Confidence_Index'] = df_M['Business_Confidence_Index'].shift(+1)
df_M['High_A%'] = df_M['High_A'].pct_change()*100
df_M['Close_A%'] = df_M['Close_A'].pct_change()*100
df_M['Low_A%'] = df_M['Low_A'].pct_change()*100
df_M['Volume_A%'] = df_M['Volume_A'].pct_change()*100
df_M['High_A%'] = df_M['High_A%'].shift(+12)
df_M['Close_A%'] = df_M['Close_A%'].shift(+12)
df_M['Low_A%'] = df_M['Low_A%'].shift(+12)
df_M['Volume_A%'] = df_M['Volume_A%'].shift(+12)
df_M['High_A'] = df_M['High_A'].shift(+12)
df_M['Close_A'] = df_M['Close_A'].shift(+12)
df_M['Low_A'] = df_M['Low_A'].shift(+12)
df_M['Volume_A'] = df_M['Volume_A'].shift(+12)
df_M['Gross_Domestic_Product_Growth_A'] = df_M['Gross_Domestic_Product_Growth_A'].shift(+12)
df_M['High_M_DXY%'] = df_M['High_M_DXY'].pct_change()*100
df_M['High_Q_DXY%'] = df_M['High_Q_DXY'].pct_change()*100
df_M['High_A_DXY%'] = df_M['High_A_DXY'].pct_change()*100
df_M['Close_M_DXY%'] = df_M['Close_M_DXY'].pct_change()*100
df_M['Close_Q_DXY%'] = df_M['Close_Q_DXY'].pct_change()*100
df_M['Close_A_DXY%'] = df_M['Close_A_DXY'].pct_change()*100
df_M['Low_M_DXY%'] = df_M['Low_M_DXY'].pct_change()*100
df_M['Low_Q_DXY%'] = df_M['Low_Q_DXY'].pct_change()*100
df_M['Low_A_DXY%'] = df_M['Low_A_DXY'].pct_change()*100
df_M['High_M_DXY%'] = df_M['High_M_DXY%'].shift(+1)
df_M['High_Q_DXY%'] = df_M['High_Q_DXY%'].shift(+3)
df_M['High_A_DXY%'] = df_M['High_A_DXY%'].shift(+12)
df_M['Close_M_DXY%'] = df_M['Close_M_DXY%'].shift(+1)
df_M['Close_Q_DXY%'] = df_M['Close_Q_DXY%'].shift(+3)
df_M['Close_A_DXY%'] = df_M['Close_A_DXY%'].shift(+12)
df_M['Low_M_DXY%'] = df_M['Low_M_DXY%'].shift(+1)
df_M['Low_Q_DXY%'] = df_M['Low_Q_DXY%'].shift(+3)
df_M['Low_A_DXY%'] = df_M['Low_A_DXY%'].shift(+12)
df_M['High_Q_NASDAQ%'] = df_M['High_Q_NASDAQ'].pct_change()*100
df_M['High_A_NASDAQ%'] = df_M['High_A_NASDAQ'].pct_change()*100
df_M['Close_Q_NASDAQ%'] = df_M['Close_Q_NASDAQ'].pct_change()*100
df_M['Close_A_NASDAQ%'] = df_M['Close_A_NASDAQ'].pct_change()*100
df_M['Low_Q_NASDAQ%'] = df_M['Low_Q_NASDAQ'].pct_change()*100
df_M['Low_A_NASDAQ%'] = df_M['Low_A_NASDAQ'].pct_change()*100
df_M['High_Q_NASDAQ%'] = df_M['High_Q_NASDAQ%'].shift(+3)
df_M['High_A_NASDAQ%'] = df_M['High_A_NASDAQ%'].shift(+12)
df_M['Close_Q_NASDAQ%'] = df_M['Close_Q_NASDAQ%'].shift(+3)
df_M['Close_A_NASDAQ%'] = df_M['Close_A_NASDAQ%'].shift(+12)
df_M['Low_Q_NASDAQ%'] = df_M['Low_Q_NASDAQ%'].shift(+3)
df_M['Low_A_NASDAQ%'] = df_M['Low_A_NASDAQ%'].shift(+12)
df_M['Close_M_US10Y%'] = df_M['Close_M_US10Y'].pct_change()*100
df_M['Close_Q_US10Y%'] = df_M['Close_Q_US10Y'].pct_change()*100
df_M['Close_A_US10Y%'] = df_M['Close_A_US10Y'].pct_change()*100
df_M['Close_M_US10Y%'] = df_M['Close_M_US10Y%'].shift(+1)
df_M['Close_Q_US10Y%'] = df_M['Close_Q_US10Y%'].shift(+3)
df_M['Close_A_US10Y%'] = df_M['Close_A_US10Y%'].shift(+12)
df_M['Close_M_US5Y%'] = df_M['Close_M_US5Y'].pct_change()*100
df_M['Close_Q_US5Y%'] = df_M['Close_Q_US5Y'].pct_change()*100
df_M['Close_A_US5Y%'] = df_M['Close_A_US5Y'].pct_change()*100
df_M['Close_M_US5Y%'] = df_M['Close_M_US5Y%'].shift(+1)
df_M['Close_Q_US5Y%'] = df_M['Close_Q_US5Y%'].shift(+3)
df_M['Close_A_US5Y%'] = df_M['Close_A_US5Y%'].shift(+12)
df_M['Close_M_US13W%'] = df_M['Close_M_US13W'].pct_change()*100
df_M['Close_Q_US13W%'] = df_M['Close_Q_US13W'].pct_change()*100
df_M['Close_A_US13W%'] = df_M['Close_A_US13W'].pct_change()*100
df_M['Close_M_US13W%'] = df_M['Close_M_US13W%'].shift(+1)
df_M['Close_Q_US13W%'] = df_M['Close_Q_US13W%'].shift(+3)
df_M['Close_A_US13W%'] = df_M['Close_A_US13W%'].shift(+12)
df_M['High_M_GOLD%'] = df_M['High_M_GOLD'].pct_change()*100
df_M['High_Q_GOLD%'] = df_M['High_Q_GOLD'].pct_change()*100
df_M['High_A_GOLD%'] = df_M['High_A_GOLD'].pct_change()*100
df_M['Close_M_GOLD%'] = df_M['Close_M_GOLD'].pct_change()*100
df_M['Close_Q_GOLD%'] = df_M['Close_Q_GOLD'].pct_change()*100
df_M['Close_A_GOLD%'] = df_M['Close_A_GOLD'].pct_change()*100
df_M['Low_M_GOLD%'] = df_M['Low_M_GOLD'].pct_change()*100
df_M['Low_Q_GOLD%'] = df_M['Low_Q_GOLD'].pct_change()*100
df_M['Low_A_GOLD%'] = df_M['Low_A_GOLD'].pct_change()*100
df_M['High_M_GOLD%'] = df_M['High_M_GOLD%'].shift(+1)
df_M['High_Q_GOLD%'] = df_M['High_Q_GOLD%'].shift(+3)
df_M['High_A_GOLD%'] = df_M['High_A_GOLD%'].shift(+12)
df_M['Close_M_GOLD%'] = df_M['Close_M_GOLD%'].shift(+1)
df_M['Close_Q_GOLD%'] = df_M['Close_Q_GOLD%'].shift(+3)
df_M['Close_A_GOLD%'] = df_M['Close_A_GOLD%'].shift(+12)
df_M['Low_M_GOLD%'] = df_M['Low_M_GOLD%'].shift(+1)
df_M['Low_Q_GOLD%'] = df_M['Low_Q_GOLD%'].shift(+3)
df_M['Low_A_GOLD%'] = df_M['Low_A_GOLD%'].shift(+12)
df_M['Close_M_COPPER%'] = df_M['Close_M_COPPER'].pct_change()*100
df_M['Close_Q_COPPER%'] = df_M['Close_Q_COPPER'].pct_change()*100
df_M['Close_A_COPPER%'] = df_M['Close_A_COPPER'].pct_change()*100
df_M['Close_M_COPPER%'] = df_M['Close_M_COPPER%'].shift(+1)
df_M['Close_Q_COPPER%'] = df_M['Close_Q_COPPER%'].shift(+3)
df_M['Close_A_COPPER%'] = df_M['Close_A_COPPER%'].shift(+12)
df_M.replace(0,np.nan, inplace=True)
cols=[
'Unemployment_Rate','CPI_12','CPI_3','CPI_1','DFF_Absolute','DFF_Relative','Consumer_Sentiment','Consumer_Sentiment_Growth','Payout_Ratio','Payout_Ratio_Growth',
'Earnings_Growth','Earnings_Yield_Growth','Dividends_Growth','Dividends_Yield_Growth','GDP_per_Capita_Growth_Q','GDP_per_Capita_Growth_A','GDP_Government_Consumption_Expenditure_And_Gross_Investment_Q',
'GDP_Government_Consumption_Expenditure_And_Gross_Investment_A','Primary_Income_Receipts_Growth','Primary_Income_Payments_Growth','Secondary_Income_Growth','TreasuryYield_Absolute','TreasuryYield_Relative','Business_Confidence_Index']
df.drop(cols,axis=1,inplace=True)
df_M.fillna(method='ffill', inplace=True)
df.reset_index(drop=True, inplace=True)
df = pd.merge(df, df_M[['Date','RSI_M','High_M%','High_Q%','High_A%','Close_M%','Close_Q%','Close_A%','Low_M%','Low_Q%','Low_A%','Volume_M%','Volume_Q%','Volume_A%',
'Unemployment_Rate','CPI_12','CPI_3','CPI_1','DFF_Absolute','DFF_Relative','Consumer_Sentiment','Consumer_Sentiment_Growth','Payout_Ratio','Payout_Ratio_Growth',
'Earnings_Growth','Earnings_Yield_Growth','Dividends_Growth','Dividends_Yield_Growth','GDP_per_Capita_Growth_Q','GDP_per_Capita_Growth_A','GDP_Government_Consumption_Expenditure_And_Gross_Investment_Q',
'GDP_Government_Consumption_Expenditure_And_Gross_Investment_A','Primary_Income_Receipts_Growth','Primary_Income_Payments_Growth','Secondary_Income_Growth','TreasuryYield_Absolute','TreasuryYield_Relative','Business_Confidence_Index',
'High_M_GOLD%','High_Q_GOLD%','High_A_GOLD%','Close_M_GOLD%','Close_Q_GOLD%','Close_A_GOLD%','Low_M_GOLD%','Low_Q_GOLD%','Low_A_GOLD%', 'Close_M_COPPER%', 'Close_Q_COPPER%', 'Close_A_COPPER%'
]], on='Date', how='left')
df_A = df.dropna(subset=['Close_A'])
df_A.reset_index(drop=True, inplace=True)
def calculate_rsi(df_A, window=14):
    delta = df_A['Close_A'].diff()
    gains = (delta.where(delta > 0, 0)).fillna(0)
    losses = (-delta.where(delta < 0, 0)).fillna(0)
    avg_gain = gains.rolling(window=window, min_periods=1).mean()
    avg_loss = losses.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    df_A['RSI_A'] = rsi
    return df_A
df_A = calculate_rsi(df_A)
df_A['RSI_A'] = df_A['RSI_A'].shift(+1)
df_A.fillna(method='ffill', inplace=True)
df = pd.merge(df, df_A[['Date','RSI_A']], on='Date', how='left')
df.fillna(method='ffill', inplace=True)
df=df.dropna()
df.reset_index(drop=True, inplace=True)
df['High_D%'] = df['High_D'].pct_change()*100
df['Close_D%'] = df['Close_D'].pct_change()*100
df['Low_D%'] = df['Low_D'].pct_change()*100
df['High_D%'] = df['High_D%'].shift(+1)
df['Close_D%'] = df['Close_D%'].shift(+1)
df['Low_D%'] = df['Low_D%'].shift(+1)
df['High_D'] = df['High_D'].shift(+1)
df['Close_D'] = df['Close_D'].shift(+1)
df['Low_D'] = df['Low_D'].shift(+1)
Start_date = '1973-01-03'
End_date = '2025-02-28'
df = df[(df['Date'] >= Start_date) & (df['Date'] <= End_date)]
df.reset_index(drop=True,inplace=True)
max_close_so_far = df['Close_D'].iloc[0]
last_max_close_date = df['Date'].iloc[0]
for i in range(len(df)):
    if df['Close_D'].iloc[i] > max_close_so_far:
        max_close_so_far = df['Close_D'].iloc[i]
        last_max_close_date = df['Date'].iloc[i]
    timedelta = df['Date'].iloc[i] - last_max_close_date
    df.at[i, 'Months Since Last Max Close'] = timedelta.days  # stored in days, despite the column name
    df.at[i, 'Current Drawdown % from Past Close'] = 1 - ((df['Low_D'].iloc[i]) / max_close_so_far)
df['Future Drawdown % from Close'] = float('NaN')
for i in range(len(df) - 1):
    current_high = df.loc[i, 'Close_D']
    future_lows = df.loc[i + 1:, 'Low_D']
    if not future_lows.empty:
        future_min_low = future_lows.min()
        drawdown_percentage = 1 - (future_min_low / current_high)
        df.at[i, 'Future Drawdown % from Close'] = drawdown_percentage
df.replace(np.nan,0,inplace=True)
# Keep only the relevant features (already studied via exploratory analysis)
selected_columns = [
"Date",'High_A%','Volume_A%','Unemployment_Rate','CPI_12','Consumer_Sentiment','Consumer_Sentiment_Growth',
'Earnings_Yield_Growth','GDP_per_Capita_Growth_A','GDP_Government_Consumption_Expenditure_And_Gross_Investment_A','Secondary_Income_Growth',
'High_A_GOLD%','Close_A_GOLD%','Low_A_GOLD%', 'Close_A_COPPER%', "Current Drawdown % from Past Close", "Future Drawdown % from Close","CAPE","CAPE_Growth","RSI_A","Gross_Domestic_Product_Growth_Q","CPI_3","DFF_Absolute","TreasuryYield_Relative","TreasuryYield_Absolute","Personal_Income_Expenditure_Change_A","Personal_Income_Expenditure_Change_Q","Business_Confidence_Index"
]
# Filter the DataFrame
df = df[selected_columns]
%%capture
# Split the data into training and test sets; not at random, because I want to see how the model behaves on the most recent data.
train_df = df[df['Date']< '2021-01-01']
train_df.reset_index(drop=True,inplace=True)
train_df.drop(['Date'],axis=1,inplace=True)
test_df = df[(df['Date'] >= '2021-01-01') & (df['Date'] <= '2025-02-28')]
test_df.reset_index(drop=True,inplace=True)
test_df.drop(['Date'],axis=1,inplace=True)
# Inspect the test data to validate that the most recent values are correct (compare against the source)
test_df
High_A% | Volume_A% | Unemployment_Rate | CPI_12 | Consumer_Sentiment | Consumer_Sentiment_Growth | Earnings_Yield_Growth | GDP_per_Capita_Growth_A | GDP_Government_Consumption_Expenditure_And_Gross_Investment_A | Secondary_Income_Growth | ... | CAPE_Growth | RSI_A | Gross_Domestic_Product_Growth_Q | CPI_3 | DFF_Absolute | TreasuryYield_Relative | TreasuryYield_Absolute | Personal_Income_Expenditure_Change_A | Personal_Income_Expenditure_Change_Q | Business_Confidence_Index | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 15.772200 | 38.876282 | 6.7 | 1.3 | 76.9 | -19.9 | -30.845771 | -2.554366 | 3.4 | -2.105958 | ... | 2.211840 | 80.507163 | 5.6 | 0.8 | -1.460968 | -49.88148 | -0.929221 | 9.5 | 2.288468 | 101.36050 |
1 | 15.772200 | 38.876282 | 6.7 | 1.3 | 76.9 | -19.9 | -30.845771 | -2.554366 | 3.4 | -2.105958 | ... | 2.211840 | 80.507163 | 5.6 | 0.8 | -1.460968 | -49.88148 | -0.929221 | 9.5 | 2.288468 | 101.36050 |
2 | 15.772200 | 38.876282 | 6.7 | 1.3 | 76.9 | -19.9 | -30.845771 | -2.554366 | 3.4 | -2.105958 | ... | 2.211840 | 80.507163 | 5.6 | 0.8 | -1.460968 | -49.88148 | -0.929221 | 9.5 | 2.288468 | 101.36050 |
3 | 15.772200 | 38.876282 | 6.7 | 1.3 | 76.9 | -19.9 | -30.845771 | -2.554366 | 3.4 | -2.105958 | ... | 2.211840 | 80.507163 | 5.6 | 0.8 | -1.460968 | -49.88148 | -0.929221 | 9.5 | 2.288468 | 101.36050 |
4 | 15.772200 | 38.876282 | 6.7 | 1.3 | 76.9 | -19.9 | -30.845771 | -2.554366 | 3.4 | -2.105958 | ... | 2.211840 | 80.507163 | 5.6 | 0.8 | -1.460968 | -49.88148 | -0.929221 | 9.5 | 2.288468 | 101.36050 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1515 | 27.260352 | -1.524870 | 4.0 | 2.9 | 74.0 | 4.3 | -16.250000 | 1.855928 | 3.4 | 16.807415 | ... | 0.846611 | 83.796796 | 2.3 | 1.0 | -1.000000 | 14.06947 | 0.570000 | 4.2 | 1.036821 | 99.37747 |
1516 | 27.260352 | -1.524870 | 4.0 | 2.9 | 74.0 | 4.3 | -16.250000 | 1.855928 | 3.4 | 16.807415 | ... | 0.846611 | 83.796796 | 2.3 | 1.0 | -1.000000 | 14.06947 | 0.570000 | 4.2 | 1.036821 | 99.37747 |
1517 | 27.260352 | -1.524870 | 4.0 | 2.9 | 74.0 | 4.3 | -16.250000 | 1.855928 | 3.4 | 16.807415 | ... | 0.846611 | 83.796796 | 2.3 | 1.0 | -1.000000 | 14.06947 | 0.570000 | 4.2 | 1.036821 | 99.37747 |
1518 | 27.260352 | -1.524870 | 4.0 | 2.9 | 74.0 | 4.3 | -16.250000 | 1.855928 | 3.4 | 16.807415 | ... | 0.846611 | 83.796796 | 2.3 | 1.0 | -1.000000 | 14.06947 | 0.570000 | 4.2 | 1.036821 | 99.37747 |
1519 | 27.260352 | -1.524870 | 4.0 | 2.9 | 74.0 | 4.3 | -16.250000 | 1.855928 | 3.4 | 16.807415 | ... | 0.846611 | 83.796796 | 2.3 | 1.0 | -1.000000 | 14.06947 | 0.570000 | 4.2 | 1.036821 | 99.37747 |
1520 rows × 27 columns
Section 3: Exploratory Analysis
%%capture
# Statistical summary of the features
for col in train_df.columns:
    print(f"Statistics for column {col}:")
    print(train_df[col].describe())
    print("\n")
# Boxplots for exploratory analysis
# Split the columns into manageable batches
batch_size = 25  # Adjust the batch size as needed
num_batches = (len(train_df.columns) + batch_size - 1) // batch_size  # Number of batches
for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = start_idx + batch_size
    columns_batch = train_df.columns[start_idx:end_idx]
    fig, axes = plt.subplots(nrows=len(columns_batch), ncols=1, figsize=(12, len(columns_batch) * 3))
    if len(columns_batch) == 1:
        axes = [axes]  # Make axes iterable when there is a single subplot
    for i, col in enumerate(columns_batch):
        if col in train_df.columns:  # Make sure the column exists in the DataFrame
            train_df[col].plot(kind='box', ax=axes[i])
            axes[i].set_title(col)
    plt.tight_layout()
    plt.show()
    plt.close(fig)
# Histograms for exploratory analysis
# Split the columns into manageable batches
batch_size = 25
num_batches = (len(train_df.columns) + batch_size - 1) // batch_size  # Number of batches
for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = start_idx + batch_size
    columns_batch = train_df.columns[start_idx:end_idx]
    fig, axes = plt.subplots(nrows=len(columns_batch), ncols=1, figsize=(12, len(columns_batch) * 3))
    if len(columns_batch) == 1:
        axes = [axes]
    for i, col in enumerate(columns_batch):
        if i < len(axes):
            axes[i].hist(train_df[col], bins=30, color='skyblue', edgecolor='black')
            axes[i].set_title(col)
    plt.tight_layout()
    plt.show()
    plt.close(fig)
# Scatter plots for exploratory analysis
# Split the columns into manageable batches
batch_size = 25
num_batches = (len(train_df.columns) + batch_size - 1) // batch_size
for batch in range(num_batches):
    start_idx = batch * batch_size
    end_idx = start_idx + batch_size
    columns_batch = train_df.columns[start_idx:end_idx]
    fig, axes = plt.subplots(nrows=len(columns_batch), ncols=1, figsize=(12, len(columns_batch) * 3))
    if len(columns_batch) == 1:
        axes = [axes]
    for i, col in enumerate(columns_batch):
        train_df.plot(kind='scatter', x=col, y='Future Drawdown % from Close', ax=axes[i])
    plt.tight_layout()
    plt.show()
    plt.close(fig)
Section 4: Feature Engineering
# Split into a DataFrame with the features and a Series with the target variable
X = train_df.drop(['Future Drawdown % from Close'], axis=1)
y = train_df['Future Drawdown % from Close']
# Normalize the data in order to train the KNN
X_Normalized = (X-X.min())/(X.max()-X.min())
X_Normalized
X_test = test_df.drop(['Future Drawdown % from Close'],axis=1)
y_test = test_df['Future Drawdown % from Close']
scaler = MinMaxScaler()
# Fit the scaler on the training data and transform it
X_normalized = scaler.fit_transform(X)
# Transform the test data using the same scaler
X_test_normalized = scaler.transform(X_test)
# Convert the normalized arrays back to DataFrames
X_normalized_df = pd.DataFrame(X_normalized, columns=X.columns)
X_test_normalized_df = pd.DataFrame(X_test_normalized, columns=X_test.columns)
Section 5: Machine Learning Models
Model 1: KNN
%%capture
# Use GridSearchCV to find the optimal number of neighbors for KNN
"""# Machine Learning Models
#Model 3: K-Nearest-Neighborhood
#Cross Validation to select the quantity of neighbors
# Define the parameter grid
param_grid = {'n_neighbors': list(range(1, 2000))}
# Create a KNeighborsClassifier instance
knn = KNeighborsRegressor()
# Create a GridSearchCV instance
grid_search = GridSearchCV(knn, param_grid, cv=5, verbose=2, n_jobs=8)
# Fit the grid search to the data
grid_search.fit(X_Normalized, y)
# Best parameters found. In cross-validation, especially with k-NN, it's common to vary the number of neighbors (the k value) to find the best model. You can do this with a loop or, more systematically, with GridSearchCV or RandomizedSearchCV, which not only tries different values of k but also performs cross-validation at the same time.
#Using GridSearchCV to find the best k
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")
"""
# Use 999 nearest neighbors
knn = KNeighborsRegressor(n_neighbors=999)
# Train the model on the normalized data
knn.fit(X_Normalized, y)
KNeighborsRegressor(n_neighbors=999)
# Cross-validation to evaluate the model
mae_scores = cross_val_score(knn, X_Normalized, y, cv=5, scoring='neg_mean_absolute_error')
rmse_scores = cross_val_score(knn, X_normalized, y, cv=5, scoring='neg_root_mean_squared_error')
mse_scores = cross_val_score(knn, X_normalized, y, cv=5, scoring='neg_mean_squared_error')
print(f"CV MAE Scores: {mae_scores}")
print(f"Average CV MAE Score: {mae_scores.mean()}")
print(f"CV RMSE Scores: {rmse_scores}")
print(f"Average CV RMSE Score: {rmse_scores.mean()}")
print(f"CV MSE Scores: {mse_scores}")
print(f"Average CV MSE Score: {mse_scores.mean()}")
CV MAE Scores: [-0.14415454 -0.11846136 -0.20397549 -0.23626327 -0.14168603]
Average CV MAE Score: -0.1689081399056976
CV RMSE Scores: [-0.18077957 -0.18165573 -0.23208107 -0.27605778 -0.16584231]
Average CV RMSE Score: -0.2072832889729766
CV MSE Scores: [-0.03268125 -0.0329988 -0.05386162 -0.0762079 -0.02750367]
Average CV MSE Score: -0.04465064869439081
%%capture
# Hide the output with capture to protect the model's contents
# Determine feature weights with permutation importance
result = permutation_importance(knn, X_test_normalized_df, y_test, scoring='neg_mean_absolute_error', n_repeats=10, random_state=42)  # use the normalized test features, since knn was trained on normalized data
print("Número de columnas en X_test:", X_test.shape[1])
print("Longitud de importances_mean:", len(result.importances_mean))
# Use the actual column names
feature_names = list(X_test.columns)
importance_df = pd.DataFrame({
'Feature': feature_names,
'Importance': result.importances_mean
})
# Sort by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)
for feature, importance in zip(importance_df['Feature'], importance_df['Importance']):
    print(f"{feature}: {importance:.4f}")
# Make predictions on the training and test data
y_pred = knn.predict(X_Normalized)
y_pred2 = knn.predict(X_test_normalized_df)
# Evaluate the model on the test data
rmse = root_mean_squared_error(y_test, y_pred2)
mse = mean_squared_error(y_test, y_pred2)
mae = mean_absolute_error(y_test, y_pred2)
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
Root Mean Squared Error: 0.13559073421776544
Mean Squared Error: 0.018384847205712703
Mean Absolute Error: 0.12217366043600938
# Compute residuals
residuos = y_test - y_pred2
residuos_abs = np.abs(residuos)
# Plot 1: Histogram of the residuals
plt.figure(figsize=(10, 6))
plt.hist(residuos, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Plot 2: QQ plot of the residuals
plt.figure(figsize=(10, 6))
stats.probplot(residuos, dist="norm", plot=plt)
plt.title('QQ Plot of Residuals')
plt.grid(True)
plt.show()
# Plot 3: Model on the training data
plt.figure(figsize=(12, 7))
# Assuming 'Row ID' is the index of df
plt.bar(train_df.index - 0.2, y, width=0.4, label='Actual drawdown')
plt.bar(train_df.index + 0.2, y_pred, width=0.4, label='Predicted drawdown')
plt.title('Model 1-KNN: Drawdown_train')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 4: Model on the test data
plt.figure(figsize=(12, 7))
# Assuming 'Row ID' is the index of df
plt.bar(test_df.index - 0.2, y_test, width=0.4, label='Actual drawdown')
plt.bar(test_df.index + 0.2, y_pred2, width=0.4, label='Predicted drawdown')
plt.title('Model 1-KNN: Drawdown_test')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 5: Predicted values vs. absolute residuals
plt.figure(figsize=(10, 6))
plt.scatter(y_pred2, residuos_abs, alpha=0.5, color= "purple")
plt.title('Model 1-KNN: Absolute residuals vs. predicted value')
plt.xlabel('Predicted value')
plt.ylabel('Absolute residuals')
plt.grid(True)
plt.show()
plt.close()  # Close the figure to free memory
Model 2: Decision Tree
# Model 2: Decision Tree
# Define the decision tree regression model
model = DecisionTreeRegressor(random_state=42)
# Define the parameter grid for the grid search
param_grid = {
'max_depth': [None, 5, 10, 15, 20],
'min_samples_split': [2, 10, 20],
'min_samples_leaf': [1, 5, 10],
'max_features': [None, 'sqrt', 'log2']
}
# Set up GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=2)
# Fit the grid search to find the best parameters
grid_search.fit(X, y)
# Get the best model
best_model = grid_search.best_estimator_
Fitting 5 folds for each of 135 candidates, totalling 675 fits
plt.figure(figsize=(20, 10))  # Adjust the figure size as needed
plot_tree(best_model, filled=True, feature_names=X.columns, fontsize=10)
plt.title('Decision tree')
plt.close()
# Cross-validation to evaluate the model
cv_scores = cross_val_score(best_model, X_normalized, y, cv=5, scoring='neg_mean_absolute_error')
cv_scores2 = cross_val_score(best_model, X_normalized, y, cv=5, scoring='neg_root_mean_squared_error')
cv_scores3 = cross_val_score(best_model, X_normalized, y, cv=5, scoring='neg_mean_squared_error')
mae_scores = -cv_scores
rmse_scores = -cv_scores2
mse_scores = -cv_scores3
print(f"CV MAE Scores: {mae_scores}")
print(f"Average CV MAE Score: {mae_scores.mean()}")
print(f"CV RMSE Scores: {rmse_scores}")
print(f"Average CV RMSE Score: {rmse_scores.mean()}")
print(f"CV MSE Scores: {mse_scores}")
print(f"Average CV MSE Score: {mse_scores.mean()}")
CV MAE Scores: [0.11982018 0.10137021 0.22041393 0.16315768 0.12037304]
Average CV MAE Score: 0.14502700703004662
CV RMSE Scores: [0.16838936 0.12172215 0.25868258 0.21969937 0.14614422]
Average CV RMSE Score: 0.1829275338333946
CV MSE Scores: [0.02835498 0.01481628 0.06691668 0.04826781 0.02135813]
Average CV MSE Score: 0.035942775638766244
# Predictions on the test data
y_pred2 = best_model.predict(X_test)
# Evaluate the model on the test data
rmse = root_mean_squared_error(y_test, y_pred2)
mse = mean_squared_error(y_test, y_pred2)
mae = mean_absolute_error(y_test, y_pred2)
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
Root Mean Squared Error: 0.2249189439013285
Mean Squared Error: 0.050588531325688954
Mean Absolute Error: 0.1749234384962452
# Compute residuals
residuos = y_test - y_pred2
residuos_abs = np.abs(residuos)
# Plot 1: Histogram of the residuals
plt.figure(figsize=(10, 6))
plt.hist(residuos, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Plot 2: QQ plot of the residuals
plt.figure(figsize=(10, 6))
stats.probplot(residuos, dist="norm", plot=plt)
plt.title('QQ Plot of Residuals')
plt.grid(True)
plt.show()
# Plot 3: DT predictions on the training data
y_pred = best_model.predict(X)  # training predictions from the decision tree
plt.figure(figsize=(12, 7))
# Assuming 'Row ID' is the index of df
plt.bar(train_df.index - 0.2, y, width=0.4, label='Actual drawdown')
plt.bar(train_df.index + 0.2, y_pred, width=0.4, label='Predicted drawdown')
plt.title('Model 2-DT: Drawdown_Train')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 4: DT predictions on the test data
plt.figure(figsize=(12, 7))
# Assuming 'Row ID' is the index of df
plt.bar(test_df.index - 0.2, y_test, width=0.4, label='Actual drawdown')
plt.bar(test_df.index + 0.2, y_pred2, width=0.4, label='Predicted drawdown')
plt.title('Model 2-DT: Drawdown_Test')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 5: Absolute residuals vs. predicted value
plt.figure(figsize=(10, 6))
plt.scatter(y_pred2, residuos_abs, alpha=0.5, color= "green")
plt.title('Model 2-DT: Absolute residuals vs. predicted value')
plt.xlabel('Predicted value')
plt.ylabel('Absolute residuals')
plt.grid(True)
plt.show()
plt.close()
Model 3: Random Forest
%%capture
"""# Modelo 3: Random Forest
# Grid search para escoger los mejores hiperparámetros
param_grid = {
'n_estimators': [100, 200, 300,],
'max_depth': [None, 10, 20, 30],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4],
'max_features': ['sqrt', 'log2']
}
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
grid_search.fit(X_normalized, y)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)
"""
# Train the random forest
best_params = {
'max_depth': 10,
'max_features':'log2',
'min_samples_leaf': 2,
'min_samples_split': 2,
'n_estimators': 100
}
rf = RandomForestRegressor(**best_params)
rf.fit(X_normalized, y)
RandomForestRegressor(max_depth=10, max_features='log2', min_samples_leaf=2)
# Cross-validation to evaluate model performance
cv_scores = cross_val_score(rf, X_normalized, y, cv=5, scoring='neg_mean_absolute_error')
cv_scores2 = cross_val_score(rf, X_normalized, y, cv=5, scoring='neg_root_mean_squared_error')
cv_scores3 = cross_val_score(rf, X_normalized, y, cv=5, scoring='neg_mean_squared_error')
# Since the scoring returns negative values, we multiply by -1 to get positive errors
mae_scores = -cv_scores
rmse_scores = -cv_scores2
mse_scores = -cv_scores3
print(f"CV MEA Scores: {mae_scores}")
print(f"Average CV MEA Score: {mae_scores.mean()}")
print(f"CV RMSE Scores: {rmse_scores}")
print(f"Average CV RMSE Score: {rmse_scores.mean()}")
print(f"CV MSE Scores: {mse_scores}")
print(f"Average CV RMSE Score: {mse_scores.mean()}")
CV MAE Scores: [0.09873589 0.12606374 0.17610461 0.23498032 0.14504664]
Average CV MAE Score: 0.15618624040019777
CV RMSE Scores: [0.10594645 0.13818789 0.19817576 0.25905039 0.17002988]
Average CV RMSE Score: 0.17427807458877956
CV MSE Scores: [0.01135283 0.02012412 0.03802585 0.07136105 0.02602355]
Average CV MSE Score: 0.03337748115337203
%%capture
# Predictions on the training and test data
y_pred = rf.predict(X_test_normalized_df)
y_pred3 = rf.predict(X_Normalized)
# Evaluate the model on the test data
rmse = root_mean_squared_error(y_test,y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred) # Calculate MAE
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
Root Mean Squared Error: 0.18642023170242236
Mean Squared Error: 0.03475250278798484
Mean Absolute Error: 0.17409408180766328
# Compute residuals
residuos = y_test - y_pred
residuos_abs = np.abs(residuos)
# Plot 1: Histogram of the residuals
plt.figure(figsize=(10, 6))
plt.hist(residuos, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Plot 2: QQ plot of the residuals
plt.figure(figsize=(10, 6))
stats.probplot(residuos, dist="norm", plot=plt)
plt.title('QQ Plot of Residuals')
plt.grid(True)
plt.show()
# Plot 3: Model on the training data
plt.figure(figsize=(12, 7))
plt.bar(train_df.index - 0.2, y, width=0.4, label='Actual drawdown')
plt.bar(train_df.index + 0.2, y_pred3, width=0.4, label='Predicted drawdown')
plt.title('Model 3-RF: Drawdown_train')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 4: Model on the test data
plt.figure(figsize=(12, 7))
plt.bar(test_df.index - 0.2, y_test, width=0.4, label='Actual drawdown')
plt.bar(test_df.index + 0.2, y_pred, width=0.4, label='Predicted drawdown')
plt.title('Model 3-RF: Drawdown_Test')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 5: Absolute residuals vs. predicted value
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuos_abs, alpha=0.5, color="red")
plt.title('Model 3-RF: Residuals')
plt.xlabel('Predicted value')
plt.ylabel('Absolute residuals')
plt.grid(True)
plt.show()
plt.close()
Model 4: LightGBM
%%capture
# Grid search to find the best hyperparameters for LGBM
"""
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [-1, 10, 20, 30],  # -1 means no depth limit
    'num_leaves': [31, 50, 100],  # maximum number of leaves per tree
    'min_child_samples': [20, 50, 100],  # minimum samples per leaf
    'learning_rate': [0.01, 0.05, 0.1],  # learning rate
    'colsample_bytree': [0.6, 0.8, 1.0]  # feature subsample per tree
}
lgbm = LGBMRegressor()
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', verbose=2, n_jobs=-1)
grid_search.fit(X_normalized, y)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)
"""
# Best parameters from GridSearchCV
best_params = {'colsample_bytree': 0.6,
'learning_rate': 0.01,
'max_depth': -1,
'min_child_samples': 50,
'n_estimators': 100,
'num_leaves': 31}
# Train the LGBM
lgbm = LGBMRegressor(**best_params)
lgbm.fit(X_normalized, y)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds. You can set `force_row_wise=true` to remove the overhead. And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3527
[LightGBM] [Info] Number of data points in the train set: 17530, number of used features: 26
[LightGBM] [Info] Start training from score 0.181807
LGBMRegressor(colsample_bytree=0.6, learning_rate=0.01, min_child_samples=50)
# Cross-validation to evaluate the model
cv_scores = cross_val_score(lgbm, X_normalized, y, cv=5, scoring='neg_mean_absolute_error')
cv_scores2 = cross_val_score(lgbm, X_normalized, y, cv=5, scoring='neg_root_mean_squared_error')
cv_scores3 = cross_val_score(lgbm, X_normalized, y, cv=5, scoring='neg_mean_squared_error')
mae_scores = -cv_scores
rmse_scores = -cv_scores2
mse_scores = -cv_scores3
print(f"CV MAE Scores: {mae_scores}")
print(f"Average CV MAE Score: {mae_scores.mean()}")
print(f"CV RMSE Scores: {rmse_scores}")
print(f"Average CV RMSE Score: {rmse_scores.mean()}")
print(f"CV MSE Scores: {mse_scores}")
print(f"Average CV MSE Score: {mse_scores.mean()}")
CV MAE Scores: [0.10197824 0.11714996 0.16456228 0.23916487 0.1228751 ]
Average CV MAE Score: 0.14914608855712091
CV RMSE Scores: [0.12120531 0.13602374 0.18384776 0.27251187 0.14195825]
Average CV RMSE Score: 0.17110938382188196
CV MSE Scores: [0.01469073 0.01850246 0.0338 0.07426272 0.02015214]
Average CV MSE Score: 0.032281608831985956
# Make predictions on the training and test datasets
y_pred = lgbm.predict(X_test_normalized_df)
y_pred3 = lgbm.predict(X_normalized)
# Evaluate the model on the test dataset
rmse = root_mean_squared_error(y_test,y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred) # Calculate MAE
print(f'Root Mean Squared Error: {rmse}')
print(f'Mean Squared Error: {mse}')
print(f'Mean Absolute Error: {mae}')
Root Mean Squared Error: 0.1408472127024267
Mean Squared Error: 0.019837937326042627
Mean Absolute Error: 0.12907963881982978
# Prediction results
results_test = pd.DataFrame({
'Actual': y_test,
'Predicted': y_pred
})
results_test
Actual | Predicted | |
---|---|---|
0 | 0.070417 | 0.199659 |
1 | 0.070417 | 0.199659 |
2 | 0.070417 | 0.199659 |
3 | 0.056495 | 0.199659 |
4 | 0.063131 | 0.199659 |
... | ... | ... |
1515 | 0.001001 | 0.245094 |
1516 | 0.001001 | 0.245094 |
1517 | 0.001001 | 0.245094 |
1518 | 0.001001 | 0.245094 |
1519 | 0.000000 | 0.245094 |
1520 rows × 2 columns
# Compute residuals
residuos = y_test - y_pred
residuos_abs = np.abs(residuos)
# Plot 1: Histogram of the residuals
plt.figure(figsize=(10, 6))
plt.hist(residuos, bins=30, color='blue', alpha=0.7, edgecolor='black')
plt.title('Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
# Plot 2: QQ plot of the residuals
plt.figure(figsize=(10, 6))
stats.probplot(residuos, dist="norm", plot=plt)
plt.title('QQ Plot of Residuals')
plt.grid(True)
plt.show()
# Plot 3: Predictions on the training data
plt.figure(figsize=(12, 7))
plt.bar(train_df.index - 0.2, y, width=0.4, label='Actual drawdown')
plt.bar(train_df.index + 0.2, y_pred3, width=0.4, label='Predicted drawdown')
plt.title('Model 4-LGBM: Drawdown_train')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()
# Plot 4: Predictions on the test data
plt.figure(figsize=(12, 7))
plt.bar(test_df.index - 0.2, y_test, width=0.4, label='Actual drawdown')
plt.bar(test_df.index + 0.2, y_pred, width=0.4, label='Predicted drawdown')
plt.title('Model 4-LGBM: Drawdown_Test')
plt.xlabel('Row ID')
plt.ylabel('Drawdown')
plt.legend()
plt.show()
plt.close()  # Close the figure to free memory
# Plot 5: Absolute residuals vs. predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuos_abs, alpha=0.5, color="black")
plt.title('Model 4-LightGBM: Residuals')
plt.xlabel('Predicted value')
plt.ylabel('Absolute residuals')
plt.grid(True)
plt.show()
plt.close()
%%capture
# Hide the output with capture to protect the model's contents
# Feature importances
feature_importances = lgbm.feature_importances_
feature_names = X.columns
# Create a DataFrame of the importances
importance_df = pd.DataFrame({
'Feature': feature_names,
'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
print(importance_df)
# Plot the feature importances
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance - LightGBM')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
plt.close()
%%capture
# Hide the output with capture to protect the model's contents
# Compute the importances as percentages
importance_df['Importance (%)'] = (importance_df['Importance'] / importance_df['Importance'].sum()) * 100
print(importance_df)
# Plot the importances as percentages
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance (%)', y='Feature', data=importance_df, palette='viridis')
plt.title('Feature Importance (Percentage) - LightGBM')
plt.xlabel('Importance (%)')
plt.ylabel('Feature')
plt.tight_layout()
Section 6: Conclusion
Model | MAE CV | RMSE CV | MSE CV | MAE Test | RMSE Test | MSE Test |
---|---|---|---|---|---|---|
KNN | 0.1689 | 0.2073 | 0.0446 | 0.1222 | 0.1356 | 0.0184 |
Decision Tree | 0.1450 | 0.1829 | 0.0359 | 0.1749 | 0.2249 | 0.0506 |
Random Forest | 0.1543 | 0.1767 | 0.0315 | 0.1636 | 0.1736 | 0.0301 |
LightGBM | 0.1491 | 0.1711 | 0.0323 | 0.1291 | 0.1408 | 0.0198 |
LightGBM has a very competitive average error in cross-validation: CV MAE (0.1491), CV RMSE (0.1711), and CV MSE (0.0323). These values are among the lowest of the models compared, which indicates better overall performance.
KNN performs well on the test set, but it proved less stable in cross-validation.
Since LightGBM shows the most consistent performance in terms of average error across cross-validation and the test set, this model was chosen for this problem.
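For reference, the comparison table above could be rebuilt programmatically. A minimal sketch, assuming the fitted models (knn, best_model, rf, lgbm) and the feature matrices from Sections 4-5 are still in memory; the `models` mapping below is illustrative and not part of the original notebook:
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import cross_val_score
import pandas as pd
# Each entry pairs a fitted model with the training/test matrices used for it earlier.
models = {
    'KNN': (knn, X_Normalized, X_test_normalized_df),
    'Decision Tree': (best_model, X, X_test),
    'Random Forest': (rf, X_normalized, X_test_normalized_df),
    'LightGBM': (lgbm, X_normalized, X_test_normalized_df),
}
rows = []
for name, (model, X_tr, X_te) in models.items():
    cv_mae = -cross_val_score(model, X_tr, y, cv=5, scoring='neg_mean_absolute_error').mean()
    preds = model.predict(X_te)
    rows.append({'Model': name,
                 'MAE CV': cv_mae,
                 'MAE Test': mean_absolute_error(y_test, preds),
                 'RMSE Test': root_mean_squared_error(y_test, preds)})
print(pd.DataFrame(rows).round(4))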