#===========================Importing packages=================================
import yfinance as yf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_ta as ta
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
import math

#=========================Data collection======================================
# Fetch historical stock data for NVIDIA, from yahoo finance
symbol = 'NVDA'
start_date = '2021-12-01'
end_date = '2023-12-01'

# auto_adjust=False is passed explicitly so that the 'Adj Close' column used
# throughout this script is present regardless of the library's default.
# NOTE(review): newer yfinance releases appear to default to auto_adjust=True
# (which drops 'Adj Close') -- confirm against the installed version.
stock_data = yf.download(symbol, start=start_date, end=end_date,
                         auto_adjust=False)

# Some yfinance versions return a (field, ticker) MultiIndex header even for a
# single symbol; flatten it so stock_data['Adj Close'] is a plain Series.
if isinstance(stock_data.columns, pd.MultiIndex):
    stock_data.columns = stock_data.columns.get_level_values(0)

#========================Exploratory data analysis=============================
# Display the first few rows of the dataset
print("Head of the dataset:")
print(stock_data.head())

# Summary statistics
print("\nSummary statistics:")
print(stock_data.describe())

# Check for missing values
print("\nMissing values:")
print(stock_data.isnull().sum())

# Visualize the distribution of 'Adj Close' prices
plt.figure(figsize=(12, 6))
sns.histplot(stock_data['Adj Close'], bins=50, kde=True)
plt.title('Distribution of Adj Close Prices')
plt.xlabel('Adj Close Price')
plt.ylabel('Frequency')
plt.show()

# Visualize the adj closing prices over time
plt.figure(figsize=(14, 6))
plt.plot(stock_data.index, stock_data['Adj Close'],
         label='Adj Close Price', color='blue')
plt.title('Adj Closing Prices Over Time')
plt.xlabel('Date')
plt.ylabel('Adj Close Price')
plt.legend()
plt.show()

# Visualize the daily returns
plt.figure(figsize=(14, 6))
plt.plot(stock_data.index, stock_data['Adj Close'].pct_change(),
         label='Daily Returns', color='green')
plt.title('Daily Returns Over Time')
plt.xlabel('Date')
plt.ylabel('Daily Returns')
plt.legend()
plt.show()

#==========================Create features & target============================
stock_data['SMA_50'] = stock_data['Adj Close'].rolling(window=50).mean()
stock_data['SMA_200'] = stock_data['Adj Close'].rolling(window=200).mean()
stock_data['Daily_Return'] = stock_data['Adj Close'].pct_change()
# append=True additionally stores each indicator on stock_data under
# pandas_ta's own default column name (kept from the original script).
stock_data['RSI'] = stock_data.ta.rsi(close='Adj Close', length=14, append=True)
stock_data['EMA'] = stock_data.ta.ema(close='Adj Close', length=9, append=True)

# Every indicator above is computed from the SAME day's 'Adj Close', which is
# the target. Lag them by one trading day so a row only contains information
# that was available before that day's close. 'Open' is left as-is because the
# opening price is known before the close.
indicator_columns = ['SMA_50', 'SMA_200', 'Daily_Return', 'RSI', 'EMA']
stock_data[indicator_columns] = stock_data[indicator_columns].shift(1)

# Drops the warm-up rows where the rolling/lagged indicators are undefined.
stock_data = stock_data.dropna()

# Define features and target variable
features = ['Open', 'RSI']
target = 'Adj Close'

# Extract features and target
X = stock_data[features]
y = stock_data[target]

# Display the first few rows of the dataset with features and target
print(stock_data[features + [target]].head())

#========================Creating X and y======================================
# Chronological split: the last 20% of rows form the test set and everything
# before them forms the training set.
#
# The original code drew a shuffled train/test split and then overwrote only
# the test set with the tail of the data. The shuffled training set therefore
# still contained most of the tail rows, so the model was scored on rows it
# had been trained on -- the main reason R-squared came out as ~1.
# shuffle=False with an integer test_size keeps the two sets disjoint and
# ordered in time.
n_test = math.floor(0.2 * len(stock_data))
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=n_test, shuffle=False
)
# We will use these for all three separate ML methods

#=============================Random Forest====================================
# Define the hyperparameter grid, for hyperparameter tuning
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [2, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Create a Random Forest Regressor
rf_model = RandomForestRegressor(random_state=1)

# Initialize GridSearchCV. TimeSeriesSplit makes every validation fold come
# strictly after the rows it was trained on; a plain integer cv would let
# folds train on later dates and validate on earlier ones.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Perform grid search to find the best hyperparameters
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
rf_best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so the tuned model can be taken from it directly
# instead of being fitted a second time.
best_rf_model = grid_search.best_estimator_

# Predict the stock prices on the test set using the tuned model
rf_predictions = best_rf_model.predict(X_test)

# Evaluate the model using regression metrics
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
rf_mape = np.mean(np.abs((y_test - rf_predictions) / y_test)) * 100
rf_rmse = np.sqrt(rf_mse)

print(f"Mean Absolute Error (MAE): {rf_mae:.2f}")
print(f"Mean Squared Error (MSE): {rf_mse:.2f}")
print(f"R-squared (R2): {rf_r2:.2f}")
print(f"Root Mean Squared Error (RMSE): {rf_rmse:.2f}")
print(f"Mean Absolute Percentage Error (MAPE): {rf_mape:.2f}%")

for param, value in rf_best_params.items():
    print(f"{param}: {value}")

# Plot the actual vs predicted prices
plt.figure(figsize=(12, 6))
plt.plot(stock_data.index[-len(y_test):], y_test,
         label='Actual Prices', color='blue')
plt.plot(stock_data.index[-len(y_test):], rf_predictions,
         label='Predicted Prices', color='red')
plt.title(f'{symbol} Stock Price Prediction using Random Forest')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.legend()
plt.show()

# Get feature importances from the Random Forest model
rf_feature_importances = best_rf_model.feature_importances_

# Create a DataFrame to store feature names and their importances
rf_feature_importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': rf_feature_importances}
)

# Sort the DataFrame by importance in descending order
rf_feature_importance_df = rf_feature_importance_df.sort_values(
    by='Importance', ascending=False
)

# Plot the feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=rf_feature_importance_df,
            palette='viridis')
plt.title('Random Forest - Feature Importance')
plt.show()

#==============================Question========================================
# I have some trouble running this code. Whenever I run the code with
# different features (e.g. 'Open', 'RSI'), the model always returns an
# R-squared of 1, which would likely mean we are overfitting the model.
# How can we solve this, or is this normal with stock price prediction?
#
# Answer (short): in the original version the R-squared of ~1 was mainly data
# leakage rather than ordinary overfitting -- the test rows (the last 20%)
# were also inside the shuffled training set, and RSI was computed from the
# very close price being predicted. The chronological split, TimeSeriesSplit
# cross-validation and one-day lag on the indicators above address this.
# NOTE(review): even without leakage, R-squared on price LEVELS will likely
# stay high because the same day's open is usually close to the close;
# predicting returns instead of levels would be a stricter evaluation.

