Estimate USDJPY one day ahead from nearly 20 years of daily data by using scikit-learn
# Import libraries
import pandas as pd
import numpy as np
import talib as ta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
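The DataFrame df0 below is assumed to already hold one daily USD/JPY CSV export (the original does not show the path). A minimal loading sketch with a hypothetical filename:

# Hypothetical file name -- replace with your own daily USD/JPY export
df0 = pd.read_csv("usdjpy_daily_2003_2022.csv")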
# Check the shape
print(df0.shape)

# Check summary statistics
summary = df0.describe()
print(summary)

# Display information about the DataFrame
df0.info()
df0["Date"] = pd.to_datetime(df0["Date"])
# Get the oldest and latest dates
oldest_date = df0["Date"].min()
latest_date = df0["Date"].max()
print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)df0["Date"] = pd.to_datetime(df0["Date"])
# Get the oldest and latest dates
oldest_date = df0["Date"].min()
latest_date = df0["Date"].max()
print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)Oldest Date: 2003-09-15 00:00:00
Latest Date: 2022-11-11 00:00:00
# Load and clean the data for df1
df1 = pd.read_csv("")
df1["Date"] = pd.to_datetime(df1["Date"], format="%m/%d/%Y") # Specify the correct date format
# Concatenate DataFrames
df = pd.concat([df0, df1])
# Calculate oldest and latest dates
oldest_date = df["Date"].min()
latest_date = df["Date"].max()
print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)Oldest Date: 2003-09-15 00:00:00
Latest Date: 2022-11-11 00:00:00
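Note that pd.concat keeps each input's original row labels, which is why df.info() below reports the non-unique "Index: 5218 entries, 0 to 217" and why the index has to be reset before features and labels are joined later. The rows can instead be renumbered at concatenation time (a variation, not in the original):

df = pd.concat([df0, df1], ignore_index=True)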
print(df.shape)
(5218, 7)
df.info()
<class ‘pandas.core.frame.DataFrame’>
Index: 5218 entries, 0 to 217
Data columns (total 7 columns):
# Column Non-Null Count Dtype — —— ————– —–
0 Date 5218 non-null datetime64[ns]
1 Price 5218 non-null float64
2 Open 5218 non-null float64
3 High 5218 non-null float64
4 Low 5218 non-null float64
5 Vol. 0 non-null float64
6 Change % 5218 non-null object
dtypes: datetime64[ns](1), float64(5), object(1) memory usage: 326.1+ KB
# Drop the "Vol." column from df
df = df.drop("Vol.", axis=1)
# Check the shape of df after dropping the column
print("Shape of df after dropping 'Vol.' column:", df.shape)Shape of df after dropping ‘Vol.’ column: (5218, 6)
# Identify columns with all null values
null_columns = df.columns[df.isnull().all()]
# Drop columns with all null values
df = df.drop(columns=null_columns)
# Check the shape of df after dropping null columns
print("Shape of df after dropping null columns:", df.shape)Shape of df after dropping null columns: (5218, 6)
# print the unique column names in df
print("Unique column names in df:", df.columns.tolist())Unique column names in df: [‘Date’, ‘Price’, ‘Open’, ‘High’, ‘Low’, ‘Change %’]
# All subsequent calculations use the closing rate
close = np.array(df["Price"])

# Create an empty DataFrame to hold the features
df_feature = pd.DataFrame(
    index=range(len(df)),
    columns=["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"],
)

# Calculate the technical indicators (the features used in this training) with talib
# For the simple moving averages, use the ratio of the SMA to the day's closing price as the feature
df_feature["SMA5/current"] = ta.SMA(close, timeperiod=5) / close
df_feature["SMA20/current"] = ta.SMA(close, timeperiod=20) / close
# RSI
df_feature["RSI"] = ta.RSI(close, timeperiod=14)
# MACD
df_feature["MACD"], _, _ = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)
# Bollinger Bands (nbdevup/nbdevdn set to 2 so the bands match the ±2σ column names)
upper, middle, lower = ta.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2)
df_feature["BBANDS+2σ"] = upper / close
df_feature["BBANDS-2σ"] = lower / close
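As a quick sanity check (not in the original), the SMA and band features can be reproduced with plain pandas rolling windows, which is also handy if TA-Lib is not installed:

# Cross-check the SMA/Bollinger features with plain pandas (sketch; assumes df and df_feature as above)
price = df["Price"].reset_index(drop=True)
sma20 = price.rolling(20).mean()
std20 = price.rolling(20).std(ddof=0)  # TA-Lib's BBANDS uses the population std
print("max SMA20 diff:", ((sma20 / price) - df_feature["SMA20/current"]).abs().max())
print("max +2σ band diff:", (((sma20 + 2 * std20) / price) - df_feature["BBANDS+2σ"]).abs().max())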
df["the day before_float"] = df["Change %"].apply(lambda x: float(x.replace("%", "")))
# Classify the daily % change. The thresholds are chosen so that the three
# classes ('down', 'even', 'up') end up with roughly equal sample counts.
def classify(x):
    if x <= -0.2:
        return 'down'
    elif -0.2 < x < 0.2:
        return 'even'
    else:  # 0.2 <= x
        return 'up'

df["the day before_classified"] = df["the day before_float"].apply(classify)
# Shift the classified labels by one row so that each day's features are paired with the next day's change
df_y = df["the day before_classified"].shift()# Reset the index of df0_y to ensure uniqueness df_y = df_y.reset_index(drop=True) # Concatenate df0_feature and df0_y with reset index df_xy = pd.concat([df_feature, df_y], axis=1) df_xy = df_xy.dropna(how="any")
# print("Shape of df_xy:", df_xy.shape)
print("Number of samples in df_xy:", len(df_xy))
X_train, X_test, Y_train, Y_test = train_test_split(
df_xy[["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"]],
df_xy["the day before_classified"],
train_size=0.8,
random_state=42
)
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)Shape of df_xy: (5185, 7)
Number of samples in df_xy: 5185 Shape of X_train: (4148, 6)
Shape of X_test: (1037, 6)
Shape of Y_train: (4148,)
Shape of Y_test: (1037,)
# Re-split without a fixed random_state (used for the Optuna runs below)
X_train, X_test, Y_train, Y_test = train_test_split(
    df_xy[["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"]],
    df_xy["the day before_classified"],
    train_size=0.8,
)
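One caveat worth flagging: train_test_split shuffles rows by default, so some training samples postdate the test samples, which is optimistic for time-series data. A leakage-free variant (my addition, assuming the rows are in chronological order):

# Chronological split: shuffle=False keeps the row order, so the test set is the final 20% of rows
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(
    df_xy[["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"]],
    df_xy["the day before_classified"],
    train_size=0.8,
    shuffle=False,
)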
def objective(trial):
    # suggest_int with a step replaces the deprecated suggest_discrete_uniform
    min_samples_split = trial.suggest_int("min_samples_split", 2, 16)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 4, 64, step=4)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    clf = RandomForestClassifier(random_state=1, n_estimators=n_estimators,
                                 max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
                                 max_features=None, criterion=criterion,
                                 min_samples_split=min_samples_split)
    clf.fit(X_train, Y_train)
    # Optuna minimizes by default, so return the error rate (1 - accuracy)
    return 1 - accuracy_score(Y_test, clf.predict(X_test))
study = optuna.create_study()
study.optimize(objective, n_trials=100)
print(1-study.best_value)
print(study.best_params)
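The best parameters found by the study are not reused below (the next model falls back to scikit-learn defaults); a short sketch, my addition, of plugging them back in:

# Retrain using the hyperparameters Optuna found; the keys in best_params match the constructor arguments
best_model = RandomForestClassifier(random_state=1, max_features=None, **study.best_params)
best_model.fit(X_train, Y_train)
print("Tuned accuracy:", accuracy_score(Y_test, best_model.predict(X_test)))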
# Create and train a baseline RandomForestClassifier (default hyperparameters)
model = RandomForestClassifier()
model.fit(X_train, Y_train)
# Predict the target values for the test data
Y_pred = model.predict(X_test)
# Calculate the accuracy, precision, recall
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted') # You can also use 'macro' or 'micro'
recall = recall_score(Y_test, Y_pred, average='weighted') # You can also use 'macro' or 'micro'
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)Accuracy: 0.5660559305689489
Precision: 0.5637978876125685
Recall: 0.5660559305689489
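Since confusion_matrix is imported at the top but never used, here is a small addition showing the per-class breakdown behind the aggregate scores:

# Rows = actual class, columns = predicted class
labels = ["down", "even", "up"]
cm = confusion_matrix(Y_test, Y_pred, labels=labels)
print(pd.DataFrame(cm, index=labels, columns=labels))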
# Create and train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, Y_train)
# Get feature importances
feature_importances = model.feature_importances_
# Create a DataFrame to associate features with their importances
importance_df = pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False)
# Print or visualize the feature importances
print(importance_df)
plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df)
plt.title("Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

         Feature  Importance
0   SMA5/current    0.318452
2            RSI    0.142284
1  SMA20/current    0.138512
4      BBANDS+2σ    0.135807
5      BBANDS-2σ    0.135339
3           MACD    0.129607
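Finally, to actually estimate the next day's USDJPY move as the title promises, the trained model can be applied to the most recent day's features. A minimal sketch, assuming df (and therefore df_feature) is sorted oldest-first; sort by Date before computing the indicators if it is not:

# Take the latest row whose indicators are all available
# (talib leaves NaNs in the warm-up window at the start of the series)
latest_features = df_feature.dropna().iloc[[-1]]
print("Predicted next-day move:", model.predict(latest_features)[0])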


