Three-value classification (USDJPY) – Preparation, Learning, Importances


Estimate the next day's USDJPY movement with scikit-learn, using roughly 19 years of daily data (2003–2022)

# import libraries
import pandas as pd
import numpy as np
import talib as ta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
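The article never shows how df0 is loaded. A minimal sketch, assuming an investing.com-style daily CSV export (the file name below is hypothetical; the real path is omitted in the article):

# Hypothetical loading step; the actual file path is not given in the article.
# Expected columns: Date, Price, Open, High, Low, Vol., Change %
df0 = pd.read_csv("usdjpy_daily.csv")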
# Check the shape
print(df0.shape)
# Check the data
summary = df0.describe()
print(summary)
# Display information about the DataFrame
df0.info()
df0["Date"] = pd.to_datetime(df0["Date"])

# Get the oldest and latest dates
oldest_date = df0["Date"].min()
latest_date = df0["Date"].max()

print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)
df0["Date"] = pd.to_datetime(df0["Date"])

# Get the oldest and latest dates
oldest_date = df0["Date"].min()
latest_date = df0["Date"].max()

print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)

Oldest Date: 2003-09-15 00:00:00
Latest Date: 2022-11-11 00:00:00

# Load and clean the data for df1
df1 = pd.read_csv("")
df1["Date"] = pd.to_datetime(df1["Date"], format="%m/%d/%Y")  # Specify the correct date format

# Concatenate DataFrames
df = pd.concat([df0, df1])

# Calculate oldest and latest dates
oldest_date = df["Date"].min()
latest_date = df["Date"].max()

print("Oldest Date:", oldest_date)
print("Latest Date:", latest_date)

Oldest Date: 2003-09-15 00:00:00
Latest Date: 2022-11-11 00:00:00

print(df.shape)

(5218, 7)
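Since df0 and df1 come from separate exports, a quick sanity check (not in the original) confirms the concatenation introduced no duplicate trading days:

# Sanity check: no overlapping dates between the two CSVs
assert not df["Date"].duplicated().any(), "duplicate dates after concat"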

df.info()

<class ‘pandas.core.frame.DataFrame’>
Index: 5218 entries, 0 to 217
Data columns (total 7 columns):
# Column Non-Null Count Dtype — —— ————– —–
0 Date 5218 non-null datetime64[ns]
1 Price 5218 non-null float64
2 Open 5218 non-null float64
3 High 5218 non-null float64
4 Low 5218 non-null float64
5 Vol. 0 non-null float64
6 Change % 5218 non-null object
dtypes: datetime64[ns](1), float64(5), object(1) memory usage: 326.1+ KB

# Drop the "Vol." column from df
df = df.drop("Vol.", axis=1)

# Check the shape of df after dropping the column
print("Shape of df after dropping 'Vol.' column:", df.shape)

Shape of df after dropping 'Vol.' column: (5218, 6)

# Identify columns with all null values
null_columns = df.columns[df.isnull().all()]

# Drop columns with all null values
df = df.drop(columns=null_columns)

# Check the shape of df after dropping null columns
print("Shape of df after dropping null columns:", df.shape)

Shape of df after dropping null columns: (5218, 6)

The shape is unchanged because the only all-null column, "Vol.", was already dropped in the previous step.

# print the unique column names in df
print("Unique column names in df:", df.columns.tolist())

Unique column names in df: ['Date', 'Price', 'Open', 'High', 'Low', 'Change %']

# All subsequent calculations use the closing price
close = np.array(df["Price"])

# Create an empty DataFrame to hold the features
df_feature = pd.DataFrame(index=range(len(df)),
                          columns=["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"])

# Calculate the technical indicators (the features used for training) with TA-Lib and store them in df_feature

# For the moving averages, use the ratio of the SMA to the day's closing price as the feature
df_feature["SMA5/current"] = ta.SMA(close, timeperiod=5) / close
df_feature["SMA20/current"] = ta.SMA(close, timeperiod=20) / close

# RSI
df_feature["RSI"] = ta.RSI(close, timeperiod=14)

# MACD (keep the MACD line; discard the signal and histogram outputs)
df_feature["MACD"], _, _ = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)

# Bollinger Bands (±2σ): use the ratio of each band to the closing price
upper, middle, lower = ta.BBANDS(close, timeperiod=20, nbdevup=2, nbdevdn=2)
df_feature["BBANDS+2σ"] = upper / close
df_feature["BBANDS-2σ"] = lower / close
df["the day before_float"] = df["Change %"].apply(lambda x: float(x.replace("%", "")))

# Categorize the daily change (%). The ±0.2 thresholds are chosen so that the three classes have roughly equal sample counts
def classify(x):
    if x <= -0.2:
        return 'down'
    elif -0.2 < x < 0.2:
        return 'even'
    elif 0.2 <= x:
        return 'up'


df["the day before_classified"] = df["the day before_float"].apply(lambda x: classify(x))

# Shift the labels by one row so that each day's features are paired with the next day's change
df_y = df["the day before_classified"].shift()
# Reset the index of df_y to ensure uniqueness
df_y = df_y.reset_index(drop=True)

# Concatenate df_feature and df_y with the reset index
df_xy = pd.concat([df_feature, df_y], axis=1)
df_xy = df_xy.dropna(how="any")   # drop the indicator warm-up rows
print("Shape of df_xy:", df_xy.shape)
print("Number of samples in df_xy:", len(df_xy))

X_train, X_test, Y_train, Y_test = train_test_split(
    df_xy[["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"]],
    df_xy["the day before_classified"],
    train_size=0.8,
    random_state=42
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)

Shape of df_xy: (5185, 7)
Number of samples in df_xy: 5185
Shape of X_train: (4148, 6)
Shape of X_test: (1037, 6)
Shape of Y_train: (4148,)
Shape of Y_test: (1037,)
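Note that train_test_split shuffles by default, so the test set mixes past and future days. For a stricter time-series evaluation one could hold out the last 20% of rows instead (a sketch of an alternative, not the author's method; whether those rows are the most recent days depends on the CSV's sort order):

# Hypothetical chronological split: last 20% of rows held out for testing
features = ["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"]
cut = int(len(df_xy) * 0.8)
X_tr, X_te = df_xy[features].iloc[:cut], df_xy[features].iloc[cut:]
Y_tr, Y_te = df_xy["the day before_classified"].iloc[:cut], df_xy["the day before_classified"].iloc[cut:]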

# Re-create the split before hyperparameter tuning (with a fixed seed for reproducibility)
X_train, X_test, Y_train, Y_test = train_test_split(
    df_xy[["SMA5/current", "SMA20/current", "RSI", "MACD", "BBANDS+2σ", "BBANDS-2σ"]],
    df_xy["the day before_classified"],
    train_size=0.8,
    random_state=42
)

def objective(trial):
    # Hyperparameter search space for the random forest
    min_samples_split = trial.suggest_int("min_samples_split", 2, 16)
    max_leaf_nodes = trial.suggest_int("max_leaf_nodes", 4, 64, step=4)
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    n_estimators = trial.suggest_int("n_estimators", 50, 500, step=50)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    clf = RandomForestClassifier(random_state=1, n_estimators=n_estimators,
                                 max_leaf_nodes=max_leaf_nodes, max_depth=max_depth,
                                 max_features=None, criterion=criterion,
                                 min_samples_split=min_samples_split)
    clf.fit(X_train, Y_train)
    # Optuna minimizes the objective, so return the test-set error rate
    return 1 - accuracy_score(Y_test, clf.predict(X_test))

study = optuna.create_study()
study.optimize(objective, n_trials=100)

print(1 - study.best_value)   # best accuracy found during the search
print(study.best_params)
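The tuned parameters are not reused below; the final model is trained with scikit-learn defaults. A minimal sketch for reusing them (assuming the integer search space above, so every value in study.best_params is valid as-is):

# Hypothetical: rebuild and train a forest from the tuned parameters
best_model = RandomForestClassifier(random_state=1, max_features=None, **study.best_params)
best_model.fit(X_train, Y_train)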
# Create and train a RandomForestClassifier with default settings
model = RandomForestClassifier()
model.fit(X_train, Y_train)

# Predict the target values for the test data
Y_pred = model.predict(X_test)

# Calculate the accuracy, precision, recall
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='weighted')  # You can also use 'macro' or 'micro'
recall = recall_score(Y_test, Y_pred, average='weighted')  # You can also use 'macro' or 'micro'

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.5660559305689489
Precision: 0.5637978876125685
Recall: 0.5660559305689489
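confusion_matrix is imported at the top but never used; a natural follow-up (a sketch, not part of the original output) is to inspect the per-class errors:

# Per-class breakdown of the predictions (rows: true class, columns: predicted)
labels = ["down", "even", "up"]
cm = confusion_matrix(Y_test, Y_pred, labels=labels)
print(pd.DataFrame(cm, index=labels, columns=labels))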

# Create and train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train, Y_train)

# Get feature importances
feature_importances = model.feature_importances_

# Create a DataFrame to associate features with their importances
importance_df = pd.DataFrame({"Feature": X_train.columns, "Importance": feature_importances})
importance_df = importance_df.sort_values(by="Importance", ascending=False)

# Print or visualize the feature importances
print(importance_df)

plt.figure(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df)
plt.title("Feature Importances")
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()

         Feature  Importance
0   SMA5/current    0.318452
2            RSI    0.142284
1  SMA20/current    0.138512
4      BBANDS+2σ    0.135807
5      BBANDS-2σ    0.135339
3           MACD    0.129607
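Impurity-based importances from a random forest can be biased toward features with many distinct values; a hedged cross-check with scikit-learn's permutation importance (not in the original):

from sklearn.inspection import permutation_importance

# Permutation importance measured on the held-out test set
result = permutation_importance(model, X_test, Y_test, n_repeats=10, random_state=1)
print(pd.Series(result.importances_mean, index=X_test.columns).sort_values(ascending=False))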

Xiofx

An experienced machine learning and deep learning professional and logistics-improvement entrepreneur in Tokyo, Japan, with an interest in economies around the world. She loves to travel.
