Я новичок в питоне и мне нужна помощь Вот мой код
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from keras.datasets import mnist
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# import some data to play with
iris = load_iris()
mnist = mnist.load_data()
data = iris.data
print(f'number of elements: {np.count_nonzero(data)}')
print(data)
#1.5/ Make 5% values in IRIS to nan
nan_percentage = 5
nan_data = np.where(np.random.random(data.shape) < (nan_percentage/100), np.nan,data)
print(nan_data)
print(f'number of NaN values: {np.count_nonzero(np.isnan(nan_data))}')
#1.6 Preproces missing data (i.e. nan) by using all the methods in the lecture
imputers = [SimpleImputer(strategy='mean'), SimpleImputer(strategy='median'),SimpleImputer(strategy='most_frequent'), SimpleImputer(strategy='constant')]
scalers = [StandardScaler(), MinMaxScaler()]
for imputer in imputers:
for scaler in scalers:
X = imputer.fit_transform(nan_data)
X = scaler.fit_transform(X)
y = iris.target
#1.7/ For each preprocessing method, use a classification model (e.g., naive,Bayes)
# and evaluate the accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Imputer: {imputer.strategy}, Scaler: {type(scaler).__name__}, Accuracy:{accuracy:.2f}")
# 1.8+ 1.9+ 1.10. Repeat step 1.6 and 1.7 with 10%,15%, 20% nan values.
nan_percentages = [10, 15, 20]
accuracies = []
for nan_percentage in nan_percentages:
nan_data = np.where(np.random.random(data.shape) < nan_percentage/100, np.nan, data)
for imputer in imputers:
for scaler in scalers:
X = imputer.fit_transform(nan_data)
X = scaler.fit_transform(X)
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracies.append(accuracy)
print(f"NaN Percentage: {nan_percentage}%, Imputer: {imputer.strategy}, Scaler: {type(scaler).__name__}, Accuracy: {accuracy:.2f}")
# Generate the line plot of accuracy for each combination of imputer and scaler,across different NaN percentages
fig, ax = plt.subplots(figsize=(8, 6))
colors = ['r', 'b', 'g', 'k']
for i, imputer in enumerate(imputers):
for j, scaler in enumerate(scalers):
idx = i * len(scalers) + j
label = f"{imputer.strategy}-{type(scaler).__name__}"
ax.plot(nan_percentages, accuracies[idx::len(imputers)*len(scalers)], color=colors[j], linestyle='-', label=label)
ax.legend()
ax.set_xlabel('NaN Percentage')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy by Imputer and Scaler for Different NaN Percentages')
plt.show()
Весь код от begin до 1.7 верен, потому что я трижды проверял его. Вот полный след:
File "C:\Users\nhatd\PycharmProjects\pythonProject1\main.py", line 75, in <module>
ax.plot(nan_percentages, accuracies[idx::len(imputers)*len(scalers)], color=colors[j], linestyle='-', label=label)
File "C:\Users\nhatd\PycharmProjects\pythonProject1\venv\Lib\site-packages\matplotlib\axes\_axes.py", line 1688, in plot
lines = [*self._get_lines(*args, data=data, **kwargs)]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\nhatd\PycharmProjects\pythonProject1\venv\Lib\site-packages\matplotlib\axes\_base.py", line 311, in __call__
yield from self._plot_args(
^^^^^^^^^^^^^^^^
File "C:\Users\nhatd\PycharmProjects\pythonProject1\venv\Lib\site-packages\matplotlib\axes\_base.py", line 504, in _plot_args
raise ValueError(f"x and y must have same first dimension, but "
ValueError: x and y must have same first dimension, but have shapes (3,) and (1,)
Я пытаюсь использовать np.unique() для X и y, но это не сработало. Может мне кто-нибудь помочь :( Я пытаюсь исправить диапазон X и Y и прочитал много сообщений, но это не сработало :(
# 1.8+ 1.9+ 1.10. Repeat step 1.6 and 1.7 with 10%,15%, 20% nan values.
nan_percentages = [10, 15, 20]
accuracies = []
stats = []
for nan_percentage in nan_percentages:
nan_data = np.where(np.random.random(data.shape) < nan_percentage/100, np.nan, data)
for imputer in imputers:
for scaler in scalers:
X = imputer.fit_transform(nan_data)
X = scaler.fit_transform(X)
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3, random_state=42)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracies.append(accuracy)
stats.append({'NaN Percentage': nan_percentage,
'Imputer': imputer.strategy,
'Scaler': type(scaler).__name__,
'Accuracy': accuracy})
print(f"NaN Percentage: {nan_percentage}%, Imputer: {imputer.strategy}, Scaler: {type(scaler).__name__}, Accuracy: {accuracy:.2f}")
Выход:
NaN Percentage: 10%, Imputer: mean, Scaler: StandardScaler, Accuracy: 0.91
NaN Percentage: 10%, Imputer: mean, Scaler: MinMaxScaler, Accuracy: 0.91
NaN Percentage: 10%, Imputer: median, Scaler: StandardScaler, Accuracy: 0.93
NaN Percentage: 10%, Imputer: median, Scaler: MinMaxScaler, Accuracy: 0.93
NaN Percentage: 10%, Imputer: most_frequent, Scaler: StandardScaler, Accuracy: 0.84
NaN Percentage: 10%, Imputer: most_frequent, Scaler: MinMaxScaler, Accuracy: 0.84
NaN Percentage: 10%, Imputer: constant, Scaler: StandardScaler, Accuracy: 0.87
NaN Percentage: 10%, Imputer: constant, Scaler: MinMaxScaler, Accuracy: 0.87
NaN Percentage: 15%, Imputer: mean, Scaler: StandardScaler, Accuracy: 0.96
NaN Percentage: 15%, Imputer: mean, Scaler: MinMaxScaler, Accuracy: 0.96
NaN Percentage: 15%, Imputer: median, Scaler: StandardScaler, Accuracy: 0.98
NaN Percentage: 15%, Imputer: median, Scaler: MinMaxScaler, Accuracy: 0.98
NaN Percentage: 15%, Imputer: most_frequent, Scaler: StandardScaler, Accuracy: 0.89
NaN Percentage: 15%, Imputer: most_frequent, Scaler: MinMaxScaler, Accuracy: 0.89
NaN Percentage: 15%, Imputer: constant, Scaler: StandardScaler, Accuracy: 0.80
NaN Percentage: 15%, Imputer: constant, Scaler: MinMaxScaler, Accuracy: 0.80
NaN Percentage: 20%, Imputer: mean, Scaler: StandardScaler, Accuracy: 0.91
NaN Percentage: 20%, Imputer: mean, Scaler: MinMaxScaler, Accuracy: 0.91
NaN Percentage: 20%, Imputer: median, Scaler: StandardScaler, Accuracy: 0.93
NaN Percentage: 20%, Imputer: median, Scaler: MinMaxScaler, Accuracy: 0.93
NaN Percentage: 20%, Imputer: most_frequent, Scaler: StandardScaler, Accuracy: 0.87
NaN Percentage: 20%, Imputer: most_frequent, Scaler: MinMaxScaler, Accuracy: 0.87
NaN Percentage: 20%, Imputer: constant, Scaler: StandardScaler, Accuracy: 0.84
NaN Percentage: 20%, Imputer: constant, Scaler: MinMaxScaler, Accuracy: 0.84
# pip install seaborn
import seaborn as sns
stats = pd.DataFrame(stats)
stats['Group'] = stats['Imputer'] + '/' + stats['Scaler']
fig, ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=stats, x='NaN Percentage', y='Accuracy', hue='Group', ax=ax)
sns.move_legend(ax, 'upper left', bbox_to_anchor=(1, 1))
ax.set_title('Accuracy by Imputer and Scaler for Different NaN Percentages')
plt.tight_layout()
plt.show()
Выход:
У тебя другая проблема. Вы тестируете свою стратегию только для nan_percentage=20. Проверьте отступ вашего кода. ( 1,8+ 1,9+ 1,10)