# FGNW seed-gene selection: PCA on the combined association-score table.
#
# The script loads a tab-separated score table, standardizes it, projects it
# onto the first principal component, reports the explained variance ratio,
# and plots the PCA reconstruction error against the number of components.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the score table; the first column is used as the row index.
Data = pd.read_csv(
    '/data/cyang12/FGNW/combined_one_ass_score.txt',
    sep='\t',
    engine='python',
    index_col=0,
)
# Alternative input that was previously used:
# Data = pd.read_csv('/data/cyang12/FGNW/4_features.csv', sep=',', index_col=0)

# Column-wise z-score, printed for inspection only; the PCA below works on
# the StandardScaler output instead. NOTE: pandas ``std()`` defaults to the
# sample standard deviation (ddof=1) while StandardScaler uses the population
# one (ddof=0), so the two standardizations differ by a constant factor.
normalized_data = (Data - Data.mean()) / Data.std()
print(normalized_data)

# The PCA steps refer to the dataset as ``df``. It was never assigned in the
# original script (NameError), so bind it to the table loaded above.
df = Data

# Standardize the dataset (zero mean, unit variance per column).
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)

# Perform PCA to reduce every row to a single component. The original note
# mentions a (20000, 1) result -- presumably the row count of the input
# table; confirm against the data.
pca = PCA(n_components=1)
reduced_df = pca.fit_transform(scaled_df)

# Wrap the projection in a DataFrame. The row labels of ``df`` are not
# carried over here; the result gets a default integer index.
reduced_df = pd.DataFrame(reduced_df, columns=['PC1'])

# Fraction of the total variance captured by the retained component.
explained_variance_ratio = pca.explained_variance_ratio_
print(f"Explained Variance Ratio: {explained_variance_ratio}")


def reconstruction_error(n_components, scaled_data):
    """Return the squared reconstruction error of an n-component PCA.

    A fresh PCA with ``n_components`` components is fitted on
    ``scaled_data``; the data is projected, mapped back with
    ``inverse_transform``, and the sum of squared element-wise differences
    between ``scaled_data`` and its reconstruction is returned.

    Args:
        n_components: Number of principal components to keep.
        scaled_data: Array-like of already standardized samples.

    Returns:
        The summed squared difference between input and reconstruction.
    """
    pca = PCA(n_components=n_components)
    transformed_data = pca.fit_transform(scaled_data)
    reconstructed_data = pca.inverse_transform(transformed_data)
    error = np.sum(np.square(scaled_data - reconstructed_data))
    return error


# Evaluate every component count from 1 up to min(n_rows, n_columns), the
# largest count the table's shape allows.
n_components_list = list(range(1, min(df.shape) + 1))
errors = [reconstruction_error(n, scaled_df) for n in n_components_list]

plt.plot(n_components_list, errors)
plt.xlabel('Number of Principal Components')
plt.ylabel('Reconstruction Error')
plt.title('PCA Reconstruction Error vs. Number of Principal Components')
plt.show()