I have performed a PCA analysis over my original dataset and from the compressed dataset transformed by the PCA I have also selected the number of PC I want to keep. Now I am struggling with the identification of the original features that are important in the reduced dataset and getting error
IndexError: list index out of range; the code is as below;
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import pandas as pd
dt = pd.read_excel('GENES.xlsx')
X = dt.iloc[:, 0:14808].values
y = dt.iloc[:, 14807:14808].values
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
X_val = sc_X.transform(X_val)
# Applying Selecting Features
from sklearn.decomposition import PCA
pca=PCA(n_components=13)
X_train = pca.fit_transform(X_train)
X_val = pca.transform(X_val)
X_test = pca.transform(X_test)
explained_variance = pca.explained_variance_ratio_
# number of components
n_pcs= pca.components_.shape[0]
# get the index of the most important feature on EACH component
# LIST COMPREHENSION HERE
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]
initial_feature_names = [ ' UBE2Q2P2 ' , ' SSX9 ' , ' CXORF67 ' , ' EFCAB8 ' , ' SDR16C6P ' ,
' EFCAB12 ' , ' A1BG ' , ' A1CF ' , ' RBFOX1 ' , ' GGACT ' , ' A2ML1 ' , ' A2M ' ,
' A4GALT ' , ' A4GNT ' , ' AAAS ' , ' AACSP1 ' , ' AACS ' , ' AADACL2 ' , ' AADACL3 ' , ' AADACL4 ' ,
' AADAC ' , ' AAGAB ' , ' AAK1 ' , ' AAMP ' , ' AANAT ' , ' AARS2 ' , ' AARSD1 ' , ' AARS ' , ' AASDHPPT ' ,
' AASDH ' , ' AASS ' , ' AATF ' , ' AATK ' , ' ABAT ' , ' ABCA10 ' , ' ABCA11P ' , ' ABCA12 ' , ' ABCA13 ' ,
' ABCA17P ' , ' ABCA1 ' , ' ABCA2 ' , ' ABCA3 ' , ' ABCA4 ' , ' ABCA5 ' , ' ABCA6 ' , ' ABCA7 ' ,
' ABCA8 ' , ' ABCA9 ' , ' ABCB10 ' , ' ABCB11 ' , ' ABCB1 ' , ' ABCB4 ' , ' ABCB6 ' , ' ABCB7 ' ,
' ABCB8 ' , ' ABCB9 ' , ' ABCC11 ' , ' ABCC12 ' , ' ABCC13 ' , ' ABCC1 ' , ' ABCC2 ' , ' ABCC3 ' , ' ABCC4 ' , ' ABCC5 ' , ' ABCC6P1 ' , ' ABCC6 ' , ' ABCC8 ' , ' ABCC9 ' , ' ABCD1 ' , ' ABCD2 ' , ' ABCD3 ' , ' ABCD4 ' , ' ABCE1 ' , ' ABCF1 ' , ' ABCF2 ' , ' ABCG1 ' , ' ABCG2 ' , ' ABCG4 ' , ' ABCG5 ' , ' ABCG8 ' , ' ABHD10 ' , ' ABHD11 ' , ' ABHD12B ' , ' ABHD12 ' , ' ABHD13 ' , ' ABHD14A ' , ' ABHD14B ' , ' ABHD15 ' , ' ABHD1 ' , ' ABHD2 ' , ' ABHD3 ' , ' ABHD4 ' , ' ABHD5 ' , ' ABHD6 ' , ' ABHD8 ' , ' ABI1 ' , ' ABI2 ' , ' ABI3BP ' , ' ABI3 ' , ' ABL1 ' , ' ABL2 ' , ' ABLIM1 ' , ' ABLIM2 ' , ' ABLIM3 ' , ' ABO ' , ' AOC1 ' , ' ABRA ' , ' ABR ' , ' ABT1 ' , ' ABTB1 ' , ' ABTB2 ' , ' ACAA1 ' , ' ACAA2 ' , ' ACACA ' , ' ACACB ' , ' ACAD10 ' , ' ACAD11 ' , ' ACAD8 ' , ' ACAD9 ' , ' ACADL ' , ' ACADM ' , ' ACADSB ' , ' ACADS ' , ' ACADVL ' , ' ACAN ' , ' ACAP1 ' , ' ACAP2 ' , ' ACAP3 ' , ' ACAT1 ' , ' ACAT2 ' , ' ACBD3 ' , ' ACBD4 ' , ' ACBD5 ' , ' ACBD6 ' , ' ACBD7 ' , ' ASIC2 ' , ' ASIC1 ' , ' ASIC3 ' , ' ASIC4 ' , ' ASIC5 ' , ' ACCSL ' , ' ACCS ' , ' ACD ' , ' ACE2 ' , ' ACER1 ' , ' ACER2 ' , ' ACER3 ' , ' ACE ' , ' ACHE ' , ' ACIN1 ' , ' ACLY ' , ' ACMSD ' , ' ACO1 ' , ' ACO2 ' , ' ACOT11 ' , ' ACOT12 ' , ' ACOT13 ' , ' ACOT1 ' , ' ACOT2 ' , ' ACOT4 ' , ' ACOT6 ' , ' ACOT7 ' , ' ACOT8 ' , ' ACOT9 ' , ' ACOX1 ' , ' ACOX2 ' , ' ACOX3 ' , ' ACOXL ' , ' ACP1 ' , ' ACP2 ' , ' ACP5 ' , ' ACP6 ' , ' PXYLP1 ' , ' ACPP ' , ' ACPT ' , ' ACRBP ' , ' ACRC ' , ' ACRV1 ' , ' ACR ' , ' ACSBG1 ' , ' ACSBG2 ' , ' ACSF2 ' , ' ACSF3 ' , ' ACSL1 ' , ' ACSL3 ' , ' ACSL4 ' , ' ACSL5 ' , ' ACSL6 ' , ' ACSM1 ' , ' ACSM2A ' , ' ACSM2B ' , ' ACSM3 ' , ' ACSM4 ' , ' ACSM5 ' , ' ACSS1 ' , ' ACSS2 ' , ' ACSS3 ' , ' ACTA1 ' , ' ACTA2 ' , ' ACTBL2 ' , ' ACTB ' , ' ACTC1 ' , ' ACTG1 ' , ' ACTG2 ' , ' ACTL6A ' ]
# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]
# LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i): most_important_names[14805] for i in range(n_pcs)}
# build the dataframe
df = pd.DataFrame(dic.items())