Entering edit mode
10 days ago
maryak
▴
20
I am trying to classify metastasis and non-metastasis samples using the following code, but after repeated tries I am getting precision and recall = 0. Can anyone help? I am using RNA-seq gene expression STAR count data from the Xena browser.
# Read only the sample ID and the AJCC staging columns from the clinical
# table, then promote the sample ID to the row index (removing it from the
# regular columns).
clinical_columns = [
    'sampleID',
    'ajcc_staging_system_edition.diagnoses',
    'ajcc_pathologic_t.diagnoses',
    'ajcc_pathologic_n.diagnoses',
    'ajcc_pathologic_m.diagnoses',
]
Clinical = pd.read_csv("/home/Maryam/BRTCGA_Clinical.csv", usecols=clinical_columns)
Clinical = Clinical.set_index("sampleID")
///
# Load the expression table and key its rows by the "sampleID" column.
RNAseq = pd.read_csv("/home/Maryam/BRCAStar.csv").set_index("sampleID")
print(RNAseq.shape)
# A row is discarded once its zero-or-missing cells reach 30% of the columns.
threshold = 0.3 * RNAseq.shape[1]
# A cell cannot be both zero and NaN, so counting cells that are either one
# gives the same per-row total as adding the two separate counts.
empty_cells = RNAseq.eq(0) | RNAseq.isna()
mask = empty_cells.sum(axis=1) >= threshold
# Keep only the rows that stay below the cutoff.
RNAseq = RNAseq.loc[~mask]
print(RNAseq.shape)
def calculate_z_scores(RNAseq):
    """
    Standardise every row of a DataFrame to Z-scores.

    Each value has its row mean subtracted and is then divided by its row
    standard deviation (pandas default, i.e. the sample standard deviation).

    Parameters:
    RNAseq (pd.DataFrame): Numeric input DataFrame; the caller is expected to
        pass genes as rows and samples as columns. Each row is scaled
        independently of the others.

    Returns:
    pd.DataFrame: Z-scores with the same shape, index and columns as the
        input. A row whose standard deviation is exactly 0 (all values equal)
        yields 0.0 for its non-missing cells instead of NaN.
    """
    row_mean = RNAseq.mean(axis=1)
    row_std = RNAseq.std(axis=1)
    # A constant row has std == 0, so the plain division would be 0 / 0 = NaN.
    # Its centred values are already 0, so dividing by 1 keeps them at 0.
    # A NaN std (e.g. a single-column row) is left untouched and still
    # propagates NaN.
    safe_std = row_std.where(row_std != 0, 1.0)
    # Subtract the mean and divide by the standard deviation (row-wise)
    z_scores = RNAseq.sub(row_mean, axis=0).div(safe_std, axis=0)
    return z_scores
# Calculate Z-scores for the RNA-seq data, then transpose so that the rows can
# be aligned with the clinical table on sample ID.
# NOTE(review): the scaling statistics are computed over every sample,
# including those that later end up in the test split - confirm that this is
# acceptable for the evaluation.
RNAseq = calculate_z_scores(RNAseq).transpose()

# Keep a single clinical record per sample ID (the first one encountered).
Clinical_copy = Clinical[~Clinical.index.duplicated(keep='first')].copy()

# Samples present in both tables. Index.intersection gives the same order on
# every run; the earlier list(set(...)) ordered string IDs by their hash,
# which changes between interpreter sessions.
common_sample_list = list(Clinical_copy.index.intersection(RNAseq.index))
combined_dff = pd.concat(
    [RNAseq.loc[common_sample_list, :], Clinical_copy.loc[common_sample_list, :]],
    axis=1,
)
df_cleaned = combined_dff.copy()

n_stage = df_cleaned['ajcc_pathologic_n.diagnoses']
m_stage = df_cleaned['ajcc_pathologic_m.diagnoses']
metastatic_m_codes = ['M1', 'M1a', 'M1b', 'M1c', 'M2', 'M2a', 'M2b', 'M2c', 'M3']

# Label 0: samples recorded as both N0 and M0.
# Label 1: samples whose M value is one of the codes listed above.
# Samples matching neither rule (e.g. other N values with M0) are left out.
primary = df_cleaned.loc[(n_stage == 'N0') & (m_stage == 'M0')].copy()
primary['labels'] = 0
metastasis = df_cleaned.loc[m_stage.isin(metastatic_m_codes)].copy()
metastasis['labels'] = 1

# Fail early with a readable message: a missing class makes any later
# two-class training or evaluation step meaningless.
if primary.empty or metastasis.empty:
    raise ValueError(
        f"Need samples in both classes, got {len(primary)} primary and "
        f"{len(metastasis)} metastasis samples"
    )

df = pd.concat([primary, metastasis], axis=0)
# Remaining missing values are replaced by 0 before the staging columns,
# which are no longer needed once the labels exist, are removed.
df = df.fillna(0)
df = df.drop(columns=[
    'ajcc_staging_system_edition.diagnoses',
    'ajcc_pathologic_t.diagnoses',
    'ajcc_pathologic_n.diagnoses',
    'ajcc_pathologic_m.diagnoses',
])
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score
# Shuffle the rows with a fixed seed; an unseeded shuffle would change the
# train/test membership on every run despite the seeded split below.
df = df.sample(frac=1, random_state=42)
# 'labels' is the target column; every other column is a feature.
X = df.drop(columns=['labels'])
y = df['labels']
# Split the data into train and test, keeping the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
# Check class distribution before resampling
print(f"Original distribution: {Counter(y_train)}")
# SMOTE interpolates between a minority sample and its nearest minority
# neighbours, so it needs at least k_neighbors + 1 minority samples. Cap
# k_neighbors (library default: 5) so a very small minority class does not
# abort the run.
minority_count = min(Counter(y_train).values())
if minority_count < 2:
    raise ValueError(
        f"SMOTE needs at least 2 minority samples in the training split, "
        f"got {minority_count}"
    )
smote = SMOTE(
    sampling_strategy='minority',
    random_state=42,
    k_neighbors=min(5, minority_count - 1),
)
# Oversample the training split only; the test split keeps its real ratio.
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
# Check class distribution after resampling
print(f"Resampled distribution: {Counter(y_resampled)}")
# Convert to XGBoost DMatrix format
dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)
# Train XGBoost model with logistic regression objective
param = {"objective": "binary:logistic"}  # Returns probability outputs
model = xgb.train(param, dtrain, num_boost_round=100)
# Get probability predictions
probs = model.predict(dtest)  # Probabilities between 0 and 1
# Convert probabilities to binary predictions using a threshold of 0.5
threshold = 0.5
binary_preds = (probs > threshold).astype(int)
# Calculate evaluation metrics. zero_division=0 makes the "no positive
# predictions / no positive samples" case an explicit 0 instead of a warning.
accuracy = accuracy_score(y_test, binary_preds)
precision = precision_score(y_test, binary_preds, zero_division=0)
recall = recall_score(y_test, binary_preds, zero_division=0)
auc = roc_auc_score(y_test, probs)  # Use probabilities for AUC calculation
# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUC: {auc:.4f}")
print(f"test distribution: {Counter(y_test)}")
# If class 1 is absent here, precision and recall are 0 because the model
# never crosses the 0.5 threshold, not because of a metric bug.
print(f"Predicted distribution: {Counter(binary_preds.tolist())}")