Classification of samples into metastasis and non-metastasis using gene expression RNA-seq
0
0
Entering edit mode
10 days ago
maryak ▴ 20

I am trying to classify metastasis and non-metastasis samples using the following code, but after repeated tries I am getting precision and recall = 0. Can anyone help? I am using RNA-seq gene expression STAR count data from the Xena browser.

# Read only the sample identifier and the AJCC staging columns from the
# clinical table, then use the sample identifier as the row index.
clinical_columns = [
    'sampleID',
    'ajcc_staging_system_edition.diagnoses',
    'ajcc_pathologic_t.diagnoses',
    'ajcc_pathologic_n.diagnoses',
    'ajcc_pathologic_m.diagnoses',
]
Clinical = pd.read_csv("/home/Maryam/BRTCGA_Clinical.csv", usecols=clinical_columns)
Clinical = Clinical.set_index("sampleID")
# --- Load and filter the RNA-seq count matrix ---
# Read the expression matrix and index its rows by the "sampleID" column.
RNAseq = pd.read_csv("/home/Maryam/BRCAStar.csv").set_index("sampleID")
print(RNAseq.shape)
# A row is discarded once at least 30% of its columns are uninformative.
threshold = RNAseq.shape[1] * 0.3
# A cell is uninformative when it is exactly zero or missing. The two cases
# cannot overlap, so one combined per-row count equals the sum of both counts.
mask = (RNAseq.eq(0) | RNAseq.isna()).sum(axis=1) >= threshold
# Keep only the rows that stay below the cutoff.
RNAseq = RNAseq.loc[~mask]
print(RNAseq.shape)
def calculate_z_scores(RNAseq):
    """
    Standardise every row of a DataFrame to Z-scores.

    Each value has its row mean subtracted and is then divided by its row's
    standard deviation (pandas default, i.e. the sample standard deviation).
    A row with zero standard deviation yields NaN for all of its values.

    Parameters:
        RNAseq (pd.DataFrame): Numeric table; each row is standardised
            independently of the others.

    Returns:
        pd.DataFrame: New DataFrame of Z-scores with the same index and
        columns as the input. The input is not modified.
    """
    row_means = RNAseq.mean(axis=1)
    row_stds = RNAseq.std(axis=1)
    # axis=0 aligns the per-row statistics on the row index.
    centered = RNAseq.sub(row_means, axis=0)
    return centered.div(row_stds, axis=0)

# Standardise the expression values row by row, then transpose so that the
# expression table is indexed by the same sample IDs as the clinical table.
RNAseq = calculate_z_scores(RNAseq)
RNAseq = RNAseq.transpose()

# Keep a single clinical record per sample ID (the first one encountered).
Clinical_copy = Clinical.copy()
Clinical_copy = Clinical_copy[~Clinical_copy.index.duplicated(keep='first')]

# Samples present in both tables. sorted() is deliberate: iterating a set of
# strings gives a different order on every interpreter run, which would change
# the row order of the merged table and make the seeded train/test split
# further down select different samples each time.
common_sample_list = sorted(set(Clinical_copy.index.values).intersection(set(RNAseq.index.values)))

# Side-by-side merge: expression columns first, staging columns last.
combined_dff = pd.concat([RNAseq.loc[common_sample_list, :], Clinical_copy.loc[common_sample_list, :]], axis=1)
df_cleaned = combined_dff.copy()

# Class 0: node-negative (N0) and metastasis-free (M0) samples.
primary = df_cleaned.loc[(df_cleaned['ajcc_pathologic_n.diagnoses'] == 'N0') & (df_cleaned['ajcc_pathologic_m.diagnoses'] == 'M0')].copy()
primary['labels'] = 0

# Class 1: samples whose M category is one of the listed values.
# NOTE(review): confirm with
# df_cleaned['ajcc_pathologic_m.diagnoses'].value_counts() which of these
# labels actually occur, and how many positive samples that leaves.
metastasis_stages = ['M1', 'M1a', 'M1b', 'M1c', 'M2', 'M2a', 'M2b', 'M2c', 'M3']
metastasis = df_cleaned.loc[df_cleaned['ajcc_pathologic_m.diagnoses'].isin(metastasis_stages)].copy()
metastasis['labels'] = 1

# Stack both classes, replace missing values with 0 and drop the staging
# columns so that only expression features and the label remain.
df = pd.concat([primary, metastasis], axis=0)
df = df.fillna(0)
df = df.drop(columns=['ajcc_staging_system_edition.diagnoses', 'ajcc_pathologic_t.diagnoses', 'ajcc_pathologic_n.diagnoses', 'ajcc_pathologic_m.diagnoses'])
from imblearn.over_sampling import SMOTE
from collections import Counter
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score
# Shuffle the rows. The shuffle is seeded like every other random step below;
# left unseeded, it would hand train_test_split a different row order on each
# run, so the "fixed" split and all metrics would change from run to run.
df = df.sample(frac=1, random_state=42)

# Feature matrix (every column except the label) and target vector.
X = df.drop(columns=['labels'])
y = df['labels']

# Stratified 70/30 split so both classes are represented in each partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# Check class distribution before resampling
print(f"Original distribution: {Counter(y_train)}")

# Oversample the minority class on the training partition only, so the test
# partition contains real samples exclusively.
# NOTE(review): SMOTE interpolates between minority-class neighbours and
# needs enough minority samples in y_train to do so; check the
# "Original distribution" printout if fit_resample raises.
smote = SMOTE(sampling_strategy='minority', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Check class distribution after resampling
print(f"Resampled distribution: {Counter(y_resampled)}")

# Convert to XGBoost DMatrix format
dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)

# Train XGBoost with the binary logistic objective, which outputs probabilities.
param = {"objective": "binary:logistic"}
model = xgb.train(param, dtrain, num_boost_round=100)

# Predicted probability of the positive class (label 1) for each test sample.
probs = model.predict(dtest)

# Convert probabilities to binary predictions using a threshold of 0.5
threshold = 0.5
binary_preds = (probs > threshold).astype(int)

# Calculate evaluation metrics.
# zero_division=0 makes the degenerate case explicit: when no test sample is
# predicted positive, precision is reported as 0 without a warning.
# NOTE(review): precision and recall are both 0 whenever no true positive
# crosses the 0.5 cutoff; compare against the AUC and inspect `probs` before
# concluding that the model carries no signal.
accuracy = accuracy_score(y_test, binary_preds)
precision = precision_score(y_test, binary_preds, zero_division=0)
recall = recall_score(y_test, binary_preds, zero_division=0)
auc = roc_auc_score(y_test, probs)  # Use probabilities for AUC calculation

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"AUC: {auc:.4f}")



print(f"test distribution: {Counter(y_test)}")
classification RNAseq • 166 views
ADD COMMENT

Login before adding your answer.

Traffic: 2634 users visited in the last hour
Help About
FAQ
Access RSS
API
Stats

Use of this site constitutes acceptance of our User Agreement and Privacy Policy.

Powered by the version 2.3.6