In [1]:
# Import the modules

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Exercise 1

# Read the exercise dataset from CSV into a DataFrame
exercise_df = pd.read_csv('exercise_data.csv')

# Preview the first 10 rows
first_rows = exercise_df.head(10)
print(first_rows)
  patient_id  age  tumor_size  mean_radius  mean_texture  mean_perimeter  \
0     P99077   65        35.3    23.130211     19.609996      165.152733   
1     P52652   35        41.5     8.334864     30.700790      170.372043   
2     P66845   74        60.5     8.991241     11.766042       63.304822   
3     P79783   32        19.5    20.960321     16.053516       94.916746   
4     P68268   59        29.7    18.633842     31.691853      151.299184   
5     P86704   77        31.4    19.683879     32.560298       70.115101   
6     P98005   56        35.7    16.245851     18.861434      112.867951   
7     P70650   42        40.3    14.967002     38.753408      195.267693   
8     P45723   27        44.4    23.113063     10.130898      199.095326   
9     P83978   59        36.7    13.115738     20.492264      160.081341   

     mean_area  mean_smoothness diagnosis column_X  ...      stage  \
0   501.865012         0.089045         B   Female  ...  Stage III   
1   581.374072         0.141783         B     Male  ...   Stage II   
2   677.820354         0.143672         M     Male  ...   Stage IV   
3   284.967964         0.106517         M     Male  ...    Stage I   
4  1856.401891         0.112608         M     Male  ...   Stage IV   
5   347.574253         0.106593         M     Male  ...  Stage III   
6          NaN         0.076759         B     Male  ...   Stage II   
7          NaN         0.055313         B   Female  ...   Stage II   
8  2355.813796         0.117293         M     Male  ...    Stage I   
9  1314.065078         0.173797         M   Female  ...    Stage I   

                treatment biomarker_status family_history smoking_history  \
0         Palliative Care        Amplified            Yes       Ex-smoker   
1         Palliative Care        Amplified             No    Heavy Smoker   
2           Immunotherapy         Negative             No       Ex-smoker   
3                 Surgery         Negative            Yes    Heavy Smoker   
4         Palliative Care         Negative             No    Heavy Smoker   
5         Palliative Care         Positive             No    Heavy Smoker   
6         Palliative Care         Negative            Yes    Heavy Smoker   
7  Bone Marrow Transplant         Positive             No       Ex-smoker   
8         Palliative Care        Amplified            Yes       Ex-smoker   
9            Chemotherapy         Negative             No       Ex-smoker   

  alcohol_consumption          diet exercise_hours_per_week blood_pressure  \
0                High       Low Fat                       8          97/73   
1                High      Balanced                       2         159/75   
2                High  High Protein                       9         136/85   
3                 Low      Balanced                       8         155/84   
4            Moderate    High Fiber                      11         150/70   
5            Moderate    High Fiber                      12         174/85   
6            Moderate      Low Carb                       3         143/86   
7                 Low      Low Carb                       3        113/100   
8                 Low    High Fiber                       5        137/100   
9                High    High Fiber                      13         140/69   

   cholesterol_level  
0                199  
1                152  
2                204  
3                196  
4                287  
5                267  
6                247  
7                245  
8                198  
9                241  

[10 rows x 22 columns]
In [3]:
# Count the missing values in each column
missing_per_column = exercise_df.isna().sum()
print(missing_per_column)
patient_id                 0
age                        0
tumor_size                 0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  2
mean_smoothness            0
diagnosis                  0
column_X                   0
column_T                   0
survival_months            0
stage                      0
treatment                  0
biomarker_status           0
family_history             0
smoking_history            0
alcohol_consumption        0
diet                       0
exercise_hours_per_week    0
blood_pressure             0
cholesterol_level          0
dtype: int64
In [4]:
# Descriptive statistics (count, mean, std, quartiles, min/max) per column
summary_stats = exercise_df.describe()
print(summary_stats)
             age  tumor_size  mean_radius  mean_texture  mean_perimeter  \
count  20.000000   20.000000    20.000000     20.000000       20.000000   
mean   49.300000   38.880000    17.833562     24.918910      136.041306   
std    18.778767   11.276926     4.572292      8.927578       51.118297   
min    20.000000   19.500000     8.334864     10.130898       42.827314   
25%    33.500000   31.700000    14.883086     18.747916      101.653071   
50%    48.000000   36.900000    18.383400     23.541302      155.690263   
75%    63.500000   44.325000    21.453336     31.908964      171.718817   
max    79.000000   60.500000    23.660230     39.531605      199.095326   

         mean_area  mean_smoothness  survival_months  exercise_hours_per_week  \
count    18.000000        20.000000        20.000000                20.000000   
mean   1175.654756         0.119837        37.300000                 8.250000   
std     671.831832         0.039006        18.160396                 4.586881   
min     284.967964         0.055313         6.000000                 0.000000   
25%     521.742277         0.098287        23.250000                 4.500000   
50%    1299.602305         0.110609        40.500000                 8.500000   
75%    1645.595799         0.145301        48.250000                12.000000   
max    2355.813796         0.194991        71.000000                14.000000   

       cholesterol_level  
count          20.000000  
mean          229.750000  
std            46.190538  
min           152.000000  
25%           197.500000  
50%           237.000000  
75%           272.500000  
max           295.000000  
In [5]:
# Exercise 2

# Impute missing 'mean_area' values with the column median, then give the
# placeholder columns descriptive names. Each step returns a new frame and the
# result is re-bound to `exercise_df`, so no `inplace=True` mutation is needed
# and the whole clean-up reads as one chain.
exercise_df = (
    exercise_df
    .fillna({'mean_area': exercise_df['mean_area'].median()})
    .rename(columns={'column_X': 'gender', 'column_T': 'cancer_type'})
)

# Display the updated dataset
print(exercise_df.head())
  patient_id  age  tumor_size  mean_radius  mean_texture  mean_perimeter  \
0     P99077   65        35.3    23.130211     19.609996      165.152733   
1     P52652   35        41.5     8.334864     30.700790      170.372043   
2     P66845   74        60.5     8.991241     11.766042       63.304822   
3     P79783   32        19.5    20.960321     16.053516       94.916746   
4     P68268   59        29.7    18.633842     31.691853      151.299184   

     mean_area  mean_smoothness diagnosis  gender  ...      stage  \
0   501.865012         0.089045         B  Female  ...  Stage III   
1   581.374072         0.141783         B    Male  ...   Stage II   
2   677.820354         0.143672         M    Male  ...   Stage IV   
3   284.967964         0.106517         M    Male  ...    Stage I   
4  1856.401891         0.112608         M    Male  ...   Stage IV   

         treatment biomarker_status family_history smoking_history  \
0  Palliative Care        Amplified            Yes       Ex-smoker   
1  Palliative Care        Amplified             No    Heavy Smoker   
2    Immunotherapy         Negative             No       Ex-smoker   
3          Surgery         Negative            Yes    Heavy Smoker   
4  Palliative Care         Negative             No    Heavy Smoker   

  alcohol_consumption          diet exercise_hours_per_week blood_pressure  \
0                High       Low Fat                       8          97/73   
1                High      Balanced                       2         159/75   
2                High  High Protein                       9         136/85   
3                 Low      Balanced                       8         155/84   
4            Moderate    High Fiber                      11         150/70   

   cholesterol_level  
0                199  
1                152  
2                204  
3                196  
4                287  

[5 rows x 22 columns]
In [6]:
# Re-count the missing values in each column
remaining_missing = exercise_df.isna().sum()
print(remaining_missing)
patient_id                 0
age                        0
tumor_size                 0
mean_radius                0
mean_texture               0
mean_perimeter             0
mean_area                  0
mean_smoothness            0
diagnosis                  0
gender                     0
cancer_type                0
survival_months            0
stage                      0
treatment                  0
biomarker_status           0
family_history             0
smoking_history            0
alcohol_consumption        0
diet                       0
exercise_hours_per_week    0
blood_pressure             0
cholesterol_level          0
dtype: int64
In [7]:
# Exercise 3

# Keep only the rows where age is greater than 50
filtered_df = exercise_df.loc[exercise_df['age'].gt(50)]

# Mean survival_months within each smoking_history category
grouped_df = (
    filtered_df
    .groupby('smoking_history')['survival_months']
    .mean()
)
print(grouped_df)
smoking_history
Ex-smoker       57.333333
Heavy Smoker    23.000000
Non-smoker      43.000000
Name: survival_months, dtype: float64
In [8]:
# Scatter plot of mean_radius against mean_texture for the rows in filtered_df
fig, ax = plt.subplots()
ax.scatter(filtered_df['mean_radius'], filtered_df['mean_texture'])
ax.set_xlabel('Mean Radius')
ax.set_ylabel('Mean Texture')
ax.set_title('Mean Radius vs. Mean Texture')
plt.show()
No description has been provided for this image
In [9]:
# Exercise 4

# Build the list of database IDs by dropping the first character
# (the 'P' prefix) from every patient ID
database_ids = [pid[1:] for pid in exercise_df['patient_id']]

print(database_ids)
['99077', '52652', '66845', '79783', '68268', '86704', '98005', '70650', '45723', '83978', '73203', '50737', '33813', '86017', '83744', '82843', '25344', '17222', '14526', '21965']
In [10]:
# Exercise 5

# Create the dictionary mapping patient ID -> blood pressure components
patient_blood_pressure = {}

# Split each 'systolic/diastolic' reading once and store both parts
# (the parts are kept as strings, exactly as they come out of the split)
for pid, reading in zip(exercise_df['patient_id'], exercise_df['blood_pressure']):
    parts = reading.split('/')
    patient_blood_pressure[pid] = {'systolic': parts[0], 'diastolic': parts[1]}

# Print one summary line per patient
for pid, bp in patient_blood_pressure.items():
    print(f"Patient ID: {pid} Systolic: {bp['systolic']} Diastolic: {bp['diastolic']}")
Patient ID: P99077 Systolic: 97 Diastolic: 73
Patient ID: P52652 Systolic: 159 Diastolic: 75
Patient ID: P66845 Systolic: 136 Diastolic: 85
Patient ID: P79783 Systolic: 155 Diastolic: 84
Patient ID: P68268 Systolic: 150 Diastolic: 70
Patient ID: P86704 Systolic: 174 Diastolic: 85
Patient ID: P98005 Systolic: 143 Diastolic: 86
Patient ID: P70650 Systolic: 113 Diastolic: 100
Patient ID: P45723 Systolic: 137 Diastolic: 100
Patient ID: P83978 Systolic: 140 Diastolic: 69
Patient ID: P73203 Systolic: 158 Diastolic: 87
Patient ID: P50737 Systolic: 106 Diastolic: 61
Patient ID: P33813 Systolic: 98 Diastolic: 100
Patient ID: P86017 Systolic: 162 Diastolic: 78
Patient ID: P83744 Systolic: 98 Diastolic: 83
Patient ID: P82843 Systolic: 95 Diastolic: 92
Patient ID: P25344 Systolic: 149 Diastolic: 91
Patient ID: P17222 Systolic: 116 Diastolic: 86
Patient ID: P14526 Systolic: 161 Diastolic: 99
Patient ID: P21965 Systolic: 170 Diastolic: 81