In [1]:
# Import the modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Exercise 1
# Read the exercise CSV into a DataFrame
csv_path = 'exercise_data.csv'
exercise_df = pd.read_csv(csv_path)
# Preview the first 10 records
preview_rows = exercise_df.head(10)
print(preview_rows)
patient_id age tumor_size mean_radius mean_texture mean_perimeter \
0 P99077 65 35.3 23.130211 19.609996 165.152733
1 P52652 35 41.5 8.334864 30.700790 170.372043
2 P66845 74 60.5 8.991241 11.766042 63.304822
3 P79783 32 19.5 20.960321 16.053516 94.916746
4 P68268 59 29.7 18.633842 31.691853 151.299184
5 P86704 77 31.4 19.683879 32.560298 70.115101
6 P98005 56 35.7 16.245851 18.861434 112.867951
7 P70650 42 40.3 14.967002 38.753408 195.267693
8 P45723 27 44.4 23.113063 10.130898 199.095326
9 P83978 59 36.7 13.115738 20.492264 160.081341
mean_area mean_smoothness diagnosis column_X ... stage \
0 501.865012 0.089045 B Female ... Stage III
1 581.374072 0.141783 B Male ... Stage II
2 677.820354 0.143672 M Male ... Stage IV
3 284.967964 0.106517 M Male ... Stage I
4 1856.401891 0.112608 M Male ... Stage IV
5 347.574253 0.106593 M Male ... Stage III
6 NaN 0.076759 B Male ... Stage II
7 NaN 0.055313 B Female ... Stage II
8 2355.813796 0.117293 M Male ... Stage I
9 1314.065078 0.173797 M Female ... Stage I
treatment biomarker_status family_history smoking_history \
0 Palliative Care Amplified Yes Ex-smoker
1 Palliative Care Amplified No Heavy Smoker
2 Immunotherapy Negative No Ex-smoker
3 Surgery Negative Yes Heavy Smoker
4 Palliative Care Negative No Heavy Smoker
5 Palliative Care Positive No Heavy Smoker
6 Palliative Care Negative Yes Heavy Smoker
7 Bone Marrow Transplant Positive No Ex-smoker
8 Palliative Care Amplified Yes Ex-smoker
9 Chemotherapy Negative No Ex-smoker
alcohol_consumption diet exercise_hours_per_week blood_pressure \
0 High Low Fat 8 97/73
1 High Balanced 2 159/75
2 High High Protein 9 136/85
3 Low Balanced 8 155/84
4 Moderate High Fiber 11 150/70
5 Moderate High Fiber 12 174/85
6 Moderate Low Carb 3 143/86
7 Low Low Carb 3 113/100
8 Low High Fiber 5 137/100
9 High High Fiber 13 140/69
cholesterol_level
0 199
1 152
2 204
3 196
4 287
5 267
6 247
7 245
8 198
9 241
[10 rows x 22 columns]
In [3]:
# Count the missing entries in each column
missing_per_column = exercise_df.isnull().sum()
print(missing_per_column)
patient_id 0 age 0 tumor_size 0 mean_radius 0 mean_texture 0 mean_perimeter 0 mean_area 2 mean_smoothness 0 diagnosis 0 column_X 0 column_T 0 survival_months 0 stage 0 treatment 0 biomarker_status 0 family_history 0 smoking_history 0 alcohol_consumption 0 diet 0 exercise_hours_per_week 0 blood_pressure 0 cholesterol_level 0 dtype: int64
In [4]:
# Build the descriptive-statistics table (count, mean, std, min, quartiles, max)
summary_stats = exercise_df.describe()
print(summary_stats)
age tumor_size mean_radius mean_texture mean_perimeter \
count 20.000000 20.000000 20.000000 20.000000 20.000000
mean 49.300000 38.880000 17.833562 24.918910 136.041306
std 18.778767 11.276926 4.572292 8.927578 51.118297
min 20.000000 19.500000 8.334864 10.130898 42.827314
25% 33.500000 31.700000 14.883086 18.747916 101.653071
50% 48.000000 36.900000 18.383400 23.541302 155.690263
75% 63.500000 44.325000 21.453336 31.908964 171.718817
max 79.000000 60.500000 23.660230 39.531605 199.095326
mean_area mean_smoothness survival_months exercise_hours_per_week \
count 18.000000 20.000000 20.000000 20.000000
mean 1175.654756 0.119837 37.300000 8.250000
std 671.831832 0.039006 18.160396 4.586881
min 284.967964 0.055313 6.000000 0.000000
25% 521.742277 0.098287 23.250000 4.500000
50% 1299.602305 0.110609 40.500000 8.500000
75% 1645.595799 0.145301 48.250000 12.000000
max 2355.813796 0.194991 71.000000 14.000000
cholesterol_level
count 20.000000
mean 229.750000
std 46.190538
min 152.000000
25% 197.500000
50% 237.000000
75% 272.500000
max 295.000000
In [5]:
# Exercise 2
# Impute the missing mean_area values with the column median, then give the
# placeholder columns meaningful names. Each step returns a new frame that is
# bound back to exercise_df rather than mutating it with inplace=True, so the
# whole cleaning step reads as one pipeline.
mean_area_median = exercise_df['mean_area'].median()
exercise_df = (
    exercise_df
    .fillna({'mean_area': mean_area_median})
    .rename(columns={'column_X': 'gender', 'column_T': 'cancer_type'})
)
# Display the updated dataset
print(exercise_df.head())
patient_id age tumor_size mean_radius mean_texture mean_perimeter \
0 P99077 65 35.3 23.130211 19.609996 165.152733
1 P52652 35 41.5 8.334864 30.700790 170.372043
2 P66845 74 60.5 8.991241 11.766042 63.304822
3 P79783 32 19.5 20.960321 16.053516 94.916746
4 P68268 59 29.7 18.633842 31.691853 151.299184
mean_area mean_smoothness diagnosis gender ... stage \
0 501.865012 0.089045 B Female ... Stage III
1 581.374072 0.141783 B Male ... Stage II
2 677.820354 0.143672 M Male ... Stage IV
3 284.967964 0.106517 M Male ... Stage I
4 1856.401891 0.112608 M Male ... Stage IV
treatment biomarker_status family_history smoking_history \
0 Palliative Care Amplified Yes Ex-smoker
1 Palliative Care Amplified No Heavy Smoker
2 Immunotherapy Negative No Ex-smoker
3 Surgery Negative Yes Heavy Smoker
4 Palliative Care Negative No Heavy Smoker
alcohol_consumption diet exercise_hours_per_week blood_pressure \
0 High Low Fat 8 97/73
1 High Balanced 2 159/75
2 High High Protein 9 136/85
3 Low Balanced 8 155/84
4 Moderate High Fiber 11 150/70
cholesterol_level
0 199
1 152
2 204
3 196
4 287
[5 rows x 22 columns]
In [6]:
# Re-count the missing entries in each column
remaining_missing = exercise_df.isnull().sum()
print(remaining_missing)
patient_id 0 age 0 tumor_size 0 mean_radius 0 mean_texture 0 mean_perimeter 0 mean_area 0 mean_smoothness 0 diagnosis 0 gender 0 cancer_type 0 survival_months 0 stage 0 treatment 0 biomarker_status 0 family_history 0 smoking_history 0 alcohol_consumption 0 diet 0 exercise_hours_per_week 0 blood_pressure 0 cholesterol_level 0 dtype: int64
In [7]:
# Exercise 3
# Keep only the patients older than 50
over_50_mask = exercise_df['age'] > 50
filtered_df = exercise_df.loc[over_50_mask]
# Average survival_months within each smoking_history category
survival_by_smoking = filtered_df.groupby('smoking_history')['survival_months']
grouped_df = survival_by_smoking.mean()
print(grouped_df)
smoking_history Ex-smoker 57.333333 Heavy Smoker 23.000000 Non-smoker 43.000000 Name: survival_months, dtype: float64
In [8]:
# Scatter plot of mean_radius against mean_texture for the rows in filtered_df
fig, ax = plt.subplots()
ax.scatter(filtered_df['mean_radius'], filtered_df['mean_texture'])
ax.set_xlabel('Mean Radius')
ax.set_ylabel('Mean Texture')
ax.set_title('Mean Radius vs. Mean Texture')
plt.show()
In [9]:
# Exercise 4
# Build the list of database IDs: each patient_id with its leading 'P' removed.
# The prefix is checked before slicing so that an ID which does not start with
# 'P' is kept intact instead of silently losing its first character.
database_ids = [
    patient_id[1:] if patient_id.startswith('P') else patient_id
    for patient_id in exercise_df['patient_id']
]
print(database_ids)
['99077', '52652', '66845', '79783', '68268', '86704', '98005', '70650', '45723', '83978', '73203', '50737', '33813', '86017', '83744', '82843', '25344', '17222', '14526', '21965']
In [10]:
# Exercise 5
# Create the dictionary mapping each patient ID to its blood pressure parts
patient_blood_pressure = {}
# Split every 'systolic/diastolic' reading once and store both halves as strings
for patient_id, reading in zip(exercise_df['patient_id'], exercise_df['blood_pressure']):
    parts = reading.split('/')
    patient_blood_pressure[patient_id] = {'systolic': parts[0], 'diastolic': parts[1]}
# Report one line per patient
for patient_id, pressure in patient_blood_pressure.items():
    print(f"Patient ID: {patient_id} Systolic: {pressure['systolic']} Diastolic: {pressure['diastolic']}")
Patient ID: P99077 Systolic: 97 Diastolic: 73 Patient ID: P52652 Systolic: 159 Diastolic: 75 Patient ID: P66845 Systolic: 136 Diastolic: 85 Patient ID: P79783 Systolic: 155 Diastolic: 84 Patient ID: P68268 Systolic: 150 Diastolic: 70 Patient ID: P86704 Systolic: 174 Diastolic: 85 Patient ID: P98005 Systolic: 143 Diastolic: 86 Patient ID: P70650 Systolic: 113 Diastolic: 100 Patient ID: P45723 Systolic: 137 Diastolic: 100 Patient ID: P83978 Systolic: 140 Diastolic: 69 Patient ID: P73203 Systolic: 158 Diastolic: 87 Patient ID: P50737 Systolic: 106 Diastolic: 61 Patient ID: P33813 Systolic: 98 Diastolic: 100 Patient ID: P86017 Systolic: 162 Diastolic: 78 Patient ID: P83744 Systolic: 98 Diastolic: 83 Patient ID: P82843 Systolic: 95 Diastolic: 92 Patient ID: P25344 Systolic: 149 Diastolic: 91 Patient ID: P17222 Systolic: 116 Diastolic: 86 Patient ID: P14526 Systolic: 161 Diastolic: 99 Patient ID: P21965 Systolic: 170 Diastolic: 81