(1)企鹅体重分析
# 1. 导入相关库
import numpy as np
import pandas as pd
# 2. Load the data
# Read the penguins dataset and show its first ten rows.
penguins_path = 'static/2_pandas/data/penguins.csv'
df = pd.read_csv(penguins_path)
print(df.head(n=10))
species island bill_length_mm bill_depth_mm flipper_length_mm \
0 Adelie Torgersen 39.1 18.7 181.0
1 Adelie Torgersen 39.5 17.4 186.0
2 Adelie Torgersen 40.3 18.0 195.0
3 Adelie Torgersen NaN NaN NaN
4 Adelie Torgersen 36.7 19.3 193.0
5 Adelie Torgersen 39.3 20.6 190.0
6 Adelie Torgersen 38.9 17.8 181.0
7 Adelie Torgersen 39.2 19.6 195.0
8 Adelie Torgersen 34.1 18.1 193.0
9 Adelie Torgersen 42.0 20.2 190.0
body_mass_g sex
0 3750.0 Male
1 3800.0 Female
2 3250.0 Female
3 NaN NaN
4 3450.0 Female
5 3650.0 Male
6 3625.0 Female
7 4675.0 Male
8 3475.0 NaN
9 4250.0 NaN
# 3. Data cleaning
# Drop (in place) every row that contains a missing value.
df.dropna(inplace=True)

# 4. Feature construction
# Store sex as a categorical and add the bill length-to-depth ratio.
df['sex'] = df['sex'].astype('category')
df['bill_ratio'] = df['bill_length_mm'].div(df['bill_depth_mm'])

# 5. Analysis
# Bin body mass into three equal-width intervals: low / medium / high.
labels = ['低', '中', '高']
df['mass_level'] = pd.cut(df['body_mass_g'], bins=3, labels=labels)
mass_level_counts = df['mass_level'].value_counts()
print(mass_level_counts)
print()

# Mean body mass and row count for each island / sex combination.
mass_by_island_sex = df.groupby(['island', 'sex'], observed=False).agg(
    {'body_mass_g': ['mean', 'count']}
)
print(mass_by_island_sex)
mass_level
低 150
中 128
高 55
Name: count, dtype: int64
body_mass_g
mean count
island sex
Biscoe Female 4319.375000 80
Male 5104.518072 83
Dream Female 3446.311475 61
Male 3987.096774 62
Torgersen Female 3395.833333 24
Male 4034.782609 23
(2)睡眠质量分析
import numpy as np
import pandas as pd
# Load the data
# Read the sleep dataset and show its first ten rows.
sleep_path = 'static/2_pandas/data/sleep.csv'
df = pd.read_csv(sleep_path)
print(df.head(n=10))
person_id gender age occupation sleep_duration sleep_quality \
0 1 Male 29 Manual Labor 7.4 7.0
1 2 Female 43 Retired 4.2 4.9
2 3 Male 44 Retired 6.1 6.0
3 4 Male 29 Office Worker 8.3 10.0
4 5 Male 67 Retired 9.1 9.5
5 6 Female 47 Student 6.1 6.9
6 7 Male 22 Office Worker 5.1 6.1
7 8 Male 49 Office Worker 10.7 6.2
8 9 Male 25 Manual Labor 11.9 7.2
9 10 Female 51 Retired 8.2 4.0
physical_activity_level stress_level bmi_category blood_pressure \
0 41 7 Obese 124/70
1 41 5 Obese 131/86
2 107 4 Underweight 122/70
3 20 10 Obese 124/72
4 19 4 Overweight 133/78
5 24 4 Normal 123/60
6 26 6 Obese 121/70
7 49 8 Obese 134/87
8 27 8 Underweight 112/63
9 64 5 Overweight 125/84
heart_rate daily_steps sleep_disorder
0 91 8539 NaN
1 81 18754 NaN
2 81 2857 NaN
3 55 6886 NaN
4 97 14945 Insomnia
5 87 9485 NaN
6 66 15680 NaN
7 59 18767 NaN
8 99 16397 Sleep Apnea
9 76 12744 NaN
# Data cleaning
# Replace missing sleep_disorder values with the placeholder "unknown".
df.fillna({"sleep_disorder": "unknown"}, inplace=True)

# Feature construction
# Store the text columns gender, occupation and bmi_category as categoricals.
df['gender'] = df['gender'].astype('category')
df['occupation'] = df['occupation'].astype('category')
df['bmi_category'] = df['bmi_category'].astype('category')
# Split the "high/low" blood-pressure text into two columns. str.split
# returns strings, so convert each part to a number to make the new columns
# usable in arithmetic and aggregations; parts that cannot be parsed
# become NaN instead of raising.
df[['high_pressure', 'low_pressure']] = (
    df['blood_pressure']
    .str.split('/', expand=True)
    .apply(pd.to_numeric, errors='coerce')
)

# Bin sleep quality into three equal-width intervals: poor / fair / excellent.
labels = ['差', '良', '优']
df['sleep_quality_level'] = pd.cut(df['sleep_quality'], bins=3, labels=labels)
print(df.head(3))
person_id gender age occupation sleep_duration sleep_quality \
0 1 Male 29 Manual Labor 7.4 7.0
1 2 Female 43 Retired 4.2 4.9
2 3 Male 44 Retired 6.1 6.0
physical_activity_level stress_level bmi_category blood_pressure \
0 41 7 Obese 124/70
1 41 5 Obese 131/86
2 107 4 Underweight 122/70
heart_rate daily_steps sleep_disorder high_pressure low_pressure \
0 91 8539 unknown 124 70
1 81 18754 unknown 131 86
2 81 2857 unknown 122 70
sleep_quality_level
0 良
1 良
2 良
# Data analysis
# Count how many rows fall into each BMI category.
bmi_counts = df['bmi_category'].value_counts()
print(bmi_counts)
bmi_category
Overweight 109
Underweight 102
Obese 98
Normal 91
Name: count, dtype: int64
# Group by BMI category and average sleep duration, sleep quality and
# stress level within each group.
mean_columns = ['sleep_duration', 'sleep_quality', 'stress_level']
bmi_group = df.groupby('bmi_category', observed=True).agg(
    dict.fromkeys(mean_columns, 'mean')
)
print(bmi_group)
sleep_duration sleep_quality stress_level
bmi_category
Normal 7.794505 6.342857 4.857143
Obese 8.072449 6.189796 5.765306
Overweight 8.274312 6.101835 5.642202
Underweight 7.982353 5.896078 5.558824
评论 (0)