六、pandas - 数据分析

演示数据：
data.zip

1. 数据的导入导出

import pandas as pd

# 导入数据
# csv
df = pd.read_csv('static/2_pandas/data/employees.csv')
print(df.salary.sum())

691400.0

# 导出
df = df.head()
df.to_csv('static/2_pandas/data/employees1.csv')

# json
df_json = pd.read_json('static/2_pandas/data/data1.json')
print(df_json)

   id name  age
0   1   张三   25
1   2   李四   30
2   3   王五   28

import json

# 复杂json
with open('static/2_pandas/data/test.json', encoding='utf-8') as f:
    data = json.load(f)
print(type(data))
print(data['users'])
print()

df_json2 = pd.DataFrame(data['users'])
print(df_json2)

<class 'dict'>
[{'id': 1, 'name': '张三', 'age': 28, 'email': 'zhangsan@example.com', 'is_active': True, 'join_date': '2022-03-15'}, {'id': 2, 'name': '李四', 'age': 35, 'email': 'lisi@example.com', 'is_active': False, 'join_date': '2021-11-02'}, {'id': 3, 'name': '王五', 'age': 24, 'email': 'wangwu@example.com', 'is_active': True, 'join_date': '2023-01-20'}]

   id name  age                 email  is_active   join_date
0   1   张三   28  zhangsan@example.com       True  2022-03-15
1   2   李四   35      lisi@example.com      False  2021-11-02
2   3   王五   24    wangwu@example.com       True  2023-01-20

2. 缺失值的处理

import numpy as np
import pandas as pd

# 缺失值的处理
df = pd.DataFrame([[1, 2, pd.NA], ['A', None, 'D'], [7, 8, 9]], columns=['A', 'B', 'C'])
print(df)
print()

# 检查元素是否是缺失值
print(df.isna())
print(df.isnull())
print()

# 计算缺失值个数
# print(df.isna().sum()) # 列
print(df.isna().sum(axis=1))  # 行

   A    B     C
0  1  2.0  <NA>
1  A  NaN     D
2  7  8.0     9

       A      B      C
0  False  False   True
1  False   True  False
2  False  False  False
       A      B      C
0  False  False   True
1  False   True  False
2  False  False  False

0    1
1    1
2    0
dtype: int64

# 剔除缺失值
# 剔除包含缺失值的行记录
print(df.dropna())
print()

# 当前行的元素都为缺失值时才剔除
print(df.dropna(how='all'))
print()

# 如果有n个元素不是缺失值，则保留
print(df.dropna(thresh=2))
print()

   A    B  C
2  7  8.0  9

   A    B     C
0  1  2.0  <NA>
1  A  NaN     D
2  7  8.0     9

   A    B     C
0  1  2.0  <NA>
1  A  NaN     D
2  7  8.0     9

# 按列剔除
print(df.dropna(axis=1))
print()

# 如果某列有缺失值，则删除这一行
print(df.dropna(subset=['B']))
print()

   A
0  1
1  A
2  7

   A    B     C
0  1  2.0  <NA>
2  7  8.0     9

# 填充缺失值
df = pd.read_csv('static/2_pandas/data/weather_withna.csv')
print(df.isna().sum())

date               0
precipitation    303
temp_max         303
temp_min         303
wind             303
weather          303
dtype: int64

# 使用字典填充
print(df.fillna({"temp_max": 100, "wind": 20}).tail())

            date  precipitation  temp_max  temp_min  wind weather
1456  2015-12-27            NaN     100.0       NaN  20.0     NaN
1457  2015-12-28            NaN     100.0       NaN  20.0     NaN
1458  2015-12-29            NaN     100.0       NaN  20.0     NaN
1459  2015-12-30            NaN     100.0       NaN  20.0     NaN
1460  2015-12-31           20.6      12.2       5.0   3.8    rain

# 使用统计值填充
print(df.fillna(df[['temp_max', 'wind']].mean()).tail())

            date  precipitation   temp_max  temp_min      wind weather
1456  2015-12-27            NaN  15.851468       NaN  3.242055     NaN
1457  2015-12-28            NaN  15.851468       NaN  3.242055     NaN
1458  2015-12-29            NaN  15.851468       NaN  3.242055     NaN
1459  2015-12-30            NaN  15.851468       NaN  3.242055     NaN
1460  2015-12-31           20.6  12.200000       5.0  3.800000    rain

# 根据附近的值填充 front
print(df.ffill().tail())
print()

# 根据附近的值填充 behind
print(df.bfill().tail())

            date  precipitation  temp_max  temp_min  wind weather
1456  2015-12-27            0.0      11.1       4.4   4.8     sun
1457  2015-12-28            0.0      11.1       4.4   4.8     sun
1458  2015-12-29            0.0      11.1       4.4   4.8     sun
1459  2015-12-30            0.0      11.1       4.4   4.8     sun
1460  2015-12-31           20.6      12.2       5.0   3.8    rain

            date  precipitation  temp_max  temp_min  wind weather
1456  2015-12-27           20.6      12.2       5.0   3.8    rain
1457  2015-12-28           20.6      12.2       5.0   3.8    rain
1458  2015-12-29           20.6      12.2       5.0   3.8    rain
1459  2015-12-30           20.6      12.2       5.0   3.8    rain
1460  2015-12-31           20.6      12.2       5.0   3.8    rain

3. 重复数据处理

import pandas as pd

data = {
    "name": ["孙笑川", "药水哥", "孙笑川", "刘波", "冬泳怪鸽", "刘波"],
    "age": [33, 30, 33, 30, 40, 30],
    "address": ["成都", "武汉", "成都", "武汉", "北京", "武汉"]
}
df = pd.DataFrame(data)
print(df)

   name  age address
0   孙笑川   33      成都
1   药水哥   30      武汉
2   孙笑川   33      成都
3    刘波   30      武汉
4  冬泳怪鸽   40      北京
5    刘波   30      武汉

# 一整条记录都是重复的才标记
print(df.duplicated())
print()

# 去重
# print(df.drop_duplicates())

# 根据指定列去重
print(df.drop_duplicates(subset=['address']))
print()

# 根据指定列去重（保持最新的记录）
print(df.drop_duplicates(subset=['address'], keep='last'))

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

   name  age address
0   孙笑川   33      成都
1   药水哥   30      武汉
4  冬泳怪鸽   40      北京

   name  age address
2   孙笑川   33      成都
4  冬泳怪鸽   40      北京
5    刘波   30      武汉

4. 数据类型的转换

import pandas as pd

df = pd.read_csv('static/2_pandas/data/sleep.csv')
print(df)

     person_id  gender  age     occupation  sleep_duration  sleep_quality  \
0            1    Male   29   Manual Labor             7.4            7.0   
1            2  Female   43        Retired             4.2            4.9   
2            3    Male   44        Retired             6.1            6.0   
3            4    Male   29  Office Worker             8.3           10.0   
4            5    Male   67        Retired             9.1            9.5   
..         ...     ...  ...            ...             ...            ...   
395        396  Female   36        Student             4.5            7.9   
396        397  Female   45   Manual Labor             6.0            6.1   
397        398  Female   30        Student             5.3            6.5   
398        399  Female   41        Retired            11.0            9.1   
399        400    Male   37        Retired             5.8            7.0   

     physical_activity_level  stress_level bmi_category blood_pressure  \
0                         41             7        Obese         124/70   
1                         41             5        Obese         131/86   
2                        107             4  Underweight         122/70   
3                         20            10        Obese         124/72   
4                         19             4   Overweight         133/78   
..                       ...           ...          ...            ...   
395                       73             7       Normal         118/66   
396                       72             8        Obese         132/80   
397                       58            10        Obese         125/76   
398                       73             9        Obese         130/75   
399                       41             6       Normal         118/70   

     heart_rate  daily_steps sleep_disorder  
0            91         8539            NaN  
1            81        18754            NaN  
2            81         2857            NaN  
3            55         6886            NaN  
4            97        14945       Insomnia  
..          ...          ...            ...  
395          64        14497    Sleep Apnea  
396          65        12848       Insomnia  
397          66        15255       Insomnia  
398          75         6567    Sleep Apnea  
399          51        18079            NaN  

[400 rows x 13 columns]

print(df.dtypes)

person_id                    int64
gender                      object
age                          int64
occupation                  object
sleep_duration             float64
sleep_quality              float64
physical_activity_level      int64
stress_level                 int64
bmi_category                object
blood_pressure              object
heart_rate                   int64
daily_steps                  int64
sleep_disorder              object
dtype: object

df.age = df.age.astype('int16')
print(df.dtypes)

person_id                    int64
gender                      object
age                          int16
occupation                  object
sleep_duration             float64
sleep_quality              float64
physical_activity_level      int64
stress_level                 int64
bmi_category                object
blood_pressure              object
heart_rate                   int64
daily_steps                  int64
sleep_disorder              object
dtype: object

df.gender = df.gender.astype('category')
print(df.dtypes)
print()

print(df.gender)

person_id                     int64
gender                     category
age                           int16
occupation                   object
sleep_duration              float64
sleep_quality               float64
physical_activity_level       int64
stress_level                  int64
bmi_category                 object
blood_pressure               object
heart_rate                    int64
daily_steps                   int64
sleep_disorder               object
dtype: object

0        Male
1      Female
2        Male
3        Male
4        Male
        ...  
395    Female
396    Female
397    Female
398    Female
399      Male
Name: gender, Length: 400, dtype: category
Categories (2, object): ['Female', 'Male']

5. 数据变形

import pandas as pd

data = {
    "id": [1001, 1002],
    "name": ["孙笑川", "刘波"],
    "math": [99, 89],
    "english": [60, 95],
}

df = pd.DataFrame(data)
print(df)

     id name  math  english
0  1001  孙笑川    99       60
1  1002   刘波    89       95

# 行列转置
print(df.T)

            0     1
id       1001  1002
name      孙笑川    刘波
math       99    89
english    60    95

# 宽表转长表
"""
1001 孙笑川 math 99
1001 孙笑川 english 99
"""

print(df)
print()

df1 = pd.melt(df, id_vars=['id', 'name'], var_name='科目', value_name='分数')
print(df1)

     id name  math  english
0  1001  孙笑川    99       60
1  1002   刘波    89       95

     id name       科目  分数
0  1001  孙笑川     math  99
1  1002   刘波     math  89
2  1001  孙笑川  english  60
3  1002   刘波  english  95

# 长表转宽表
df2 = pd.pivot(df1, index=['id', 'name'], columns='科目', values='分数')
print(df2)

科目         english  math
id   name               
1001 孙笑川        60    99
1002 刘波         95    89

# 分列
data = {
    "id": [1001, 1002],
    "name": ["孙笑川 带带大师兄", "刘波 药水哥"],
    "math": [99, 89],
    "english": [60, 95],
}

df = pd.DataFrame(data)

df[['first name', 'last name']] = df.name.str.split(" ", expand=True)
print(df)

     id       name  math  english first name last name
0  1001  孙笑川 带带大师兄    99       60        孙笑川     带带大师兄
1  1002     刘波 药水哥    89       95         刘波       药水哥

df = pd.read_csv('static/2_pandas/data/sleep.csv')
df = df[['person_id', 'blood_pressure']]
print(df)
print()

df[['high_pressure', 'low_pressure']] = df['blood_pressure'].str.split("/", expand=True)
df.high_pressure = df.high_pressure.astype('int16')
df.low_pressure = df.low_pressure.astype('int16')
print(df)
print()

df.info()

     person_id blood_pressure
0            1         124/70
1            2         131/86
2            3         122/70
3            4         124/72
4            5         133/78
..         ...            ...
395        396         118/66
396        397         132/80
397        398         125/76
398        399         130/75
399        400         118/70

[400 rows x 2 columns]

     person_id blood_pressure  high_pressure  low_pressure
0            1         124/70            124            70
1            2         131/86            131            86
2            3         122/70            122            70
3            4         124/72            124            72
4            5         133/78            133            78
..         ...            ...            ...           ...
395        396         118/66            118            66
396        397         132/80            132            80
397        398         125/76            125            76
398        399         130/75            130            75
399        400         118/70            118            70

[400 rows x 4 columns]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   person_id       400 non-null    int64 
 1   blood_pressure  400 non-null    object
 2   high_pressure   400 non-null    int16 
 3   low_pressure    400 non-null    int16 
dtypes: int16(2), int64(1), object(1)
memory usage: 7.9+ KB

6. 数据分箱

import pandas as pd

df = pd.read_csv('static/2_pandas/data/employees.csv')
print(df.head(10))

   employee_id first_name  last_name     email  phone_number      job_id  \
0          100     Steven       King     SKING  515.123.4567     AD_PRES   
1          101      N_ann    Kochhar  NKOCHHAR  515.123.4568       AD_VP   
2          102        Lex    De Haan   LDEHAAN  515.123.4569       AD_VP   
3          103  Alexander     Hunold   AHUNOLD  590.423.4567     IT_PROG   
4          104      Bruce      Ernst    BERNST  590.423.4568     IT_PROG   
5          105      David     Austin   DAUSTIN  590.423.4569     IT_PROG   
6          106      Valli  Pataballa  VPATABAL  590.423.4560     IT_PROG   
7          107      Diana    Lorentz  DLORENTZ  590.423.5567     IT_PROG   
8          108      Nancy  Greenberg  NGREENBE  515.124.4569      FI_MGR   
9          109     Daniel     Faviet   DFAVIET  515.124.4169  FI_ACCOUNT   

    salary  commission_pct  manager_id  department_id  
0  24000.0             NaN         NaN           90.0  
1  17000.0             NaN       100.0           90.0  
2  17000.0             NaN       100.0           90.0  
3   9000.0             NaN       102.0           60.0  
4   6000.0             NaN       103.0           60.0  
5   4800.0             NaN       103.0           60.0  
6   4800.0             NaN       103.0           60.0  
7   4200.0             NaN       103.0           60.0  
8  12000.0             NaN       101.0          100.0  
9   9000.0             NaN       108.0          100.0

df1 = df.head(10)[['employee_id', 'salary']]
print(df1)
print()

# pd.cut()
# bins=n 分成n段区间，起始值，结束值是所有数据中的最小值、最大值
print(pd.cut(df1['salary'], bins=2))
print()

# bins=[] 自定义区间的值
df1_cut = pd.cut(df1['salary'], bins=[0, 5000, 10000, 20000])
print(df1_cut)
print()
df1_cut['收入范围'] = pd.cut(df1['salary'], bins=[0, 5000, 10000, 20000], labels=['低', '中', '高'])
print(df1_cut)
print()

# 等分为指定的段数
df1_qcut = pd.qcut(df1['salary'], 3)
print(df1_qcut)

   employee_id   salary
0          100  24000.0
1          101  17000.0
2          102  17000.0
3          103   9000.0
4          104   6000.0
5          105   4800.0
6          106   4800.0
7          107   4200.0
8          108  12000.0
9          109   9000.0

0    (14100.0, 24000.0]
1    (14100.0, 24000.0]
2    (14100.0, 24000.0]
3     (4180.2, 14100.0]
4     (4180.2, 14100.0]
5     (4180.2, 14100.0]
6     (4180.2, 14100.0]
7     (4180.2, 14100.0]
8     (4180.2, 14100.0]
9     (4180.2, 14100.0]
Name: salary, dtype: category
Categories (2, interval[float64, right]): [(4180.2, 14100.0] < (14100.0, 24000.0]]

0                   NaN
1    (10000.0, 20000.0]
2    (10000.0, 20000.0]
3     (5000.0, 10000.0]
4     (5000.0, 10000.0]
5         (0.0, 5000.0]
6         (0.0, 5000.0]
7         (0.0, 5000.0]
8    (10000.0, 20000.0]
9     (5000.0, 10000.0]
Name: salary, dtype: category
Categories (3, interval[int64, right]): [(0, 5000] < (5000, 10000] < (10000, 20000]]

0                                                     NaN
1                                          (10000, 20000]
2                                          (10000, 20000]
3                                           (5000, 10000]
4                                           (5000, 10000]
5                                               (0, 5000]
6                                               (0, 5000]
7                                               (0, 5000]
8                                          (10000, 20000]
9                                           (5000, 10000]
收入范围    0    NaN
1      高
2      高
3      中
4      中
5...
Name: salary, dtype: object

0    (12000.0, 24000.0]
1    (12000.0, 24000.0]
2    (12000.0, 24000.0]
3     (6000.0, 12000.0]
4     (6000.0, 12000.0]
5    (4199.999, 6000.0]
6    (4199.999, 6000.0]
7    (4199.999, 6000.0]
8    (12000.0, 24000.0]
9     (6000.0, 12000.0]
Name: salary, dtype: category
Categories (3, interval[float64, right]): [(4199.999, 6000.0] < (6000.0, 12000.0] < (12000.0, 24000.0]]

df = pd.read_csv('static/2_pandas/data/sleep.csv')
df1 = df.head(10)[['person_id', 'sleep_quality']]

# 源数据 ---> 分箱 ---> 统计
df1['睡眠质量'] = pd.cut(df1['sleep_quality'], bins=3, labels=['差', '良好', '优秀'])
print(df1['睡眠质量'].value_counts())

睡眠质量
良好    5
差     3
优秀    2
Name: count, dtype: int64

# df.rename()
df = pd.DataFrame({
    "name": ["孙笑川", "药水哥", "刘波", "冬泳怪鸽"],
    "age": [33, 30, 40, 30],
    "address": ["成都", "武汉", "武汉", "北京"]
})
print(df)
print()

print(df.rename(index={0: 5}, columns={"address": "地址"}))

   name  age address
0   孙笑川   33      成都
1   药水哥   30      武汉
2    刘波   40      武汉
3  冬泳怪鸽   30      北京

   name  age  地址
5   孙笑川   33  成都
1   药水哥   30  武汉
2    刘波   40  武汉
3  冬泳怪鸽   30  北京

# df.set_index()
# inplace=True 在原数据上修改
df.set_index('name', inplace=True)
print(df)

      age address
name             
孙笑川    33      成都
药水哥    30      武汉
刘波     40      武汉
冬泳怪鸽   30      北京

# df.reset_index()
df.reset_index(inplace=True)
print(df)

   name  age address
0   孙笑川   33      成都
1   药水哥   30      武汉
2    刘波   40      武汉
3  冬泳怪鸽   30      北京

7. 时间数据的处理

import pandas as pd

d = pd.Timestamp('2025-12-03 14:58')
print(d)
print(type(d))

2025-12-03 14:58:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>

# 属性
print("年：", d.year)
print("月：", d.month)
print("日：", d.day)
print("时间：", d.hour, d.minute, d.second)
print("是否为当月的最后一天：", d.is_month_end)

年： 2025
月： 12
日： 3
时间： 14 58 0
是否为当月的最后一天： False

# 方法
print("周几：", d.day_name())
print("转换为日期（天）：", d.to_period('D'))

周几： Wednesday
转换为日期（天）： 2025-12-03

# 字符串转换为日期类型
d = pd.to_datetime('2025-12-03 14:58')
print(d)
print(type(d))

2025-12-03 14:58:00
<class 'pandas._libs.tslibs.timestamps.Timestamp'>

df = pd.DataFrame({
    "id": [1001, 1002],
    "name": ["孙笑川", "药水哥"],
    "date_str": ["20251201", "20251202"]
})
print(df)
print()

df['create_time'] = pd.to_datetime(df['date_str'])
print(df)
print()

df['week'] = df['create_time'].dt.day_name()
print(df)

     id name  date_str
0  1001  孙笑川  20251201
1  1002  药水哥  20251202

     id name  date_str create_time
0  1001  孙笑川  20251201  2025-12-01
1  1002  药水哥  20251202  2025-12-02

     id name  date_str create_time     week
0  1001  孙笑川  20251201  2025-12-01   Monday
1  1002  药水哥  20251202  2025-12-02  Tuesday

# csv中的日期转换
df = pd.read_csv('static/2_pandas/data/weather.csv')
df['datetime'] = pd.to_datetime(df['date'])
print(df.head(10))
print()

# 读取csv时同步进行转换
df = pd.read_csv('static/2_pandas/data/weather.csv', parse_dates=['date'])
print(df['date'].dtypes)

         date  precipitation  temp_max  temp_min  wind  weather   datetime
0  2012-01-01            0.0      12.8       5.0   4.7  drizzle 2012-01-01
1  2012-01-02           10.9      10.6       2.8   4.5     rain 2012-01-02
2  2012-01-03            0.8      11.7       7.2   2.3     rain 2012-01-03
3  2012-01-04           20.3      12.2       5.6   4.7     rain 2012-01-04
4  2012-01-05            1.3       8.9       2.8   6.1     rain 2012-01-05
5  2012-01-06            2.5       4.4       2.2   2.2     rain 2012-01-06
6  2012-01-07            0.0       7.2       2.8   2.3     rain 2012-01-07
7  2012-01-08            0.0      10.0       2.8   2.0      sun 2012-01-08
8  2012-01-09            4.3       9.4       5.0   3.4     rain 2012-01-09
9  2012-01-10            1.0       6.1       0.6   3.4     rain 2012-01-10

datetime64[ns]

# 日期数据作为索引
df1 = df.set_index('date')
print(df1)
print()

            precipitation  temp_max  temp_min  wind  weather
date                                                        
2012-01-01            0.0      12.8       5.0   4.7  drizzle
2012-01-02           10.9      10.6       2.8   4.5     rain
2012-01-03            0.8      11.7       7.2   2.3     rain
2012-01-04           20.3      12.2       5.6   4.7     rain
2012-01-05            1.3       8.9       2.8   6.1     rain
...                   ...       ...       ...   ...      ...
2015-12-27            8.6       4.4       1.7   2.9     rain
2015-12-28            1.5       5.0       1.7   1.3     rain
2015-12-29            0.0       7.2       0.6   2.6      fog
2015-12-30            0.0       5.6      -1.0   3.4      sun
2015-12-31            0.0       5.6      -2.1   3.5      sun

[1461 rows x 5 columns]

# 时间间隔
d1 = pd.Timestamp('2020-01-10')
d2 = pd.Timestamp('2020-03-05')
print(d2-d1)

55 days 00:00:00

# 按时间维度重采样
df = pd.read_csv('static/2_pandas/data/weather.csv', parse_dates=['date'])

df.set_index('date', inplace=True)
print(df[['temp_max', 'temp_min']].resample("YE").mean())

             temp_max  temp_min
date                           
2012-12-31  15.276776  7.289617
2013-12-31  16.058904  8.153973
2014-12-31  16.995890  8.662466
2015-12-31  17.427945  8.835616

8. 分组聚合

# df.groupby('分组的字段')['聚合的字段'].聚合函数

import pandas as pd

df = pd.read_csv('static/2_pandas/data/employees.csv')
print(df.head(10))

   employee_id first_name  last_name     email  phone_number      job_id  \
0          100     Steven       King     SKING  515.123.4567     AD_PRES   
1          101      N_ann    Kochhar  NKOCHHAR  515.123.4568       AD_VP   
2          102        Lex    De Haan   LDEHAAN  515.123.4569       AD_VP   
3          103  Alexander     Hunold   AHUNOLD  590.423.4567     IT_PROG   
4          104      Bruce      Ernst    BERNST  590.423.4568     IT_PROG   
5          105      David     Austin   DAUSTIN  590.423.4569     IT_PROG   
6          106      Valli  Pataballa  VPATABAL  590.423.4560     IT_PROG   
7          107      Diana    Lorentz  DLORENTZ  590.423.5567     IT_PROG   
8          108      Nancy  Greenberg  NGREENBE  515.124.4569      FI_MGR   
9          109     Daniel     Faviet   DFAVIET  515.124.4169  FI_ACCOUNT   

    salary  commission_pct  manager_id  department_id  
0  24000.0             NaN         NaN           90.0  
1  17000.0             NaN       100.0           90.0  
2  17000.0             NaN       100.0           90.0  
3   9000.0             NaN       102.0           60.0  
4   6000.0             NaN       103.0           60.0  
5   4800.0             NaN       103.0           60.0  
6   4800.0             NaN       103.0           60.0  
7   4200.0             NaN       103.0           60.0  
8  12000.0             NaN       101.0          100.0  
9   9000.0             NaN       108.0          100.0

# 缺失值处理
df = df.dropna(subset=['department_id'])

df['department_id'] = df['department_id'].astype('int64')
print(df.head(10))

   employee_id first_name  last_name     email  phone_number      job_id  \
0          100     Steven       King     SKING  515.123.4567     AD_PRES   
1          101      N_ann    Kochhar  NKOCHHAR  515.123.4568       AD_VP   
2          102        Lex    De Haan   LDEHAAN  515.123.4569       AD_VP   
3          103  Alexander     Hunold   AHUNOLD  590.423.4567     IT_PROG   
4          104      Bruce      Ernst    BERNST  590.423.4568     IT_PROG   
5          105      David     Austin   DAUSTIN  590.423.4569     IT_PROG   
6          106      Valli  Pataballa  VPATABAL  590.423.4560     IT_PROG   
7          107      Diana    Lorentz  DLORENTZ  590.423.5567     IT_PROG   
8          108      Nancy  Greenberg  NGREENBE  515.124.4569      FI_MGR   
9          109     Daniel     Faviet   DFAVIET  515.124.4169  FI_ACCOUNT   

    salary  commission_pct  manager_id  department_id  
0  24000.0             NaN         NaN             90  
1  17000.0             NaN       100.0             90  
2  17000.0             NaN       100.0             90  
3   9000.0             NaN       102.0             60  
4   6000.0             NaN       103.0             60  
5   4800.0             NaN       103.0             60  
6   4800.0             NaN       103.0             60  
7   4200.0             NaN       103.0             60  
8  12000.0             NaN       101.0            100  
9   9000.0             NaN       108.0            100

# 计算不同部门的平均薪资
# .groups 查看分组
print(df.groupby('department_id').groups)
print()

# get_group() 查看具体的分组数据
print(df.groupby('department_id').get_group(20))

{10: [100], 20: [101, 102], 30: [14, 15, 16, 17, 18, 19], 40: [103], 50: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 60: [3, 4, 5, 6, 7], 70: [104], 80: [45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 79], 90: [0, 1, 2], 100: [8, 9, 10, 11, 12, 13], 110: [105, 106]}

     employee_id first_name  last_name     email  phone_number  job_id  \
101          201    Michael  Hartstein  MHARTSTE  515.123.5555  MK_MAN   
102          202        Pat        Fay      PFAY  603.123.6666  MK_REP   

      salary  commission_pct  manager_id  department_id  
101  13000.0             NaN       100.0             20  
102   6000.0             NaN       201.0             20

df1 = df.groupby('department_id')[['salary']].mean()
df1['salary'] = df1['salary'].round(2)
df1.reset_index(inplace=True)
print(df1.sort_values('salary', ascending=False))

    department_id    salary
8              90  19333.33
10            110  10150.00
6              70  10000.00
1              20   9500.00
7              80   8955.88
9             100   8600.00
3              40   6500.00
5              60   5760.00
0              10   4400.00
2              30   4150.00
4              50   3475.56

# 按部门、岗位分组聚合
df2 = df.groupby(['department_id', 'job_id'])[['salary']].mean()
df2['salary'] = df2['salary'].round(2)
df2.reset_index(inplace=True)
print(df2.sort_values('salary', ascending=False))

    department_id      job_id    salary
13             90     AD_PRES  24000.00
14             90       AD_VP  17000.00
1              20      MK_MAN  13000.00
11             80      SA_MAN  12200.00
16            100      FI_MGR  12000.00
18            110      AC_MGR  12000.00
4              30      PU_MAN  11000.00
10             70      PR_REP  10000.00
12             80      SA_REP   8396.55
17            110  AC_ACCOUNT   8300.00
15            100  FI_ACCOUNT   7920.00
8              50      ST_MAN   7280.00
5              40      HR_REP   6500.00
2              20      MK_REP   6000.00
9              60     IT_PROG   5760.00
0              10     AD_ASST   4400.00
6              50    SH_CLERK   3215.00
7              50    ST_CLERK   2785.00
3              30    PU_CLERK   2780.00

六、pandas - 数据分析

1. 数据的导入导出

2. 缺失值的处理

3. 重复数据处理

4. 数据类型的转换

5. 数据变形

6. 数据分箱

7. 时间数据的处理

8. 分组聚合

七、pandas - 数据分析 - 案例

五、pandas-DataFrame-案例

四、pandas - DataFrame

三、pandas-Series-案例

评论 (0)