Dit geeft al een beeld van typische patronen die we kunnen ontdekken, maar een echt Data Science probleem vertrekt vanuit een case/vraag
# Build a pandas Series from a Python list; the default index is 0..n-1.
import pandas as pd
data = ['a','b','c','d']
s = pd.Series(data) # you can also use NumPy ndarrays
print(s)
0 a 1 b 2 c 3 d dtype: object
# Same data, but with explicit (integer) index labels instead of 0..3.
s = pd.Series(data,index=[100,101,102,103])
print(s)
100 a 101 b 102 c 103 d dtype: object
# A dict maps directly onto a Series: keys become the index labels.
dicti = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(dicti)
print(s)
a 0.0 b 1.0 c 2.0 dtype: float64
# Reorder via an explicit index; label 'd' is absent from the dict -> NaN.
s = pd.Series(dicti,index=['b','c','d','a'])
print(s)
print(s[0:2])  # positional slice: first two rows (end exclusive)
print(s['b'])  # scalar lookup by label
print(s[['a', 'b']])  # fancy indexing with a list of labels
print(s['c':'a'])  # label slice: both endpoints INCLUSIVE
b 1.0 c 2.0 d NaN a 0.0 dtype: float64 b 1.0 c 2.0 dtype: float64 1.0 a 0.0 b 1.0 dtype: float64 c 2.0 d NaN a 0.0 dtype: float64
# A scalar value is broadcast to every label in the index.
s = pd.Series(5, index=[3, 4, 7, 8])
print(s)
print(s[0:2])  # slicing with [] is positional here, even though the labels are ints
print(s[3:4])  # positions 3:4 -> the row labelled 8 (NOTE: this [] style is deprecated in recent pandas)
print(s.loc[3:4])  # label-based and inclusive: rows labelled 3 and 4
print(s.iloc[3:4])  # position-based and end-exclusive: the 4th row (label 8)
3 5 4 5 7 5 8 5 dtype: int64 3 5 4 5 dtype: int64 8 5 dtype: int64 3 5 4 5 dtype: int64 8 5 dtype: int64
Bron: Geo-Python course, Department of Geosciences and Geography, University of Helsinki, https://geo-python.github.io/site/notebooks/L5/exploring-data-using-pandas.html
# Build a DataFrame from a dict of equal-length columns.
import pandas as pd
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28,34,29,42]}
# you can also use Pandas Series and NumPy ndarrays
df = pd.DataFrame(data)
df
| Name | Age | |
|---|---|---|
| 0 | Tom | 28 |
| 1 | Jack | 34 |
| 2 | Steve | 29 |
| 3 | Ricky | 42 |
# From a list of dicts (one dict per row): keys missing in a row become NaN.
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
df
| a | b | c | |
|---|---|---|---|
| 0 | 1 | 2 | NaN |
| 1 | 5 | 10 | 20.0 |
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
# index = row labels (must match the number of data rows)
# columns = column labels (can be used for selection)
# column 'd' does not occur in the data, so it is filled with NaN
df = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b', 'd'])
df
| a | b | d | |
|---|---|---|---|
| first | 1 | 2 | NaN |
| second | 5 | 10 | NaN |
# column manipulation
print(df['a'])  # select a single column (returns a Series)
df['c'] = [5, 6]  # add a new column from a list
print(df)
df['d'] = df['a'] + df['b']  # add a derived column (element-wise sum)
print(df)
del df['d'] # delete a column
print(df.pop('c')) # 'pop' a column: remove it AND return it
first 1
second 5
Name: a, dtype: int64
a b d c
first 1 2 NaN 5
second 5 10 NaN 6
a b d c
first 1 2 3 5
second 5 10 15 6
first 5
second 6
Name: c, dtype: int64
# row manipulation
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
# concat keeps the original row labels, so 0 and 1 each appear twice below
df3 = pd.concat([df, df2])
df3
| A | B | |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 3 | 4 |
| 0 | 5 | 6 |
| 1 | 7 | 8 |
# drop(0) removes EVERY row with label 0 -- here two rows, since the label is duplicated
df3 = df3.drop(0)
df3
| A | B | |
|---|---|---|
| 1 | 3 | 4 |
| 1 | 7 | 8 |
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
# ignore_index=True renumbers the result 0..3; does not affect df2 !!
df3 = pd.concat([df, df2], ignore_index=True) # beïnvloedt df2 niet !!
df3
| A | B | |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 3 | 4 |
| 2 | 5 | 6 |
| 3 | 7 | 8 |
# loc: indexing and slicing with labels (inclusive endpoints)
# iloc: indexing and slicing with integer positions (end exclusive)
df3.loc[1:] # slicing with unique row labels
| A | B | |
|---|---|---|
| 1 | 3 | 4 |
| 2 | 5 | 6 |
| 3 | 7 | 8 |
print(df.loc[1, 'A'])  # scalar access: row label 1, column 'A'
df.iloc[0,0] = 666  # positional assignment: first row, first column
df
3
| A | B | |
|---|---|---|
| 0 | 666 | 2 |
| 1 | 3 | 4 |
import pandas as pd
import numpy as np
# example with implicit MultiIndex construction:
# a list of two label lists becomes a two-level row index (class, student)
lijst = [['2ICT1']*2+['2ICT2']*2, ['Jan', 'Piet', 'Joris', 'Corneel']]
df = pd.DataFrame(np.random.randn(4, 4), index=lijst)
df
| 0 | 1 | 2 | 3 | ||
|---|---|---|---|---|---|
| 2ICT1 | Jan | 0.758809 | 0.796356 | 0.329736 | 1.186778 |
| Piet | 0.704024 | -0.997745 | 0.388996 | 0.574466 | |
| 2ICT2 | Joris | -1.645122 | -0.402663 | -0.529504 | -0.912307 |
| Corneel | 0.254901 | -1.248586 | 0.285135 | 1.389129 |
Bron: Geo-Python course, Department of Geosciences and Geography, University of Helsinki, https://geo-python.github.io/site/notebooks/L5/exploring-data-using-pandas.html
# Data file contents: Daily temperatures (mean, min, max) for Kumpula, Helsinki
# for June 1-30, 2016
# Data source: https://www.ncdc.noaa.gov/cdo-web/search?datasetid=GHCND
# Data processing: Extracted temperatures from raw data file, converted to
# comma-separated format
#
# David Whipp - 02.10.2017
YEARMODA,TEMP,MAX,MIN
20160601,65.5,73.6,54.7
20160602,65.8,80.8,55.0
20160603,68.4,,55.6
...
import pandas as pd
# Read the file using pandas; skiprows=8 skips the metadata/comment header lines
data = pd.read_csv('Kumpula-June-2016-w-metadata.txt', skiprows=8)
data.head()
| YEARMODA | TEMP | MAX | MIN | |
|---|---|---|---|---|
| 0 | 20160601 | 65.5 | 73.6 | 54.7 |
| 1 | 20160602 | 65.8 | 80.8 | 55.0 |
| 2 | 20160603 | 68.4 | NaN | 55.6 |
| 3 | 20160604 | 57.5 | 70.9 | 47.3 |
| 4 | 20160605 | 51.4 | 58.3 | 43.2 |
data.tail(3)  # last three rows of the data frame
| YEARMODA | TEMP | MAX | MIN | |
|---|---|---|---|---|
| 27 | 20160628 | 65.4 | 73.0 | 55.8 |
| 28 | 20160629 | 65.8 | 73.2 | NaN |
| 29 | 20160630 | 65.7 | 72.7 | 59.2 |
print(len(data))  # number of rows
print(data.shape)  # (rows, columns)
print(data.columns.values)  # column names as an array
print(data.index)  # row index (a RangeIndex here)
30 (30, 4) ['YEARMODA' 'TEMP' 'MAX' 'MIN'] RangeIndex(start=0, stop=30, step=1)
# Select a subset of columns -- note the double brackets: a LIST of column names
selection = data[['YEARMODA','TEMP']]
print(selection.head())
# Each column (and each row) in a pandas data frame is actually a pandas Series
print(type(data['TEMP']))
YEARMODA TEMP 0 20160601 65.5 1 20160602 65.8 2 20160603 68.4 3 20160604 57.5 4 20160605 51.4 <class 'pandas.core.series.Series'>
# Descriptive statistics
# use mean(), median(), min(), max(), and std() on any series
print(data['TEMP'].mean())
print(data.mean())  # column-wise means; all columns are numeric here
59.730000000000004 YEARMODA 2.016062e+07 TEMP 5.973000e+01 MAX 6.804074e+01 MIN 5.125714e+01 dtype: float64
data.describe()  # count/mean/std/min/quartiles/max per numeric column
| YEARMODA | TEMP | MAX | MIN | |
|---|---|---|---|---|
| count | 3.000000e+01 | 30.000000 | 27.000000 | 28.000000 |
| mean | 2.016062e+07 | 59.730000 | 68.040741 | 51.257143 |
| std | 8.803408e+00 | 5.475472 | 6.505575 | 5.498985 |
| min | 2.016060e+07 | 49.400000 | 54.100000 | 41.700000 |
| 25% | 2.016061e+07 | 56.450000 | 64.650000 | 46.975000 |
| 50% | 2.016062e+07 | 60.050000 | 69.100000 | 53.100000 |
| 75% | 2.016062e+07 | 64.900000 | 72.050000 | 55.600000 |
| max | 2.016063e+07 | 69.600000 | 80.800000 | 60.300000 |
# Basic plots: a line plot of the three temperature columns
data[["TEMP", "MAX", "MIN"]].plot()
<Axes: >
# Calculate max min difference (NaN in either operand yields NaN)
data['DIFF'] = data['MAX'] - data['MIN']
# Check the result
data.head()
| YEARMODA | TEMP | MAX | MIN | DIFF | |
|---|---|---|---|---|---|
| 0 | 20160601 | 65.5 | 73.6 | 54.7 | 18.9 |
| 1 | 20160602 | 65.8 | 80.8 | 55.0 | 25.8 |
| 2 | 20160603 | 68.4 | NaN | 55.6 | NaN |
| 3 | 20160604 | 57.5 | 70.9 | 47.3 | 23.6 |
| 4 | 20160605 | 51.4 | 58.3 | 43.2 | 15.1 |
# Filtering and updating data
# Fahrenheit -> Celsius: (F - 32) * 5/9
data['TEMP_CELSIUS'] = (data['TEMP'] - 32) / (9/5)
warm_temps = data.loc[data['TEMP_CELSIUS'] > 15]  # boolean mask keeps matching rows
warm_temps.head()
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 0 | 20160601 | 65.5 | 73.6 | 54.7 | 18.9 | 18.611111 |
| 1 | 20160602 | 65.8 | 80.8 | 55.0 | 25.8 | 18.777778 |
| 2 | 20160603 | 68.4 | NaN | 55.6 | NaN | 20.222222 |
| 13 | 20160614 | 59.7 | 67.8 | 47.8 | 20.0 | 15.388889 |
| 14 | 20160615 | 63.4 | 70.3 | 49.3 | 21.0 | 17.444444 |
# Select rows with temp celsius higher than 15 degrees from late June 2016
warm_temps = data.loc[(data['TEMP_CELSIUS'] > 15) & (data['YEARMODA'] >= 20160615)]
# boolean indexing: | for or, & for and, ~ for not
# (each condition must be parenthesised: & binds tighter than the comparisons)
warm_temps.head()
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 14 | 20160615 | 63.4 | 70.3 | 49.3 | 21.0 | 17.444444 |
| 16 | 20160617 | 60.4 | 70.7 | 55.9 | 14.8 | 15.777778 |
| 19 | 20160620 | 59.3 | 69.1 | 52.2 | 16.9 | 15.166667 |
| 20 | 20160621 | 62.6 | 71.4 | 50.4 | 21.0 | 17.000000 |
| 21 | 20160622 | 61.7 | 70.2 | 55.4 | 14.8 | 16.500000 |
warm_temps.reset_index(drop=True, inplace=True)
# drop -> drop the old index instead of adding it as a separate column
# inplace -> modify the current data frame
warm_temps.head(12)
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 0 | 20160615 | 63.4 | 70.3 | 49.3 | 21.0 | 17.444444 |
| 1 | 20160617 | 60.4 | 70.7 | 55.9 | 14.8 | 15.777778 |
| 2 | 20160620 | 59.3 | 69.1 | 52.2 | 16.9 | 15.166667 |
| 3 | 20160621 | 62.6 | 71.4 | 50.4 | 21.0 | 17.000000 |
| 4 | 20160622 | 61.7 | 70.2 | 55.4 | 14.8 | 16.500000 |
| 5 | 20160623 | 60.9 | 67.1 | 54.9 | 12.2 | 16.055556 |
| 6 | 20160624 | 61.1 | 68.9 | 56.7 | 12.2 | 16.166667 |
| 7 | 20160625 | 65.7 | 75.4 | 57.9 | 17.5 | 18.722222 |
| 8 | 20160626 | 69.6 | 77.7 | 60.3 | 17.4 | 20.888889 |
| 9 | 20160627 | 60.7 | 70.0 | NaN | NaN | 15.944444 |
| 10 | 20160628 | 65.4 | 73.0 | 55.8 | 17.2 | 18.555556 |
| 11 | 20160629 | 65.8 | 73.2 | NaN | NaN | 18.777778 |
# isna() and isnull() are aliases: both detect NaN, None and NaT
# (pandas has no separate "NaN only" detector)
warm_temps.isna().head(12)
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False |
| 5 | False | False | False | False | False | False |
| 6 | False | False | False | False | False | False |
| 7 | False | False | False | False | False | False |
| 8 | False | False | False | False | False | False |
| 9 | False | False | False | True | True | False |
| 10 | False | False | False | False | False | False |
| 11 | False | False | False | True | True | False |
# Drop no data values based on the MIN column
# returns a NEW frame; warm_temps itself is left unchanged
warm_temps.dropna(subset=['MIN']) # not an inplace operation
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 0 | 20160615 | 63.4 | 70.3 | 49.3 | 21.0 | 17.444444 |
| 1 | 20160617 | 60.4 | 70.7 | 55.9 | 14.8 | 15.777778 |
| 2 | 20160620 | 59.3 | 69.1 | 52.2 | 16.9 | 15.166667 |
| 3 | 20160621 | 62.6 | 71.4 | 50.4 | 21.0 | 17.000000 |
| 4 | 20160622 | 61.7 | 70.2 | 55.4 | 14.8 | 16.500000 |
| 5 | 20160623 | 60.9 | 67.1 | 54.9 | 12.2 | 16.055556 |
| 6 | 20160624 | 61.1 | 68.9 | 56.7 | 12.2 | 16.166667 |
| 7 | 20160625 | 65.7 | 75.4 | 57.9 | 17.5 | 18.722222 |
| 8 | 20160626 | 69.6 | 77.7 | 60.3 | 17.4 | 20.888889 |
| 10 | 20160628 | 65.4 | 73.0 | 55.8 | 17.2 | 18.555556 |
| 12 | 20160630 | 65.7 | 72.7 | 59.2 | 13.5 | 18.722222 |
# Fill na values with a sentinel; returns a NEW frame
warm_temps.fillna(-100) # not an inplace operation
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 0 | 20160615 | 63.4 | 70.3 | 49.3 | 21.0 | 17.444444 |
| 1 | 20160617 | 60.4 | 70.7 | 55.9 | 14.8 | 15.777778 |
| 2 | 20160620 | 59.3 | 69.1 | 52.2 | 16.9 | 15.166667 |
| 3 | 20160621 | 62.6 | 71.4 | 50.4 | 21.0 | 17.000000 |
| 4 | 20160622 | 61.7 | 70.2 | 55.4 | 14.8 | 16.500000 |
| 5 | 20160623 | 60.9 | 67.1 | 54.9 | 12.2 | 16.055556 |
| 6 | 20160624 | 61.1 | 68.9 | 56.7 | 12.2 | 16.166667 |
| 7 | 20160625 | 65.7 | 75.4 | 57.9 | 17.5 | 18.722222 |
| 8 | 20160626 | 69.6 | 77.7 | 60.3 | 17.4 | 20.888889 |
| 9 | 20160627 | 60.7 | 70.0 | -100.0 | -100.0 | 15.944444 |
| 10 | 20160628 | 65.4 | 73.0 | 55.8 | 17.2 | 18.555556 |
| 11 | 20160629 | 65.8 | 73.2 | -100.0 | -100.0 | 18.777778 |
| 12 | 20160630 | 65.7 | 72.7 | 59.2 | 13.5 | 18.722222 |
# Data type conversions -- astype()
print("Truncated integer values:")
data['TEMP'].astype(int).head()  # astype(int) truncates toward zero, it does NOT round
Truncated integer values:
0 65 1 65 2 68 3 57 4 51 Name: TEMP, dtype: int32
print("Rounded integer values:")
print(data['TEMP'].round(0).astype(int).head())  # round first, then convert
Rounded integer values: 0 66 1 66 2 68 3 58 4 51 Name: TEMP, dtype: int32
data['TEMP'].unique()  # distinct values, in order of first appearance
array([65.5, 65.8, 68.4, 57.5, 51.4, 52.2, 56.9, 54.2, 49.4, 49.5, 54. ,
55.4, 58.3, 59.7, 63.4, 57.8, 60.4, 57.3, 56.3, 59.3, 62.6, 61.7,
60.9, 61.1, 65.7, 69.6, 60.7, 65.4])
data.sort_values(by='TEMP', ascending=False).head()  # the five warmest days
| YEARMODA | TEMP | MAX | MIN | DIFF | TEMP_CELSIUS | |
|---|---|---|---|---|---|---|
| 25 | 20160626 | 69.6 | 77.7 | 60.3 | 17.4 | 20.888889 |
| 2 | 20160603 | 68.4 | NaN | 55.6 | NaN | 20.222222 |
| 1 | 20160602 | 65.8 | 80.8 | 55.0 | 25.8 | 18.777778 |
| 28 | 20160629 | 65.8 | 73.2 | NaN | NaN | 18.777778 |
| 29 | 20160630 | 65.7 | 72.7 | 59.2 | 13.5 | 18.722222 |
Table of Contents
Table of Contents
git clone https://github.com/realpython/python-data-cleaning.git
pandas merge()
Similar to a relational database's join operations.
import pandas as pd
# Left table: one row per user with name and e-mail address
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
 'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
 'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
 'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
 'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
 })
df1
| user_id | first_name | last_name | ||
|---|---|---|---|---|
| 0 | id001 | Rivi | Valti | rvalti0@example.com |
| 1 | id002 | Wynnie | McMurty | wmcmurty1@example.com |
| 2 | id003 | Kristos | Ivanets | kivanets2@example.com |
| 3 | id004 | Madalyn | Max | mmax3@example.com |
| 4 | id005 | Tobe | Riddich | triddich4@example.com |
| 5 | id006 | Regan | Huyghe | rhuyghe@example.com |
| 6 | id007 | Kristin | Illis | killis4@example.com |
# Right table: avatar URLs, but only for the first five user_ids
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
 'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
 'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
 'http://example.com/img/id005.png']
 })
df2.head()
| user_id | image_url | |
|---|---|---|
| 0 | id001 | http://example.com/img/id001.png |
| 1 | id002 | http://example.com/img/id002.jpg |
| 2 | id003 | http://example.com/img/id003.bmp |
| 3 | id004 | http://example.com/img/id004.jpg |
| 4 | id005 | http://example.com/img/id005.png |
# Default merge is an INNER join on the shared 'user_id' column:
# only user_ids present in BOTH frames survive (id006/id007 are dropped)
df3_merged = pd.merge(df1, df2)
df3_merged
| user_id | first_name | last_name | image_url | ||
|---|---|---|---|---|---|
| 0 | id001 | Rivi | Valti | rvalti0@example.com | http://example.com/img/id001.png |
| 1 | id002 | Wynnie | McMurty | wmcmurty1@example.com | http://example.com/img/id002.jpg |
| 2 | id003 | Kristos | Ivanets | kivanets2@example.com | http://example.com/img/id003.bmp |
| 3 | id004 | Madalyn | Max | mmax3@example.com | http://example.com/img/id004.jpg |
| 4 | id005 | Tobe | Riddich | triddich4@example.com | http://example.com/img/id005.png |
# LEFT join: keep every row of df1; unmatched rows get NaN for df2's columns
df_left_merge = pd.merge(df1, df2, how ='left')
df_left_merge
| user_id | first_name | last_name | image_url | ||
|---|---|---|---|---|---|
| 0 | id001 | Rivi | Valti | rvalti0@example.com | http://example.com/img/id001.png |
| 1 | id002 | Wynnie | McMurty | wmcmurty1@example.com | http://example.com/img/id002.jpg |
| 2 | id003 | Kristos | Ivanets | kivanets2@example.com | http://example.com/img/id003.bmp |
| 3 | id004 | Madalyn | Max | mmax3@example.com | http://example.com/img/id004.jpg |
| 4 | id005 | Tobe | Riddich | triddich4@example.com | http://example.com/img/id005.png |
| 5 | id006 | Regan | Huyghe | rhuyghe@example.com | NaN |
| 6 | id007 | Kristin | Illis | killis4@example.com | NaN |
# use of join
# join() aligns on the row INDEX (not on 'user_id'); rsuffix renames df2's
# overlapping 'user_id' column to 'user_id_right' in the result
df_join = df1.join(df2, rsuffix='_right')
df_join
| user_id | first_name | last_name | user_id_right | image_url | ||
|---|---|---|---|---|---|---|
| 0 | id001 | Rivi | Valti | rvalti0@example.com | id001 | http://example.com/img/id001.png |
| 1 | id002 | Wynnie | McMurty | wmcmurty1@example.com | id002 | http://example.com/img/id002.jpg |
| 2 | id003 | Kristos | Ivanets | kivanets2@example.com | id003 | http://example.com/img/id003.bmp |
| 3 | id004 | Madalyn | Max | mmax3@example.com | id004 | http://example.com/img/id004.jpg |
| 4 | id005 | Tobe | Riddich | triddich4@example.com | id005 | http://example.com/img/id005.png |
| 5 | id006 | Regan | Huyghe | rhuyghe@example.com | NaN | NaN |
| 6 | id007 | Kristin | Illis | killis4@example.com | NaN | NaN |
# use of concat
# Extra rows (the two missing users) to append to df2
df2_addition = pd.DataFrame({'user_id': ['id006', 'id007'],
 'image_url': ['http://example.com/img/id006.png',
 'http://example.com/img/id007.jpg']
 })
df2_addition
| user_id | image_url | |
|---|---|---|
| 0 | id006 | http://example.com/img/id006.png |
| 1 | id007 | http://example.com/img/id007.jpg |
# Stack the two frames vertically; original row labels are kept (0..4 then 0..1)
df_row_concat = pd.concat([df2, df2_addition])
df_row_concat
| user_id | image_url | |
|---|---|---|
| 0 | id001 | http://example.com/img/id001.png |
| 1 | id002 | http://example.com/img/id002.jpg |
| 2 | id003 | http://example.com/img/id003.bmp |
| 3 | id004 | http://example.com/img/id004.jpg |
| 4 | id005 | http://example.com/img/id005.png |
| 0 | id006 | http://example.com/img/id006.png |
| 1 | id007 | http://example.com/img/id007.jpg |