Pandas: an introduction

K. Verbeeck, T. Van den Bossche, J. Maervoet

Data Science (Theory) -- OGI03p

The Pandas library

  • High-performance support for data manipulation and data analysis
  • Main data structure: DataFrame (typically tabular, labeled data)
  • Builds on and integrates with NumPy (!)
  • Getting started: see 10 minutes to pandas and Introduction for beginners

Data Science context

  • How do we proceed in practice? Methodology ~ step-by-step plan
  • Several models exist

[Figure: CRISP-DM process diagram]

Cross-industry standard process for data mining (CRISP-DM)

Simple examples of modeling

[Figure: debt.gif]

This already gives an idea of the typical patterns we can discover, but a real Data Science problem starts from a case or question

Where can Pandas be useful?

  • Mainly in the Data Understanding and Data Preparation steps
  • Typical operations
    • Loading data (e.g. in tabular form)
    • Exploring raw data (including descriptive statistics and visualization)
    • Making subselections of the data (e.g. feature selection)
    • Cleaning data (e.g. handling missing values)
    • Combining data(sets)
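A minimal sketch of these typical operations on a tiny in-memory dataset (the column names and values are invented for illustration; in practice the data would come from a file via e.g. pd.read_csv):

```python
import pandas as pd

# Stand-in for loaded data (in practice: pd.read_csv(...))
df = pd.DataFrame({'temp': [65.5, 65.8, None, 57.5],
                   'wind': [3.1, 2.4, 5.0, 4.2]})

print(df.describe())          # explore: descriptive statistics
subset = df[['temp']]         # subselect: keep selected columns (features)
clean = df.dropna()           # clean: drop rows with missing values

# combine with another (invented) dataset
extra = pd.DataFrame({'temp': [51.4], 'wind': [1.9]})
combined = pd.concat([clean, extra], ignore_index=True)
print(combined)
```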

Pandas data structures

Data structure #1: Series

  • 1-dimensional
  • homogeneous data
  • size is immutable
  • axis labels: index
In [1]:
import pandas as pd
data = ['a','b','c','d']
s = pd.Series(data) # you can also use NumPy ndarrays
print(s)
0    a
1    b
2    c
3    d
dtype: object
In [2]:
s = pd.Series(data,index=[100,101,102,103])
print(s)
100    a
101    b
102    c
103    d
dtype: object
In [3]:
dicti = {'a' : 0., 'b' : 1., 'c' : 2.}
s = pd.Series(dicti)
print(s)
a    0.0
b    1.0
c    2.0
dtype: float64
In [4]:
s = pd.Series(dicti,index=['b','c','d','a'])
print(s)
print(s[0:2]) 
print(s['b'])
print(s[['a', 'b']])
print(s['c':'a'])
b    1.0
c    2.0
d    NaN
a    0.0
dtype: float64
b    1.0
c    2.0
dtype: float64
1.0
a    0.0
b    1.0
dtype: float64
c    2.0
d    NaN
a    0.0
dtype: float64
In [5]:
s = pd.Series(5, index=[3, 4, 7, 8])
print(s)
print(s[0:2])
print(s[3:4])
print(s.loc[3:4])
print(s.iloc[3:4])
3    5
4    5
7    5
8    5
dtype: int64
3    5
4    5
dtype: int64
8    5
dtype: int64
3    5
4    5
dtype: int64
8    5
dtype: int64

Data structure #2: DataFrame

  • 2-dimensional
  • heterogeneous data
  • size is mutable (columns/rows can be added)
  • axis labels: index, columns

[Figure: pandas data structures, annotated (DataFrame with index and columns)]

Source: Geo-Python course, Department of Geosciences and Geography, University of Helsinki, https://geo-python.github.io/site/notebooks/L5/exploring-data-using-pandas.html

In [6]:
import pandas as pd
data = {'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28,34,29,42]}
# you can also use Pandas Series and NumPy ndarrays
df = pd.DataFrame(data)
df
Out[6]:
    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42
In [7]:
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df = pd.DataFrame(data)
df
Out[7]:
   a   b     c
0  1   2   NaN
1  5  10  20.0
In [8]:
data = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]

# index = row labels (must match the number of rows of data)
# columns = column labels (can be used for selection)
df = pd.DataFrame(data, index=['first', 'second'], columns=['a', 'b', 'd'])
df
Out[8]:
        a   b    d
first   1   2  NaN
second  5  10  NaN
In [9]:
# column manipulation
print(df['a'])
df['c'] = [5, 6]
print(df)
df['d'] = df['a'] + df['b']
print(df)

del df['d'] # delete a column
print(df.pop('c')) # 'pop' a column (remove it and return it)
first     1
second    5
Name: a, dtype: int64
        a   b   d  c
first   1   2 NaN  5
second  5  10 NaN  6
        a   b   d  c
first   1   2   3  5
second  5  10  15  6
first     5
second    6
Name: c, dtype: int64
In [10]:
# row manipulation
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df3 = pd.concat([df, df2])
df3
Out[10]:
   A  B
0  1  2
1  3  4
0  5  6
1  7  8
In [11]:
df3 = df3.drop(0)
df3
Out[11]:
   A  B
1  3  4
1  7  8
In [12]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df3 = pd.concat([df, df2], ignore_index=True) # does not modify df2 !!
df3
Out[12]:
   A  B
0  1  2
1  3  4
2  5  6
3  7  8
In [13]:
# loc: indexing and slicing with labels
# iloc: indexing and slicing with integer positions

df3.loc[1:] # slicing with unique row labels
Out[13]:
   A  B
1  3  4
2  5  6
3  7  8
In [14]:
print(df.loc[1, 'A'])
df.iloc[0,0] = 666
df
3
Out[14]:
     A  B
0  666  2
1    3  4

Data structure #3: ...

  • Deprecated: Panel (3-dimensional, heterogeneous data)
  • Nowadays: either an ndarray, or a DataFrame with a MultiIndex (hierarchical indexing)
In [15]:
import pandas as pd
import numpy as np

# example with implicit construction

lijst = [['2ICT1']*2+['2ICT2']*2, ['Jan', 'Piet', 'Joris', 'Corneel']]

df = pd.DataFrame(np.random.randn(4, 4), index=lijst)
df
Out[15]:
                      0         1         2         3
2ICT1 Jan      0.758809  0.796356  0.329736  1.186778
      Piet     0.704024 -0.997745  0.388996  0.574466
2ICT2 Joris   -1.645122 -0.402663 -0.529504 -0.912307
      Corneel  0.254901 -1.248586  0.285135  1.389129
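The same hierarchical index can also be constructed explicitly with pd.MultiIndex.from_arrays; a sketch using the same labels as above (the level names 'group' and 'student' are illustrative additions):

```python
import numpy as np
import pandas as pd

# Explicit MultiIndex construction (level names are illustrative)
idx = pd.MultiIndex.from_arrays(
    [['2ICT1', '2ICT1', '2ICT2', '2ICT2'],
     ['Jan', 'Piet', 'Joris', 'Corneel']],
    names=['group', 'student'])

df = pd.DataFrame(np.random.randn(4, 4), index=idx)
print(df.loc['2ICT1'])  # select all rows under one outer-level label
```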

Exploring and processing data with Pandas: a first warm-up

Source: Geo-Python course, Department of Geosciences and Geography, University of Helsinki, https://geo-python.github.io/site/notebooks/L5/exploring-data-using-pandas.html

Kumpula-June-2016-w-metadata.txt

# Data file contents: Daily temperatures (mean, min, max) for Kumpula, Helsinki
# for June 1-30, 2016
# Data source: https://www.ncdc.noaa.gov/cdo-web/search?datasetid=GHCND
# Data processing: Extracted temperatures from raw data file, converted to
# comma-separated format
#
# David Whipp - 02.10.2017

YEARMODA,TEMP,MAX,MIN
20160601,65.5,73.6,54.7
20160602,65.8,80.8,55.0
20160603,68.4,,55.6
...

In [16]:
import pandas as pd

# Read the file using pandas
data = pd.read_csv('Kumpula-June-2016-w-metadata.txt', skiprows=8)
data.head()
Out[16]:
   YEARMODA  TEMP   MAX   MIN
0  20160601  65.5  73.6  54.7
1  20160602  65.8  80.8  55.0
2  20160603  68.4   NaN  55.6
3  20160604  57.5  70.9  47.3
4  20160605  51.4  58.3  43.2
In [17]:
data.tail(3)
Out[17]:
    YEARMODA  TEMP   MAX   MIN
27  20160628  65.4  73.0  55.8
28  20160629  65.8  73.2   NaN
29  20160630  65.7  72.7  59.2
In [18]:
print(len(data))
print(data.shape)
print(data.columns.values)
print(data.index)
30
(30, 4)
['YEARMODA' 'TEMP' 'MAX' 'MIN']
RangeIndex(start=0, stop=30, step=1)
In [19]:
selection = data[['YEARMODA','TEMP']]
print(selection.head())

# Each column (and each row) in a pandas data frame is actually a pandas Series
print(type(data['TEMP']))
   YEARMODA  TEMP
0  20160601  65.5
1  20160602  65.8
2  20160603  68.4
3  20160604  57.5
4  20160605  51.4
<class 'pandas.core.series.Series'>
In [20]:
# Descriptive statistics
# use mean(), median(), min(), max(), and std() on any series

print(data['TEMP'].mean())
print(data.mean())
59.730000000000004
YEARMODA    2.016062e+07
TEMP        5.973000e+01
MAX         6.804074e+01
MIN         5.125714e+01
dtype: float64
In [21]:
data.describe()
Out[21]:
           YEARMODA       TEMP        MAX        MIN
count  3.000000e+01  30.000000  27.000000  28.000000
mean   2.016062e+07  59.730000  68.040741  51.257143
std    8.803408e+00   5.475472   6.505575   5.498985
min    2.016060e+07  49.400000  54.100000  41.700000
25%    2.016061e+07  56.450000  64.650000  46.975000
50%    2.016062e+07  60.050000  69.100000  53.100000
75%    2.016062e+07  64.900000  72.050000  55.600000
max    2.016063e+07  69.600000  80.800000  60.300000
In [22]:
# Basic plots

data[["TEMP", "MAX", "MIN"]].plot()
Out[22]:
<Axes: >
In [23]:
# Calculate max min difference
data['DIFF'] = data['MAX'] - data['MIN']

# Check the result
data.head()
Out[23]:
   YEARMODA  TEMP   MAX   MIN  DIFF
0  20160601  65.5  73.6  54.7  18.9
1  20160602  65.8  80.8  55.0  25.8
2  20160603  68.4   NaN  55.6   NaN
3  20160604  57.5  70.9  47.3  23.6
4  20160605  51.4  58.3  43.2  15.1
In [24]:
# Filtering and updating data
data['TEMP_CELSIUS'] = (data['TEMP'] - 32) / (9/5)
warm_temps = data.loc[data['TEMP_CELSIUS'] > 15]
warm_temps.head()
Out[24]:
    YEARMODA  TEMP   MAX   MIN  DIFF  TEMP_CELSIUS
0   20160601  65.5  73.6  54.7  18.9     18.611111
1   20160602  65.8  80.8  55.0  25.8     18.777778
2   20160603  68.4   NaN  55.6   NaN     20.222222
13  20160614  59.7  67.8  47.8  20.0     15.388889
14  20160615  63.4  70.3  49.3  21.0     17.444444
In [25]:
# Select rows with temp celsius higher than 15 degrees from late June 2016
warm_temps = data.loc[(data['TEMP_CELSIUS'] > 15) & (data['YEARMODA'] >= 20160615)]
# boolean indexing: | for or, & for and, ~ for not
warm_temps.head()
Out[25]:
    YEARMODA  TEMP   MAX   MIN  DIFF  TEMP_CELSIUS
14  20160615  63.4  70.3  49.3  21.0     17.444444
16  20160617  60.4  70.7  55.9  14.8     15.777778
19  20160620  59.3  69.1  52.2  16.9     15.166667
20  20160621  62.6  71.4  50.4  21.0     17.000000
21  20160622  61.7  70.2  55.4  14.8     16.500000
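The `~` (not) operator is mentioned above but not demonstrated; a sketch on a small stand-in frame with the same column names:

```python
import pandas as pd

# Small stand-in frame with the same column names as above
data = pd.DataFrame({'YEARMODA': [20160601, 20160615, 20160626],
                     'TEMP_CELSIUS': [18.6, 17.4, 13.0]})

# ~ inverts a boolean mask: rows that are NOT warmer than 15 degrees
cold = data.loc[~(data['TEMP_CELSIUS'] > 15)]
print(cold)
```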
In [26]:
warm_temps.reset_index(drop=True, inplace=True)
 # drop -> drop the old index instead of adding it as a separate index column
 # inplace -> modify the current data frame
warm_temps.head(12)
Out[26]:
    YEARMODA  TEMP   MAX   MIN  DIFF  TEMP_CELSIUS
0   20160615  63.4  70.3  49.3  21.0     17.444444
1   20160617  60.4  70.7  55.9  14.8     15.777778
2   20160620  59.3  69.1  52.2  16.9     15.166667
3   20160621  62.6  71.4  50.4  21.0     17.000000
4   20160622  61.7  70.2  55.4  14.8     16.500000
5   20160623  60.9  67.1  54.9  12.2     16.055556
6   20160624  61.1  68.9  56.7  12.2     16.166667
7   20160625  65.7  75.4  57.9  17.5     18.722222
8   20160626  69.6  77.7  60.3  17.4     20.888889
9   20160627  60.7  70.0   NaN   NaN     15.944444
10  20160628  65.4  73.0  55.8  17.2     18.555556
11  20160629  65.8  73.2   NaN   NaN     18.777778
In [27]:
# isna() and isnull() are aliases: both detect missing values (NaN, None, NaT)
warm_temps.isna().head(12)
Out[27]:
    YEARMODA   TEMP    MAX    MIN   DIFF  TEMP_CELSIUS
0      False  False  False  False  False         False
1      False  False  False  False  False         False
2      False  False  False  False  False         False
3      False  False  False  False  False         False
4      False  False  False  False  False         False
5      False  False  False  False  False         False
6      False  False  False  False  False         False
7      False  False  False  False  False         False
8      False  False  False  False  False         False
9      False  False  False   True   True         False
10     False  False  False  False  False         False
11     False  False  False   True   True         False
In [28]:
# Drop no data values based on the MIN column
warm_temps.dropna(subset=['MIN']) # not an inplace operation
Out[28]:
    YEARMODA  TEMP   MAX   MIN  DIFF  TEMP_CELSIUS
0   20160615  63.4  70.3  49.3  21.0     17.444444
1   20160617  60.4  70.7  55.9  14.8     15.777778
2   20160620  59.3  69.1  52.2  16.9     15.166667
3   20160621  62.6  71.4  50.4  21.0     17.000000
4   20160622  61.7  70.2  55.4  14.8     16.500000
5   20160623  60.9  67.1  54.9  12.2     16.055556
6   20160624  61.1  68.9  56.7  12.2     16.166667
7   20160625  65.7  75.4  57.9  17.5     18.722222
8   20160626  69.6  77.7  60.3  17.4     20.888889
10  20160628  65.4  73.0  55.8  17.2     18.555556
12  20160630  65.7  72.7  59.2  13.5     18.722222
In [29]:
# Fill na values
warm_temps.fillna(-100) # not an inplace operation
Out[29]:
    YEARMODA  TEMP   MAX    MIN   DIFF  TEMP_CELSIUS
0   20160615  63.4  70.3   49.3   21.0     17.444444
1   20160617  60.4  70.7   55.9   14.8     15.777778
2   20160620  59.3  69.1   52.2   16.9     15.166667
3   20160621  62.6  71.4   50.4   21.0     17.000000
4   20160622  61.7  70.2   55.4   14.8     16.500000
5   20160623  60.9  67.1   54.9   12.2     16.055556
6   20160624  61.1  68.9   56.7   12.2     16.166667
7   20160625  65.7  75.4   57.9   17.5     18.722222
8   20160626  69.6  77.7   60.3   17.4     20.888889
9   20160627  60.7  70.0 -100.0 -100.0     15.944444
10  20160628  65.4  73.0   55.8   17.2     18.555556
11  20160629  65.8  73.2 -100.0 -100.0     18.777778
12  20160630  65.7  72.7   59.2   13.5     18.722222
In [30]:
# Data type conversions -- astype()

print("Truncated integer values:")
data['TEMP'].astype(int).head()
Truncated integer values:
Out[30]:
0    65
1    65
2    68
3    57
4    51
Name: TEMP, dtype: int32
In [31]:
print("Rounded integer values:")
print(data['TEMP'].round(0).astype(int).head())
Rounded integer values:
0    66
1    66
2    68
3    58
4    51
Name: TEMP, dtype: int32
In [32]:
data['TEMP'].unique()
Out[32]:
array([65.5, 65.8, 68.4, 57.5, 51.4, 52.2, 56.9, 54.2, 49.4, 49.5, 54. ,
       55.4, 58.3, 59.7, 63.4, 57.8, 60.4, 57.3, 56.3, 59.3, 62.6, 61.7,
       60.9, 61.1, 65.7, 69.6, 60.7, 65.4])
In [33]:
data.sort_values(by='TEMP', ascending=False).head()
Out[33]:
YEARMODA TEMP MAX MIN DIFF TEMP_CELSIUS
25 20160626 69.6 77.7 60.3 17.4 20.888889
2 20160603 68.4 NaN 55.6 NaN 20.222222
1 20160602 65.8 80.8 55.0 25.8 18.777778
28 20160629 65.8 73.2 NaN NaN 18.777778
29 20160630 65.7 72.7 59.2 13.5 18.722222

Data cleaning …

  • Pythonic Data Cleaning With NumPy and Pandas

Table of Contents

  • Dropping Columns in a DataFrame
  • Changing the Index of a DataFrame
  • Tidying up Fields in the Data
  • Combining str Methods with NumPy to Clean Columns
  • Cleaning the Entire Dataset Using the applymap Function
  • Renaming Columns and Skipping Rows
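A condensed sketch of a few of these cleaning steps on a toy frame (the column names and values are illustrative, not taken from the linked tutorial's dataset):

```python
import pandas as pd

# Toy frame; column names and values are illustrative
df = pd.DataFrame({'Identifier': [101, 102],
                   'Title': ['  A Tale  ', 'Moby-Dick'],
                   'Unused': ['x', 'y']})

df = df.drop(columns=['Unused'])            # dropping columns
df = df.set_index('Identifier')             # changing the index
df['Title'] = df['Title'].str.strip()       # tidying up fields with str methods
df = df.rename(columns={'Title': 'title'})  # renaming columns
print(df)
```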

Combining pandas dataframes …

  • Merging and joining dataframes

Table of Contents

  • pandas merge(): for combining data on common columns or indices
  • pandas .join(): for combining data on a key column or an index
  • pandas concat(): for combining DataFrames across rows or columns

git clone https://github.com/realpython/python-data-cleaning.git

pandas merge() is similar to a database's join operations

[Figures: inner join, outer join, left join]

In [42]:
import pandas as pd
df1 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005', 'id006', 'id007'],
                    'first_name': ['Rivi', 'Wynnie', 'Kristos', 'Madalyn', 'Tobe', 'Regan', 'Kristin'],
                    'last_name': ['Valti', 'McMurty', 'Ivanets', 'Max', 'Riddich', 'Huyghe', 'Illis'],
                    'email': ['rvalti0@example.com', 'wmcmurty1@example.com', 'kivanets2@example.com',
                              'mmax3@example.com', 'triddich4@example.com', 'rhuyghe@example.com', 'killis4@example.com']
                    })
df1
Out[42]:
  user_id first_name last_name                  email
0   id001       Rivi     Valti    rvalti0@example.com
1   id002     Wynnie   McMurty  wmcmurty1@example.com
2   id003    Kristos   Ivanets  kivanets2@example.com
3   id004    Madalyn       Max      mmax3@example.com
4   id005       Tobe   Riddich  triddich4@example.com
5   id006      Regan    Huyghe    rhuyghe@example.com
6   id007    Kristin     Illis    killis4@example.com
In [43]:
df2 = pd.DataFrame({'user_id': ['id001', 'id002', 'id003', 'id004', 'id005'],
                    'image_url': ['http://example.com/img/id001.png', 'http://example.com/img/id002.jpg',
                                  'http://example.com/img/id003.bmp', 'http://example.com/img/id004.jpg',
                                  'http://example.com/img/id005.png']
                    })

df2.head()
Out[43]:
  user_id                         image_url
0   id001  http://example.com/img/id001.png
1   id002  http://example.com/img/id002.jpg
2   id003  http://example.com/img/id003.bmp
3   id004  http://example.com/img/id004.jpg
4   id005  http://example.com/img/id005.png
In [44]:
df3_merged = pd.merge(df1, df2)
df3_merged
Out[44]:
  user_id first_name last_name                  email                         image_url
0   id001       Rivi     Valti    rvalti0@example.com  http://example.com/img/id001.png
1   id002     Wynnie   McMurty  wmcmurty1@example.com  http://example.com/img/id002.jpg
2   id003    Kristos   Ivanets  kivanets2@example.com  http://example.com/img/id003.bmp
3   id004    Madalyn       Max      mmax3@example.com  http://example.com/img/id004.jpg
4   id005       Tobe   Riddich  triddich4@example.com  http://example.com/img/id005.png
In [45]:
df_left_merge = pd.merge(df1, df2, how ='left')
df_left_merge
Out[45]:
  user_id first_name last_name                  email                         image_url
0   id001       Rivi     Valti    rvalti0@example.com  http://example.com/img/id001.png
1   id002     Wynnie   McMurty  wmcmurty1@example.com  http://example.com/img/id002.jpg
2   id003    Kristos   Ivanets  kivanets2@example.com  http://example.com/img/id003.bmp
3   id004    Madalyn       Max      mmax3@example.com  http://example.com/img/id004.jpg
4   id005       Tobe   Riddich  triddich4@example.com  http://example.com/img/id005.png
5   id006      Regan    Huyghe    rhuyghe@example.com                               NaN
6   id007    Kristin     Illis    killis4@example.com                               NaN
In [46]:
# use of join

df_join = df1.join(df2, rsuffix='_right')

df_join
Out[46]:
  user_id first_name last_name                  email user_id_right                         image_url
0   id001       Rivi     Valti    rvalti0@example.com         id001  http://example.com/img/id001.png
1   id002     Wynnie   McMurty  wmcmurty1@example.com         id002  http://example.com/img/id002.jpg
2   id003    Kristos   Ivanets  kivanets2@example.com         id003  http://example.com/img/id003.bmp
3   id004    Madalyn       Max      mmax3@example.com         id004  http://example.com/img/id004.jpg
4   id005       Tobe   Riddich  triddich4@example.com         id005  http://example.com/img/id005.png
5   id006      Regan    Huyghe    rhuyghe@example.com           NaN                               NaN
6   id007    Kristin     Illis    killis4@example.com           NaN                               NaN
In [47]:
# use of concat

df2_addition = pd.DataFrame({'user_id': ['id006', 'id007'],
                             'image_url': ['http://example.com/img/id006.png',
                                           'http://example.com/img/id007.jpg']
                             })
df2_addition
Out[47]:
  user_id                         image_url
0   id006  http://example.com/img/id006.png
1   id007  http://example.com/img/id007.jpg
In [48]:
df_row_concat = pd.concat([df2, df2_addition])

df_row_concat
Out[48]:
  user_id                         image_url
0   id001  http://example.com/img/id001.png
1   id002  http://example.com/img/id002.jpg
2   id003  http://example.com/img/id003.bmp
3   id004  http://example.com/img/id004.jpg
4   id005  http://example.com/img/id005.png
0   id006  http://example.com/img/id006.png
1   id007  http://example.com/img/id007.jpg
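Besides the inner and left merges shown above, an outer merge keeps unmatched rows from both sides, and passing indicator=True adds a _merge column recording each row's origin. A sketch on small stand-in frames (subsets of the user data above):

```python
import pandas as pd

# Small stand-in frames (subset of the user data above)
left = pd.DataFrame({'user_id': ['id001', 'id002', 'id006'],
                     'first_name': ['Rivi', 'Wynnie', 'Regan']})
right = pd.DataFrame({'user_id': ['id001', 'id002', 'id003'],
                      'image_url': ['http://example.com/img/id001.png',
                                    'http://example.com/img/id002.jpg',
                                    'http://example.com/img/id003.bmp']})

# Outer merge keeps unmatched rows from both sides;
# indicator=True adds a '_merge' column with each row's origin
df_outer = pd.merge(left, right, how='outer', indicator=True)
print(df_outer)
```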
