Introduction to Pandas Toolkit
In this module we will learn the fundamentals of one of the most important toolkits Python has for data cleaning and processing, i.e. pandas. pandas is an open-source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. We will learn how to read data into DataFrame structures, how to query these structures, and the details about how such structures are indexed.
Introduction to Series Data Structure
import pandas as pd
# read the documentation of Series
pd.Series?
animals = ['Tiger', 'Bear','Mouse']
# passing list to series
pd.Series(animals)
0 Tiger
1 Bear
2 Mouse
dtype: object
# passing numbers to see what dtype pandas sets
numbers = [6,7,8]
pd.Series(numbers)
0 6
1 7
2 8
dtype: int64
Let's see how pandas handles missing data
animals = ['Tiger', 'Bear', None ]
pd.Series(animals)
0 Tiger
1 Bear
2 None
dtype: object
numbers = [ 5, 6, None ]
pd.Series(numbers)
0 5.0
1 6.0
2 NaN
dtype: float64
# compare NaN with None
import numpy as np
np.nan == None
False
# even when you compare nan to other nan
np.nan == np.nan
False
# you need to use special function to check nan
np.isnan(np.nan)
True
# Build a Series from a dict: the keys become the index labels, the values
# become the data.
sports = dict(Hockey='India',
              Golf='Scotland',
              Sumo='Japan',
              Taekwondo='South Korea')
s = pd.Series(sports)
s
Golf Scotland
Hockey India
Sumo Japan
Taekwondo South Korea
dtype: object
# printing the index labels of the series
s.index
Index(['Golf', 'Hockey', 'Sumo', 'Taekwondo'], dtype='object')
# providing inline values to series
s = pd.Series(['Tiger', 'Bear', 'Moose'], index=['India', 'America', 'Canada'])
s
India Tiger
America Bear
Canada Moose
dtype: object
# When an explicit index is passed, the Series aligns to it: dict keys absent
# from the index are dropped, and index labels absent from the dict become NaN.
sports = {'Archery': 'Bhutan',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Taekwondo': 'South Korea'}
wanted_labels = ['Golf', 'Sumo', 'Hockey']
s = pd.Series(sports, index=wanted_labels)
s
Golf Scotland
Sumo Japan
Hockey NaN
dtype: object
Querying Series
# creating a series
sports = {'Archery': 'Bhutan',
'Golf': 'Scotland',
'Sumo': 'Japan',
'Taekwondo': 'South Korea'}
s = pd.Series(sports)
s
Archery Bhutan
Golf Scotland
Sumo Japan
Taekwondo South Korea
dtype: object
# locating the 4th element value
s.iloc[3]
'South Korea'
# getting value of golf key
s.loc['Golf']
# iloc and loc are not methods but they are attributes
# iloc and loc are used for "row selection"
'Scotland'
# pandas smartly recognizes this as a positional query, as with iloc
s[3]
'South Korea'
# same goes with loc
s['Golf']
'Scotland'
# With an integer index, plain [] indexing becomes ambiguous: pandas cannot
# tell whether the number is a label (loc) or a position (iloc).
codes = [99, 100, 101, 102]
countries = ['Bhutan', 'Scotland', 'Japan', 'South Korea']
sports = dict(zip(codes, countries))
s = pd.Series(sports)
s[0] #This won't call s.iloc[0] as one might expect; it fails to recognize whether to call iloc or loc
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-21-2d9b4559f19a> in <module>()
5 102: 'South Korea'}
6 s = pd.Series(sports)
----> 7 s[0] #This won't call s.iloc[0] as one might expect, it fails to recognize weather to call iloc or loc
/usr/local/lib/python3.5/dist-packages/pandas/core/series.py in __getitem__(self, key)
599 key = com._apply_if_callable(key, self)
600 try:
--> 601 result = self.index.get_value(self, key)
602
603 if not is_scalar(result):
/usr/local/lib/python3.5/dist-packages/pandas/core/indexes/base.py in get_value(self, series, key)
2475 try:
2476 return self._engine.get_value(s, k,
-> 2477 tz=getattr(series.dtype, 'tz', None))
2478 except KeyError as e1:
2479 if len(self) > 0 and self.inferred_type in ['integer', 'boolean']:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4404)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_value (pandas/_libs/index.c:4087)()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc (pandas/_libs/index.c:5126)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:14031)()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.Int64HashTable.get_item (pandas/_libs/hashtable.c:13975)()
KeyError: 0
performing operations
# creating a series
s = pd.Series([100.00, 120.00, 101.00, 3.00])
s
0 100.0
1 120.0
2 101.0
3 3.0
dtype: float64
# Sum the Series the manual way: walk it one element at a time.
total = 0
for value in s:
    total = total + value
print(total)
324.0
# It worked, but the Python loop is slow and serial. Use numpy's vectorised
# sum instead.
import numpy as np
# Fix: the original assigned to a misspelled name ('totoal'), so print()
# showed the stale value from the earlier loop rather than np.sum's result.
total = np.sum(s)
print(total)
324.0
# both of these methods produces same result, let's find out which one is faster
#this creates a big series of random numbers
s = pd.Series(np.random.randint(0,1000,10000))
s.head()
0 535
1 201
2 648
3 599
4 360
dtype: int64
# length of series
len(s)
10000
%%timeit -n 100
summary = 0
for item in s:
summary+=item
1.41 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%%timeit -n 100
summary = np.sum(s)
157 µs ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# broadcasting :- apply the operation to each value in the series
s+= 2
s.head()
0 537
1 203
2 650
3 601
4 362
dtype: int64
# Compare the time required for broadcasting done "by hand" (per-element
# writes) against the pandas vectorised method. This is the manual version.
# Fix: Series.iteritems() and Series.set_value() were removed from pandas
# (set_value in 1.0, iteritems in 2.0); use items() to iterate (label, value)
# pairs and .at[] for fast label-based scalar assignment.
for label, value in s.items():
    s.at[label] = value + 2
s.head()
0 539
1 205
2 652
3 603
4 364
dtype: int64
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
for label, value in s.iteritems():
s.loc[label]= value+2
1.14 s ± 224 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%%timeit -n 10
s = pd.Series(np.random.randint(0,1000,10000))
s+=2
546 µs ± 99.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# mixed data types in series: pandas automatically changes the underlying numpy data type
s = pd.Series([1, 2, 3])
s.loc['Animal'] = 'Bears'
s
0 1
1 2
2 3
Animal Bears
dtype: object
# Series index labels need not be unique — unlike primary keys in a
# relational database, the same label may appear on many rows.
original_sports = pd.Series({'Archery': 'Bhutan',
                             'Golf': 'Scotland',
                             'Sumo': 'Japan',
                             'Taekwondo': 'South Korea'})
# A second Series where every entry shares the same 'Cricket' label.
cricket_loving_countries = pd.Series(['Australia', 'Barbados',
                                      'Pakistan', 'England'],
                                     index=['Cricket'] * 4)
# Concatenating creates a new Series and does not affect the originals.
# Fix: Series.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the supported replacement and produces the same result.
all_countries = pd.concat([original_sports, cricket_loving_countries])
all_countries
Archery Bhutan
Golf Scotland
Sumo Japan
Taekwondo South Korea
Cricket Australia
Cricket Barbados
Cricket Pakistan
Cricket England
dtype: object
# original series
original_sports
Archery Bhutan
Golf Scotland
Sumo Japan
Taekwondo South Korea
dtype: object
cricket_loving_countries
Cricket Australia
Cricket Barbados
Cricket Pakistan
Cricket England
dtype: object
# series returned for multiple items
all_countries.loc['Cricket']
Cricket Australia
Cricket Barbados
Cricket Pakistan
Cricket England
dtype: object
The DataFrame Data Structure
# A DataFrame is conceptually a 2-D Series object: one shared row index plus
# several labelled columns.
import pandas as pd
records = [
    {'Name': 'Chris', 'Item Purchased': 'Dog Food', 'Cost': 22.50},
    {'Name': 'Kevyn', 'Item Purchased': 'Kitty Litter', 'Cost': 2.50},
    {'Name': 'Vinod', 'Item Purchased': 'Bird Seed', 'Cost': 5.00},
]
purchase_1, purchase_2, purchase_3 = (pd.Series(rec) for rec in records)
df = pd.DataFrame([purchase_1, purchase_2, purchase_3],
                  index=['Store 1', 'Store 1', 'Store 2'])
df.head()
Cost | Item Purchased | Name | |
---|---|---|---|
Store 1 | 22.5 | Dog Food | Chris |
Store 1 | 2.5 | Kitty Litter | Kevyn |
Store 2 | 5.0 | Bird Seed | Vinod |
# retrieving the items
df.loc['Store 2']
Cost 5
Item Purchased Bird Seed
Name Vinod
Name: Store 2, dtype: object
# the type of output
type(df.loc['Store 2'])
pandas.core.series.Series
# index and columns can be non unique e.g Store 1 is repeated here.
df.loc['Store 1']
Cost | Item Purchased | Name | |
---|---|---|---|
Store 1 | 22.5 | Dog Food | Chris |
Store 1 | 2.5 | Kitty Litter | Kevyn |
# type of output
type(df.loc['Store 1'])
pandas.core.frame.DataFrame
# if we want only cost column of Store 1 index
df.loc['Store 1']['Cost']
Store 1 22.5
Store 1 2.5
Name: Cost, dtype: float64
# Transpose of given dataframe
df.T
Store 1 | Store 1 | Store 2 | |
---|---|---|---|
Cost | 22.5 | 2.5 | 5 |
Item Purchased | Dog Food | Kitty Litter | Bird Seed |
Name | Chris | Kevyn | Vinod |
# to get cost of all stores
df.T.loc['Cost']
Store 1 22.5
Store 1 2.5
Store 2 5
Name: Cost, dtype: object
# chaining comes with a cost, so it's better to use another approach; for changing data, chained operations like the one below are not effective.
df.loc['Store 1', 'Cost']
Store 1 22.5
Store 1 2.5
Name: Cost, dtype: float64
# avoid chaining and use this
df.loc[:,['Name', 'Cost']]
Name | Cost | |
---|---|---|
Store 1 | Chris | 22.5 |
Store 1 | Kevyn | 2.5 |
Store 2 | Vinod | 5.0 |
# deleting data, drop takes either an index or a row label.
# drop doesn't change the original data frame and returns the copy of changed dataframe
df.drop('Store 1')
Cost | Item Purchased | Name | |
---|---|---|---|
Store 2 | 5.0 | Bird Seed | Vinod |
# original dataframe is still intact
df
Cost | Item Purchased | Name | |
---|---|---|---|
Store 1 | 22.5 | Dog Food | Chris |
Store 1 | 2.5 | Kitty Litter | Kevyn |
Store 2 | 5.0 | Bird Seed | Vinod |
# normal practice while dropping
copy_df = df.copy()
copy_df = copy_df.drop('Store 1')
copy_df
Cost | Item Purchased | Name | |
---|---|---|---|
Store 2 | 5.0 | Bird Seed | Vinod |
# there are two options in drop function 1. inplace 2. dropping column
copy_df.drop?
# deleting column
del copy_df['Name']
copy_df
Cost | Item Purchased | |
---|---|---|
Store 2 | 5.0 | Bird Seed |
# adding a new column is as simple as assigning it some new value
df['Location'] = None
df
Cost | Item Purchased | Name | Location | |
---|---|---|---|---|
Store 1 | 22.5 | Dog Food | Chris | None |
Store 1 | 2.5 | Kitty Litter | Kevyn | None |
Store 2 | 5.0 | Bird Seed | Vinod | None |
DataFrame indexing and Loading
# pandas toolkit gives views on dataFrame that means if you reduce some columns from dataFrame and perform operations,
# it will change the base dataframe too.
costs = df['Cost']
costs
Store 1 22.5
Store 1 2.5
Store 2 5.0
Name: Cost, dtype: float64
# broadcasting on costs
costs+=2
costs
Store 1 24.5
Store 1 4.5
Store 2 7.0
Name: Cost, dtype: float64
# it changed original dataFrame too
df
Cost | Item Purchased | Name | Location | |
---|---|---|---|---|
Store 1 | 24.5 | Dog Food | Chris | None |
Store 1 | 4.5 | Kitty Litter | Kevyn | None |
Store 2 | 7.0 | Bird Seed | Vinod | None |
# printing the contents of olympics.csv
!cat olympics.csv
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
,№ Summer,01 !,02 !,03 !,Total,№ Winter,01 !,02 !,03 !,Total,№ Games,01 !,02 !,03 !,Combined total
Afghanistan (AFG),13,0,0,2,2,0,0,0,0,0,13,0,0,2,2
Algeria (ALG),12,5,2,8,15,3,0,0,0,0,15,5,2,8,15
Argentina (ARG),23,18,24,28,70,18,0,0,0,0,41,18,24,28,70
Armenia (ARM),5,1,2,9,12,6,0,0,0,0,11,1,2,9,12
Australasia (ANZ) [ANZ],2,3,4,5,12,0,0,0,0,0,2,3,4,5,12
Australia (AUS) [AUS] [Z],25,139,152,177,468,18,5,3,4,12,43,144,155,181,480
Austria (AUT),26,18,33,35,86,22,59,78,81,218,48,77,111,116,304
Azerbaijan (AZE),5,6,5,15,26,5,0,0,0,0,10,6,5,15,26
Bahamas (BAH),15,5,2,5,12,0,0,0,0,0,15,5,2,5,12
Bahrain (BRN),8,0,0,1,1,0,0,0,0,0,8,0,0,1,1
Barbados (BAR) [BAR],11,0,0,1,1,0,0,0,0,0,11,0,0,1,1
Belarus (BLR),5,12,24,39,75,6,6,4,5,15,11,18,28,44,90
Belgium (BEL),25,37,52,53,142,20,1,1,3,5,45,38,53,56,147
Bermuda (BER),17,0,0,1,1,7,0,0,0,0,24,0,0,1,1
Bohemia (BOH) [BOH] [Z],3,0,1,3,4,0,0,0,0,0,3,0,1,3,4
Botswana (BOT),9,0,1,0,1,0,0,0,0,0,9,0,1,0,1
Brazil (BRA),21,23,30,55,108,7,0,0,0,0,28,23,30,55,108
British West Indies (BWI) [BWI],1,0,0,2,2,0,0,0,0,0,1,0,0,2,2
Bulgaria (BUL) [H],19,51,85,78,214,19,1,2,3,6,38,52,87,81,220
Burundi (BDI),5,1,0,0,1,0,0,0,0,0,5,1,0,0,1
Cameroon (CMR),13,3,1,1,5,1,0,0,0,0,14,3,1,1,5
Canada (CAN),25,59,99,121,279,22,62,56,52,170,47,121,155,173,449
Chile (CHI) [I],22,2,7,4,13,16,0,0,0,0,38,2,7,4,13
China (CHN) [CHN],9,201,146,126,473,10,12,22,19,53,19,213,168,145,526
Colombia (COL),18,2,6,11,19,1,0,0,0,0,19,2,6,11,19
Costa Rica (CRC),14,1,1,2,4,6,0,0,0,0,20,1,1,2,4
Ivory Coast (CIV) [CIV],12,0,1,0,1,0,0,0,0,0,12,0,1,0,1
Croatia (CRO),6,6,7,10,23,7,4,6,1,11,13,10,13,11,34
Cuba (CUB) [Z],19,72,67,70,209,0,0,0,0,0,19,72,67,70,209
Cyprus (CYP),9,0,1,0,1,10,0,0,0,0,19,0,1,0,1
Czech Republic (CZE) [CZE],5,14,15,15,44,6,7,9,8,24,11,21,24,23,68
Czechoslovakia (TCH) [TCH],16,49,49,45,143,16,2,8,15,25,32,51,57,60,168
Denmark (DEN) [Z],26,43,68,68,179,13,0,1,0,1,39,43,69,68,180
Djibouti (DJI) [B],7,0,0,1,1,0,0,0,0,0,7,0,0,1,1
Dominican Republic (DOM),13,3,2,1,6,0,0,0,0,0,13,3,2,1,6
Ecuador (ECU),13,1,1,0,2,0,0,0,0,0,13,1,1,0,2
Egypt (EGY) [EGY] [Z],21,7,9,10,26,1,0,0,0,0,22,7,9,10,26
Eritrea (ERI),4,0,0,1,1,0,0,0,0,0,4,0,0,1,1
Estonia (EST),11,9,9,15,33,9,4,2,1,7,20,13,11,16,40
Ethiopia (ETH),12,21,7,17,45,2,0,0,0,0,14,21,7,17,45
Finland (FIN),24,101,84,117,302,22,42,62,57,161,46,143,146,174,463
France (FRA) [O] [P] [Z],27,202,223,246,671,22,31,31,47,109,49,233,254,293,780
Gabon (GAB),9,0,1,0,1,0,0,0,0,0,9,0,1,0,1
Georgia (GEO),5,6,5,14,25,6,0,0,0,0,11,6,5,14,25
Germany (GER) [GER] [Z],15,174,182,217,573,11,78,78,53,209,26,252,260,270,782
United Team of Germany (EUA) [EUA],3,28,54,36,118,3,8,6,5,19,6,36,60,41,137
East Germany (GDR) [GDR],5,153,129,127,409,6,39,36,35,110,11,192,165,162,519
West Germany (FRG) [FRG],5,56,67,81,204,6,11,15,13,39,11,67,82,94,243
Ghana (GHA) [GHA],13,0,1,3,4,1,0,0,0,0,14,0,1,3,4
Great Britain (GBR) [GBR] [Z],27,236,272,272,780,22,10,4,12,26,49,246,276,284,806
Greece (GRE) [Z],27,30,42,39,111,18,0,0,0,0,45,30,42,39,111
Grenada (GRN),8,1,0,0,1,0,0,0,0,0,8,1,0,0,1
Guatemala (GUA),13,0,1,0,1,1,0,0,0,0,14,0,1,0,1
Guyana (GUY) [GUY],16,0,0,1,1,0,0,0,0,0,16,0,0,1,1
Haiti (HAI) [J],14,0,1,1,2,0,0,0,0,0,14,0,1,1,2
Hong Kong (HKG) [HKG],15,1,1,1,3,4,0,0,0,0,19,1,1,1,3
Hungary (HUN),25,167,144,165,476,22,0,2,4,6,47,167,146,169,482
Iceland (ISL),19,0,2,2,4,17,0,0,0,0,36,0,2,2,4
India (IND) [F],23,9,6,11,26,9,0,0,0,0,32,9,6,11,26
Indonesia (INA),14,6,10,11,27,0,0,0,0,0,14,6,10,11,27
Iran (IRI) [K],15,15,20,25,60,10,0,0,0,0,25,15,20,25,60
Iraq (IRQ),13,0,0,1,1,0,0,0,0,0,13,0,0,1,1
Ireland (IRL),20,9,8,12,29,6,0,0,0,0,26,9,8,12,29
Israel (ISR),15,1,1,5,7,6,0,0,0,0,21,1,1,5,7
Italy (ITA) [M] [S],26,198,166,185,549,22,37,34,43,114,48,235,200,228,663
Jamaica (JAM) [JAM],16,17,30,20,67,7,0,0,0,0,23,17,30,20,67
Japan (JPN),21,130,126,142,398,20,10,17,18,45,41,140,143,160,443
Kazakhstan (KAZ),5,16,17,19,52,6,1,3,3,7,11,17,20,22,59
Kenya (KEN),13,25,32,29,86,3,0,0,0,0,16,25,32,29,86
North Korea (PRK),9,14,12,21,47,8,0,1,1,2,17,14,13,22,49
South Korea (KOR),16,81,82,80,243,17,26,17,10,53,33,107,99,90,296
Kuwait (KUW),12,0,0,2,2,0,0,0,0,0,12,0,0,2,2
Kyrgyzstan (KGZ),5,0,1,2,3,6,0,0,0,0,11,0,1,2,3
Latvia (LAT),10,3,11,5,19,10,0,4,3,7,20,3,15,8,26
Lebanon (LIB),16,0,2,2,4,16,0,0,0,0,32,0,2,2,4
Liechtenstein (LIE),16,0,0,0,0,18,2,2,5,9,34,2,2,5,9
Lithuania (LTU),8,6,5,10,21,8,0,0,0,0,16,6,5,10,21
Luxembourg (LUX) [O],22,1,1,0,2,8,0,2,0,2,30,1,3,0,4
Macedonia (MKD),5,0,0,1,1,5,0,0,0,0,10,0,0,1,1
Malaysia (MAS) [MAS],12,0,3,3,6,0,0,0,0,0,12,0,3,3,6
Mauritius (MRI),8,0,0,1,1,0,0,0,0,0,8,0,0,1,1
Mexico (MEX),22,13,21,28,62,8,0,0,0,0,30,13,21,28,62
Moldova (MDA),5,0,2,5,7,6,0,0,0,0,11,0,2,5,7
Mongolia (MGL),12,2,9,13,24,13,0,0,0,0,25,2,9,13,24
Montenegro (MNE),2,0,1,0,1,2,0,0,0,0,4,0,1,0,1
Morocco (MAR),13,6,5,11,22,6,0,0,0,0,19,6,5,11,22
Mozambique (MOZ),9,1,0,1,2,0,0,0,0,0,9,1,0,1,2
Namibia (NAM),6,0,4,0,4,0,0,0,0,0,6,0,4,0,4
Netherlands (NED) [Z],25,77,85,104,266,20,37,38,35,110,45,114,123,139,376
Netherlands Antilles (AHO) [AHO] [I],13,0,1,0,1,2,0,0,0,0,15,0,1,0,1
New Zealand (NZL) [NZL],22,42,18,39,99,15,0,1,0,1,37,42,19,39,100
Niger (NIG),11,0,0,1,1,0,0,0,0,0,11,0,0,1,1
Nigeria (NGR),15,3,8,12,23,0,0,0,0,0,15,3,8,12,23
Norway (NOR) [Q],24,56,49,43,148,22,118,111,100,329,46,174,160,143,477
Pakistan (PAK),16,3,3,4,10,2,0,0,0,0,18,3,3,4,10
Panama (PAN),16,1,0,2,3,0,0,0,0,0,16,1,0,2,3
Paraguay (PAR),11,0,1,0,1,1,0,0,0,0,12,0,1,0,1
Peru (PER) [L],17,1,3,0,4,2,0,0,0,0,19,1,3,0,4
Philippines (PHI),20,0,2,7,9,4,0,0,0,0,24,0,2,7,9
Poland (POL),20,64,82,125,271,22,6,7,7,20,42,70,89,132,291
Portugal (POR),23,4,8,11,23,7,0,0,0,0,30,4,8,11,23
Puerto Rico (PUR),17,0,2,6,8,6,0,0,0,0,23,0,2,6,8
Qatar (QAT),8,0,0,4,4,0,0,0,0,0,8,0,0,4,4
Romania (ROU),20,88,94,119,301,20,0,0,1,1,40,88,94,120,302
Russia (RUS) [RUS],5,132,121,142,395,6,49,40,35,124,11,181,161,177,519
Russian Empire (RU1) [RU1],3,1,4,3,8,0,0,0,0,0,3,1,4,3,8
Soviet Union (URS) [URS],9,395,319,296,1010,9,78,57,59,194,18,473,376,355,1204
Unified Team (EUN) [EUN],1,45,38,29,112,1,9,6,8,23,2,54,44,37,135
Saudi Arabia (KSA),10,0,1,2,3,0,0,0,0,0,10,0,1,2,3
Senegal (SEN),13,0,1,0,1,5,0,0,0,0,18,0,1,0,1
Serbia (SRB) [SRB],3,1,2,4,7,2,0,0,0,0,5,1,2,4,7
Serbia and Montenegro (SCG) [SCG],3,2,4,3,9,3,0,0,0,0,6,2,4,3,9
Singapore (SIN),15,0,2,2,4,0,0,0,0,0,15,0,2,2,4
Slovakia (SVK) [SVK],5,7,9,8,24,6,2,2,1,5,11,9,11,9,29
Slovenia (SLO),6,4,6,9,19,7,2,4,9,15,13,6,10,18,34
South Africa (RSA),18,23,26,27,76,6,0,0,0,0,24,23,26,27,76
Spain (ESP) [Z],22,37,59,35,131,19,1,0,1,2,41,38,59,36,133
Sri Lanka (SRI) [SRI],16,0,2,0,2,0,0,0,0,0,16,0,2,0,2
Sudan (SUD),11,0,1,0,1,0,0,0,0,0,11,0,1,0,1
Suriname (SUR) [E],11,1,0,1,2,0,0,0,0,0,11,1,0,1,2
Sweden (SWE) [Z],26,143,164,176,483,22,50,40,54,144,48,193,204,230,627
Switzerland (SUI),27,47,73,65,185,22,50,40,48,138,49,97,113,113,323
Syria (SYR),12,1,1,1,3,0,0,0,0,0,12,1,1,1,3
Chinese Taipei (TPE) [TPE] [TPE2],13,2,7,12,21,11,0,0,0,0,24,2,7,12,21
Tajikistan (TJK),5,0,1,2,3,4,0,0,0,0,9,0,1,2,3
Tanzania (TAN) [TAN],12,0,2,0,2,0,0,0,0,0,12,0,2,0,2
Thailand (THA),15,7,6,11,24,3,0,0,0,0,18,7,6,11,24
Togo (TOG),9,0,0,1,1,1,0,0,0,0,10,0,0,1,1
Tonga (TGA),8,0,1,0,1,1,0,0,0,0,9,0,1,0,1
Trinidad and Tobago (TRI) [TRI],16,2,5,11,18,3,0,0,0,0,19,2,5,11,18
Tunisia (TUN),13,3,3,4,10,0,0,0,0,0,13,3,3,4,10
Turkey (TUR),21,39,25,24,88,16,0,0,0,0,37,39,25,24,88
Uganda (UGA),14,2,3,2,7,0,0,0,0,0,14,2,3,2,7
Ukraine (UKR),5,33,27,55,115,6,2,1,4,7,11,35,28,59,122
United Arab Emirates (UAE),8,1,0,0,1,0,0,0,0,0,8,1,0,0,1
United States (USA) [P] [Q] [R] [Z],26,976,757,666,2399,22,96,102,84,282,48,1072,859,750,2681
Uruguay (URU),20,2,2,6,10,1,0,0,0,0,21,2,2,6,10
Uzbekistan (UZB),5,5,5,10,20,6,1,0,0,1,11,6,5,10,21
Venezuela (VEN),17,2,2,8,12,4,0,0,0,0,21,2,2,8,12
Vietnam (VIE),14,0,2,0,2,0,0,0,0,0,14,0,2,0,2
Virgin Islands (ISV),11,0,1,0,1,7,0,0,0,0,18,0,1,0,1
Yugoslavia (YUG) [YUG],16,26,29,28,83,14,0,3,1,4,30,26,32,29,87
Independent Olympic Participants (IOP) [IOP],1,0,1,2,3,0,0,0,0,0,1,0,1,2,3
Zambia (ZAM) [ZAM],12,0,1,1,2,0,0,0,0,0,12,0,1,1,2
Zimbabwe (ZIM) [ZIM],12,3,4,1,8,1,0,0,0,0,13,3,4,1,8
Mixed team (ZZX) [ZZX],3,8,5,4,17,0,0,0,0,0,3,8,5,4,17
Totals,27,4809,4775,5130,14714,22,959,958,948,2865,49,5768,5733,6078,17579
# adding this data dataframe
df = pd.read_csv('olympics.csv')
df.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | № Summer | 01 ! | 02 ! | 03 ! | Total | № Winter | 01 ! | 02 ! | 03 ! | Total | № Games | 01 ! | 02 ! | 03 ! | Combined total |
1 | Afghanistan (AFG) | 13 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 |
2 | Algeria (ALG) | 12 | 5 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 |
3 | Argentina (ARG) | 23 | 18 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 |
4 | Armenia (ARM) | 5 | 1 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 |
# but we want countries column as index and skip the first row
df = pd.read_csv('olympics.csv', index_col = 0, skiprows=1)
df.head()
№ Summer | 01 ! | 02 ! | 03 ! | Total | № Winter | 01 !.1 | 02 !.1 | 03 !.1 | Total.1 | № Games | 01 !.2 | 02 !.2 | 03 !.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Afghanistan (AFG) | 13 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 |
Algeria (ALG) | 12 | 5 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 |
Argentina (ARG) | 23 | 18 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 |
Armenia (ARM) | 5 | 1 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 |
Australasia (ANZ) [ANZ] | 2 | 3 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 |
# Rename the cryptic medal columns: '01 !…' -> Gold, '02 !…' -> Silver,
# '03 !…' -> Bronze, and the leading '№' -> '#'. Build one old->new mapping
# and rename in a single call — the per-prefix branches are mutually
# exclusive, so this matches the original chain of in-place renames.
medal_by_prefix = {'01': 'Gold', '02': 'Silver', '03': 'Bronze'}
new_names = {}
for col in df.columns:
    medal = medal_by_prefix.get(col[:2])
    if medal is not None:
        new_names[col] = medal + col[4:]
    elif col[:1] == '№':
        new_names[col] = '#' + col[1:]
df.rename(columns=new_names, inplace=True)
df.head()
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Afghanistan (AFG) | 13 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 |
Algeria (ALG) | 12 | 5 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 |
Argentina (ARG) | 23 | 18 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 |
Armenia (ARM) | 5 | 1 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 |
Australasia (ANZ) [ANZ] | 2 | 3 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 |
Querying a DataFrame
# boolean masking
# we want countries those have won at least one gold medal
df['Gold'] > 0
Afghanistan (AFG) False
Algeria (ALG) True
Argentina (ARG) True
Armenia (ARM) True
Australasia (ANZ) [ANZ] True
Australia (AUS) [AUS] [Z] True
Austria (AUT) True
Azerbaijan (AZE) True
Bahamas (BAH) True
Bahrain (BRN) False
Barbados (BAR) [BAR] False
Belarus (BLR) True
Belgium (BEL) True
Bermuda (BER) False
Bohemia (BOH) [BOH] [Z] False
Botswana (BOT) False
Brazil (BRA) True
British West Indies (BWI) [BWI] False
Bulgaria (BUL) [H] True
Burundi (BDI) True
Cameroon (CMR) True
Canada (CAN) True
Chile (CHI) [I] True
China (CHN) [CHN] True
Colombia (COL) True
Costa Rica (CRC) True
Ivory Coast (CIV) [CIV] False
Croatia (CRO) True
Cuba (CUB) [Z] True
Cyprus (CYP) False
...
Sri Lanka (SRI) [SRI] False
Sudan (SUD) False
Suriname (SUR) [E] True
Sweden (SWE) [Z] True
Switzerland (SUI) True
Syria (SYR) True
Chinese Taipei (TPE) [TPE] [TPE2] True
Tajikistan (TJK) False
Tanzania (TAN) [TAN] False
Thailand (THA) True
Togo (TOG) False
Tonga (TGA) False
Trinidad and Tobago (TRI) [TRI] True
Tunisia (TUN) True
Turkey (TUR) True
Uganda (UGA) True
Ukraine (UKR) True
United Arab Emirates (UAE) True
United States (USA) [P] [Q] [R] [Z] True
Uruguay (URU) True
Uzbekistan (UZB) True
Venezuela (VEN) True
Vietnam (VIE) False
Virgin Islands (ISV) False
Yugoslavia (YUG) [YUG] True
Independent Olympic Participants (IOP) [IOP] False
Zambia (ZAM) [ZAM] False
Zimbabwe (ZIM) [ZIM] True
Mixed team (ZZX) [ZZX] True
Totals True
Name: Gold, Length: 147, dtype: bool
# applying boolean masking using where
only_gold = df.where(df['Gold'] > 0)
only_gold.head()
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Afghanistan (AFG) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Algeria (ALG) | 12.0 | 5.0 | 2.0 | 8.0 | 15.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 15.0 | 5.0 | 2.0 | 8.0 | 15.0 |
Argentina (ARG) | 23.0 | 18.0 | 24.0 | 28.0 | 70.0 | 18.0 | 0.0 | 0.0 | 0.0 | 0.0 | 41.0 | 18.0 | 24.0 | 28.0 | 70.0 |
Armenia (ARM) | 5.0 | 1.0 | 2.0 | 9.0 | 12.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 1.0 | 2.0 | 9.0 | 12.0 |
Australasia (ANZ) [ANZ] | 2.0 | 3.0 | 4.0 | 5.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 | 12.0 |
# NaN values is ignored while doing operations. For example
only_gold['Gold'].count()
100
df['Gold'].count()
147
# if we want to drop the rows which has no data, use dropna()
only_gold = only_gold.dropna()
only_gold.head()
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Algeria (ALG) | 12.0 | 5.0 | 2.0 | 8.0 | 15.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.0 | 15.0 | 5.0 | 2.0 | 8.0 | 15.0 |
Argentina (ARG) | 23.0 | 18.0 | 24.0 | 28.0 | 70.0 | 18.0 | 0.0 | 0.0 | 0.0 | 0.0 | 41.0 | 18.0 | 24.0 | 28.0 | 70.0 |
Armenia (ARM) | 5.0 | 1.0 | 2.0 | 9.0 | 12.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 | 1.0 | 2.0 | 9.0 | 12.0 |
Australasia (ANZ) [ANZ] | 2.0 | 3.0 | 4.0 | 5.0 | 12.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 3.0 | 4.0 | 5.0 | 12.0 |
Australia (AUS) [AUS] [Z] | 25.0 | 139.0 | 152.0 | 177.0 | 468.0 | 18.0 | 5.0 | 3.0 | 4.0 | 12.0 | 43.0 | 144.0 | 155.0 | 181.0 | 480.0 |
only_gold = df[df['Gold'] > 0]
only_gold.head()
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Algeria (ALG) | 12 | 5 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 |
Argentina (ARG) | 23 | 18 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 |
Armenia (ARM) | 5 | 1 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 |
Australasia (ANZ) [ANZ] | 2 | 3 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 |
Australia (AUS) [AUS] [Z] | 25 | 139 | 152 | 177 | 468 | 18 | 5 | 3 | 4 | 12 | 43 | 144 | 155 | 181 | 480 |
# Combining boolean mask to create complex queries
# e.g. we want all the countries that received a gold medal in the summer olympics or in the winter olympics
len(df[(df['Gold'] > 0) | (df['Gold.1'] > 0)])
101
# countries that won gold in winter olympics and not in summer olympics
df[(df['Gold.1'] > 0) & (df['Gold'] == 0)]
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Liechtenstein (LIE) | 16 | 0 | 0 | 0 | 0 | 18 | 2 | 2 | 5 | 9 | 34 | 2 | 2 | 5 | 9 |
Indexing Dataframes
df.head()
# Summer | Gold | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Afghanistan (AFG) | 13 | 0 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 |
Algeria (ALG) | 12 | 5 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 |
Argentina (ARG) | 23 | 18 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 |
Armenia (ARM) | 5 | 1 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 |
Australasia (ANZ) [ANZ] | 2 | 3 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 |
# we want index by no of golds and not by country names
# First we preserve the existing index in a column, because set_index doesn't preserve the previous index
df['country'] = df.index
df = df.set_index('Gold')
df.head()
# Summer | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Gold | |||||||||||||||
0 | 13 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 | Afghanistan (AFG) |
5 | 12 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 | Algeria (ALG) |
18 | 23 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 | Argentina (ARG) |
1 | 5 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 | Armenia (ARM) |
3 | 2 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 | Australasia (ANZ) [ANZ] |
# to get rid of index completely & create default numbered index
df = df.reset_index()
df.head()
Gold | # Summer | Silver | Bronze | Total | # Winter | Gold.1 | Silver.1 | Bronze.1 | Total.1 | # Games | Gold.2 | Silver.2 | Bronze.2 | Combined total | country | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 13 | 0 | 2 | 2 | 0 | 0 | 0 | 0 | 0 | 13 | 0 | 0 | 2 | 2 | Afghanistan (AFG) |
1 | 5 | 12 | 2 | 8 | 15 | 3 | 0 | 0 | 0 | 0 | 15 | 5 | 2 | 8 | 15 | Algeria (ALG) |
2 | 18 | 23 | 24 | 28 | 70 | 18 | 0 | 0 | 0 | 0 | 41 | 18 | 24 | 28 | 70 | Argentina (ARG) |
3 | 1 | 5 | 2 | 9 | 12 | 6 | 0 | 0 | 0 | 0 | 11 | 1 | 2 | 9 | 12 | Armenia (ARM) |
4 | 3 | 2 | 4 | 5 | 12 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 4 | 5 | 12 | Australasia (ANZ) [ANZ] |
# pandas support multilevel index like composite keys in databases
# To visualize this, lets load different dataset
# Similarly you can have hierarchical indexing on the columns
df = pd.read_csv('census.csv')
df.head()
SUMLEV | REGION | DIVISION | STATE | COUNTY | STNAME | CTYNAME | CENSUS2010POP | ESTIMATESBASE2010 | POPESTIMATE2010 | ... | RDOMESTICMIG2011 | RDOMESTICMIG2012 | RDOMESTICMIG2013 | RDOMESTICMIG2014 | RDOMESTICMIG2015 | RNETMIG2011 | RNETMIG2012 | RNETMIG2013 | RNETMIG2014 | RNETMIG2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 40 | 3 | 6 | 1 | 0 | Alabama | Alabama | 4779736 | 4780127 | 4785161 | ... | 0.002295 | -0.193196 | 0.381066 | 0.582002 | -0.467369 | 1.030015 | 0.826644 | 1.383282 | 1.724718 | 0.712594 |
1 | 50 | 3 | 6 | 1 | 1 | Alabama | Autauga County | 54571 | 54571 | 54660 | ... | 7.242091 | -2.915927 | -3.012349 | 2.265971 | -2.530799 | 7.606016 | -2.626146 | -2.722002 | 2.592270 | -2.187333 |
2 | 50 | 3 | 6 | 1 | 3 | Alabama | Baldwin County | 182265 | 182265 | 183193 | ... | 14.832960 | 17.647293 | 21.845705 | 19.243287 | 17.197872 | 15.844176 | 18.559627 | 22.727626 | 20.317142 | 18.293499 |
3 | 50 | 3 | 6 | 1 | 5 | Alabama | Barbour County | 27457 | 27457 | 27341 | ... | -4.728132 | -2.500690 | -7.056824 | -3.904217 | -10.543299 | -4.874741 | -2.758113 | -7.167664 | -3.978583 | -10.543299 |
4 | 50 | 3 | 6 | 1 | 7 | Alabama | Bibb County | 22915 | 22919 | 22861 | ... | -5.527043 | -5.068871 | -6.201001 | -0.177537 | 0.177258 | -5.088389 | -4.363636 | -5.403729 | 0.754533 | 1.107861 |
5 rows × 100 columns
df['SUMLEV'].unique()
array([40, 50])
# keep summary at county level
df=df[df['SUMLEV'] == 50]
df.head()
SUMLEV | REGION | DIVISION | STATE | COUNTY | STNAME | CTYNAME | CENSUS2010POP | ESTIMATESBASE2010 | POPESTIMATE2010 | ... | RDOMESTICMIG2011 | RDOMESTICMIG2012 | RDOMESTICMIG2013 | RDOMESTICMIG2014 | RDOMESTICMIG2015 | RNETMIG2011 | RNETMIG2012 | RNETMIG2013 | RNETMIG2014 | RNETMIG2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 50 | 3 | 6 | 1 | 1 | Alabama | Autauga County | 54571 | 54571 | 54660 | ... | 7.242091 | -2.915927 | -3.012349 | 2.265971 | -2.530799 | 7.606016 | -2.626146 | -2.722002 | 2.592270 | -2.187333 |
2 | 50 | 3 | 6 | 1 | 3 | Alabama | Baldwin County | 182265 | 182265 | 183193 | ... | 14.832960 | 17.647293 | 21.845705 | 19.243287 | 17.197872 | 15.844176 | 18.559627 | 22.727626 | 20.317142 | 18.293499 |
3 | 50 | 3 | 6 | 1 | 5 | Alabama | Barbour County | 27457 | 27457 | 27341 | ... | -4.728132 | -2.500690 | -7.056824 | -3.904217 | -10.543299 | -4.874741 | -2.758113 | -7.167664 | -3.978583 | -10.543299 |
4 | 50 | 3 | 6 | 1 | 7 | Alabama | Bibb County | 22915 | 22919 | 22861 | ... | -5.527043 | -5.068871 | -6.201001 | -0.177537 | 0.177258 | -5.088389 | -4.363636 | -5.403729 | 0.754533 | 1.107861 |
5 | 50 | 3 | 6 | 1 | 9 | Alabama | Blount County | 57322 | 57322 | 57373 | ... | 1.807375 | -1.177622 | -1.748766 | -2.062535 | -1.369970 | 1.859511 | -0.848580 | -1.402476 | -1.577232 | -0.884411 |
5 rows × 100 columns
# Keep only the state/county names plus the yearly births and population
# estimates for 2010-2015; everything else is discarded.
years = range(2010, 2016)
columns_to_keep = (['STNAME', 'CTYNAME']
                   + ['BIRTHS%d' % year for year in years]
                   + ['POPESTIMATE%d' % year for year in years])
df = df[columns_to_keep]
df.head()
STNAME | CTYNAME | BIRTHS2010 | BIRTHS2011 | BIRTHS2012 | BIRTHS2013 | BIRTHS2014 | BIRTHS2015 | POPESTIMATE2010 | POPESTIMATE2011 | POPESTIMATE2012 | POPESTIMATE2013 | POPESTIMATE2014 | POPESTIMATE2015 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | Alabama | Autauga County | 151 | 636 | 615 | 574 | 623 | 600 | 54660 | 55253 | 55175 | 55038 | 55290 | 55347 |
2 | Alabama | Baldwin County | 517 | 2187 | 2092 | 2160 | 2186 | 2240 | 183193 | 186659 | 190396 | 195126 | 199713 | 203709 |
3 | Alabama | Barbour County | 70 | 335 | 300 | 283 | 260 | 269 | 27341 | 27226 | 27159 | 26973 | 26815 | 26489 |
4 | Alabama | Bibb County | 44 | 266 | 245 | 259 | 247 | 253 | 22861 | 22733 | 22642 | 22512 | 22549 | 22583 |
5 | Alabama | Blount County | 183 | 744 | 710 | 646 | 618 | 603 | 57373 | 57711 | 57776 | 57734 | 57658 | 57673 |
# setting composite index
df = df.set_index(['STNAME', 'CTYNAME'])
df.head()
BIRTHS2010 | BIRTHS2011 | BIRTHS2012 | BIRTHS2013 | BIRTHS2014 | BIRTHS2015 | POPESTIMATE2010 | POPESTIMATE2011 | POPESTIMATE2012 | POPESTIMATE2013 | POPESTIMATE2014 | POPESTIMATE2015 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STNAME | CTYNAME | ||||||||||||
Alabama | Autauga County | 151 | 636 | 615 | 574 | 623 | 600 | 54660 | 55253 | 55175 | 55038 | 55290 | 55347 |
Baldwin County | 517 | 2187 | 2092 | 2160 | 2186 | 2240 | 183193 | 186659 | 190396 | 195126 | 199713 | 203709 | |
Barbour County | 70 | 335 | 300 | 283 | 260 | 269 | 27341 | 27226 | 27159 | 26973 | 26815 | 26489 | |
Bibb County | 44 | 266 | 245 | 259 | 247 | 253 | 22861 | 22733 | 22642 | 22512 | 22549 | 22583 | |
Blount County | 183 | 744 | 710 | 646 | 618 | 603 | 57373 | 57711 | 57776 | 57734 | 57658 | 57673 |
# how to query multilevel index
df.loc['Michigan', 'Washtenaw County']
BIRTHS2010 977
BIRTHS2011 3826
BIRTHS2012 3780
BIRTHS2013 3662
BIRTHS2014 3683
BIRTHS2015 3709
POPESTIMATE2010 345563
POPESTIMATE2011 349048
POPESTIMATE2012 351213
POPESTIMATE2013 354289
POPESTIMATE2014 357029
POPESTIMATE2015 358880
Name: (Michigan, Washtenaw County), dtype: int64
# comparing two counties
df.loc[ [('Michigan', 'Washtenaw County'),
('Michigan', 'Wayne County')] ]
BIRTHS2010 | BIRTHS2011 | BIRTHS2012 | BIRTHS2013 | BIRTHS2014 | BIRTHS2015 | POPESTIMATE2010 | POPESTIMATE2011 | POPESTIMATE2012 | POPESTIMATE2013 | POPESTIMATE2014 | POPESTIMATE2015 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
STNAME | CTYNAME | ||||||||||||
Michigan | Washtenaw County | 977 | 3826 | 3780 | 3662 | 3683 | 3709 | 345563 | 349048 | 351213 | 354289 | 357029 | 358880 |
Wayne County | 5918 | 23819 | 23270 | 23377 | 23607 | 23586 | 1815199 | 1801273 | 1792514 | 1775713 | 1766008 | 1759335 |