Introduction to Pandas in Python

This is an introduction to the Pandas module in Python. I demonstrate core Pandas functionality using a tabular dataset from OICA.

# import pandas
import pandas
# read in oica.csv file with data
pandas.read_csv("oica.csv",sep=",")
yearcountryoutput
02018Argentina466649
12018Austria164900
22018Belgium308493
32018Brazil2879809
42018Canada2020840
8351999Ukraine1918
8361999UK1973519
8371999USA13024978
8381999Uzbekistan44433
8391999Others11965

840 rows × 3 columns

# read in oica.csv and store to object
data_df = pandas.read_csv("oica.csv",sep=",")
# check the type of the data object to which read_csv stored data
type(data_df)
pandas.core.frame.DataFrame
# check the dimensions (shape) of the data frame
data_df.shape
(840, 3)
# check a info summary for the data frame object
data_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 3 columns):
year        840 non-null int64
country     840 non-null object
output      840 non-null int64
dtypes: int64(2), object(1)
memory usage: 19.8+ KB
# check the header of the data frame object
data_df.head()
yearcountryoutput
02018Argentina466649
12018Austria164900
22018Belgium308493
32018Brazil2879809
42018Canada2020840
# check the tail of the data frame object
data_df.tail()
yearcountryoutput
8351999Ukraine1918
8361999UK1973519
8371999USA13024978
8381999Uzbekistan44433
8391999Others11965
# check column names
data_df.columns
Index(['year', 'country ', 'output '], dtype='object')
# check row index range
data_df.index
RangeIndex(start=0, stop=840, step=1)
# check the body of the data_df object (numpy representation of data frame);
# may be useful when transferring data to different frameworks
data_df.values
array([[2018, 'Argentina', 466649],
       [2018, 'Austria', 164900],
       [2018, 'Belgium', 308493],
       ...,
       [1999, 'USA', 13024978],
       [1999, 'Uzbekistan', 44433],
       [1999, 'Others', 11965]], dtype=object)
# get datatype overview by column
data_df.dtypes
year         int64
country     object
output       int64
dtype: object
# access column, here "country" column (subsetting column)
country_col = data_df["country "]
country_col
0       Argentina
1         Austria
2         Belgium
3          Brazil
4          Canada
          ...    
835       Ukraine
836            UK
837           USA
838    Uzbekistan
839        Others
Name: country , Length: 840, dtype: object
# check type of column
type(country_col)
pandas.core.series.Series
# return data frame instead of series when subsetting from data_df
country_col = data_df[["country "]] # subsetting a list of values from the series
country_col
country
0Argentina
1Austria
2Belgium
3Brazil
4Canada
835Ukraine
836UK
837USA
838Uzbekistan
839Others

840 rows × 1 columns

# check type of country_col now
type(country_col)
pandas.core.frame.DataFrame
# dropping column in data frame
data_df.drop(["country "],axis="columns")
yearoutput
02018466649
12018164900
22018308493
320182879809
420182020840
83519991918
83619991973519
837199913024978
838199944433
839199911965

840 rows × 2 columns

# subsetting rows using .loc
data_df.loc[0]
year             2018
country     Argentina
output         466649
Name: 0, dtype: object
# subsetting specific cell, using .loc
data_df.loc[0][1]
'Argentina'
# subsetting list of rows, using .loc
data_df.loc[[0,1,3,6,9,10]]
yearcountryoutput
02018Argentina466649
12018Austria164900
32018Brazil2879809
62018Colombia72800
92018Finland112104
102018France2270000
# unlike python lists, -1 will not work with .loc
data_df.loc[-1]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
    375             try:
--> 376                 return self._range.index(new_key)
    377             except ValueError:

ValueError: -1 is not in range

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-38-404dbc95dcdf> in <module>
      1 # unlike python lists, -1 will not work with .loc
----> 2 data_df.loc[-1]

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
   1422 
   1423             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1424             return self._getitem_axis(maybe_callable, axis=axis)
   1425 
   1426     def _is_scalar_access(self, key: Tuple):

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
   1848         # fall thru to straight lookup
   1849         self._validate_key(key, axis)
-> 1850         return self._get_label(key, axis=axis)
   1851 
   1852 

~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
    158             raise IndexingError("no slices here, handle elsewhere")
    159 
--> 160         return self.obj._xs(label, axis=axis)
    161 
    162     def _get_loc(self, key: int, axis: int):

~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
   3735             loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
   3736         else:
-> 3737             loc = self.index.get_loc(key)
   3738 
   3739             if isinstance(loc, np.ndarray):

~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
    376                 return self._range.index(new_key)
    377             except ValueError:
--> 378                 raise KeyError(key)
    379         return super().get_loc(key, method=method, tolerance=tolerance)
    380 

KeyError: -1
# but, using .iloc we can use -1
data_df.iloc[-1]
year          1999
country     Others
output       11965
Name: 839, dtype: object
# above return object is of type series
# below: subsetting such that data frame is returned
data_df.iloc[[-1]]
yearcountryoutput
8391999Others11965
# subsetting data frame, accessing several rows using .iloc
data_df.iloc[[-1,0,1]]
yearcountryoutput
8391999Others11965
02018Argentina466649
12018Austria164900
# using slicing notation ":" to access all rows, using .loc
data_df.loc[:,["year"]].head()
year
02018
12018
22018
32018
42018
# the same as above, using .iloc
data_df.iloc[:,[0]].head()
year
02018
12018
22018
32018
42018
# filtering when subsetting, using .loc
data_df.loc[data_df["country "] == "Japan"].head()
yearcountryoutput
172018Japan9728528
562017Japan9693746
972016Japan9204590
1372015Japan9278238
1772014Japan9774665
# subsetting and filtering, considering multiple variables and using .loc
data_df.loc[(data_df["country "] == "Japan") & (data_df["year"] == 2014)]
yearcountryoutput
1772014Japan9774665
# using groupby for grouping data frame entries; 
# this will subset by that variable (separate data frame for each group)
data_df.groupby("country ")["output "].mean().head() # you could group by multiple variables; in that case use a list
country 
Argentina     429045.750000
Argentina     472158.000000
Australia     264759.250000
Austria       140122.571429
Belgium       711430.238095
Name: output , dtype: float64
# a more generic way, using numpy
import numpy
data_df.groupby("country ")["output "].agg(numpy.mean).tail()
country 
Turkey        8.993806e+05
UK            1.678415e+06
USA           1.097888e+07
Ukraine       8.167905e+04
Uzbekistan    6.586671e+04
Name: output , dtype: float64
# another example, passing on numpy.std
data_df.groupby("country ")["output "].agg(numpy.std).tail()
country 
Turkey        4.669412e+05
UK            1.943838e+05
USA           1.832615e+06
Ukraine       1.201598e+05
Uzbekistan    7.579244e+04
Name: output , dtype: float64
# using .reset_index for returning a regular pandas data frame object
data_df.groupby("country ")["output "].agg(numpy.std).reset_index().tail()
countryoutput
38Turkey4.669412e+05
39UK1.943838e+05
40USA1.832615e+06
41Ukraine1.201598e+05
42Uzbekistan7.579244e+04
# pipe in python, using pandas - using parentheses "()"
(data_df
.groupby("country ")["output "]
.agg(numpy.std)
.reset_index()
.tail())
countryoutput
38Turkey4.669412e+05
39UK1.943838e+05
40USA1.832615e+06
41Ukraine1.201598e+05
42Uzbekistan7.579244e+04
# pipe in python, using pandas - using backslash "\"
data_df\
.groupby("country ")["output "]\
.agg(numpy.std)\
.reset_index()\
.tail()
countryoutput
38Turkey4.669412e+05
39UK1.943838e+05
40USA1.832615e+06
41Ukraine1.201598e+05
42Uzbekistan7.579244e+04
# using the .str accessor
data_df["country "].str.capitalize().head()
0    Argentina
1      Austria
2      Belgium
3       Brazil
4       Canada
Name: country , dtype: object
# using the .str accessor on a pandas data frame column
data_df["country "].str.contains("A").head()
0     True
1     True
2    False
3    False
4    False
Name: country , dtype: bool
# assigning new column to existing data frame and populating it
data_df["test"] = 1
data_df.head()
yearcountryoutputtest
02018Argentina4666491
12018Austria1649001
22018Belgium3084931
32018Brazil28798091
42018Canada20208401
# remove additional column
data_df = data_df[["year","country ","output "]]
data_df.head()
yearcountryoutput
02018Argentina466649
12018Austria164900
22018Belgium308493
32018Brazil2879809
42018Canada2020840
# using .pivot_table
pivot_df = data_df.pivot_table(index = ["country "],
                              columns = ["year"],
                              values = ["output "]).reset_index()
pivot_df.tail()
countryoutput
year1999200020012002200320042005200620072009201020112012201320142015201620172018
38Turkey297862.0430947.0270685.0346565.0346565.0678381.0879452.098778.01099413.0869605.01094557.01189131.01072978.01125534.01170445.01358796.01485927.01695731.01550150.0
39UK1973519.01813894.01685238.01823018.01823018.01851484.01803109.01648388.01750253.01090139.01393463.01463999.01576945.01597872.01598879.01682156.01816622.01749385.01604328.0
40USA13024978.012799857.011424689.012279582.012279582.012052179.011946653.011263986.010780729.05709431.07743093.08661535.010335765.011066432.011660702.012100095.012198137.011189985.011314705.0
41Ukraine1918.031255.031824.053773.053773.014739.0215759.029526.0402591.069295.083133.0104654.076281.050449.028751.08244.05264.09542.06623.0
42Uzbekistan44433.032273.041005.029554.029554.063601.595814.0110.01849.01179.015688.017956.016418.0246641.024566.01854.088152.0140247.0220667.0

5 rows × 21 columns

# creating a pandas data frame
data2_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : ["a","b","c"]
})
# show data2_df
data2_df
col1col2
01a
12b
23c
# writing and defining a simple function called "simple_function"
def simple_function(x):
    """Return the value of x increased by one."""
    incremented = x + 1
    return incremented
# applying "simple_function" to all entries in "col1"
data2_df["col1"] = data2_df["col1"].apply(simple_function)
# view data2_df now, after having used .apply method
data2_df
col1col2
02a
13b
24c
# applying default function "print" to entire data2_df, column by column
data2_df.apply(print)
0    2
1    3
2    4
Name: col1, dtype: object
0    a
1    b
2    c
Name: col2, dtype: object
col1    None
col2    None
dtype: object
# using apply in combination with e.g. numpy.mean(); numpy is smart and will recognize the different columns
def simple_function2(col):
    """Compute the arithmetic mean of a column by delegating to numpy.mean."""
    column_mean = numpy.mean(col)
    return column_mean
# define a third data frame using pandas
data3_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : [9,8,7]
})
# apply simple_function2 to data3_df
data3_df.apply(simple_function2).reset_index()
index0
0col12.0
1col28.0
# define data4_df, which contains same values as data3_df originally did
data4_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : [9,8,7]
})
# define a function simple_function3 that can process different columns, without numpy
def simple_function3(col):
    """Return the arithmetic mean of a column without using numpy.

    Generalized from the original fixed three-element version
    (col[0] + col[1] + col[2]) / 3 so it works for a column of any
    non-empty length; for three-element columns the result is identical.

    Parameters
    ----------
    col : sequence or pandas Series of numbers

    Returns
    -------
    float : the mean of the column's values
    """
    return sum(col) / len(col)
# apply to data4_df
data4_df.apply(simple_function3).reset_index()
index0
0col12.0
1col28.0
# simple functions such as mean can be directly accessed as a pandas data frame series method
data4_df["col1"].mean()
2.0
# basic mathematical operations can also be conducted e.g. column by column, i.e. column-wise
data4_df["sum"] = data4_df["col1"] + data4_df["col2"]
data4_df
col1col2sum
01910
12810
23710
# numpy has a function called vectorize; 
# numpy.vectorize() takes a function as input, and creates a vectorized version of that function
# let's define a function that we want to use in vectorized form, based on a pandas data frame
def simple_function4(x, y):
    """Return the average of x and y, or NaN when x equals 20.

    Intended for element-wise use via numpy.vectorize, which applies
    it to each pair of values from two array/Series inputs.

    Parameters
    ----------
    x, y : numbers

    Returns
    -------
    float : (x + y) / 2, or NaN if x == 20
    """
    if x == 20:
        # numpy.NaN was removed in NumPy 2.0; numpy.nan is the canonical spelling
        return numpy.nan
    else:
        return (x + y) / 2
# create a new data frame with two columns; call it "data5_df"
data5_df = pandas.DataFrame({
    "col1" : [1,2,3],
    "col2" : [9,8,7]
})
# vectorize simple_function4 using numpy.vectorize()
simple_function4VEC = numpy.vectorize(simple_function4)
# now, apply simple_function4 to the data frame data5_df
simple_function4VEC(data5_df["col1"],data5_df["col2"])
array([5., 5., 5.])
# instead of passing the function to numpy.vectorize you can use a decorator
@numpy.vectorize
def simple_function5(x, y):
    """Element-wise average of x and y; NaN wherever x equals 20.

    The numpy.vectorize decorator turns this scalar function into one
    that accepts whole arrays/Series and returns a numpy array.

    Parameters
    ----------
    x, y : array-like of numbers (broadcast together by numpy.vectorize)

    Returns
    -------
    numpy.ndarray : (x + y) / 2 per element, NaN where x == 20
    """
    if x == 20:
        # numpy.nan replaces numpy.NaN (removed in NumPy 2.0)
        return numpy.nan
    else:
        return (x + y) / 2
# try again, using data5_df
simple_function5(data5_df["col1"],data5_df["col2"])
array([5., 5., 5.])

Leave a Reply

Leave a Reply

Your email address will not be published. Required fields are marked *

Close

Meta