# check the body of the data_df object (numpy representation of data frame);
# may be useful when transferring data to different frameworks
data_df.values
0 Argentina
1 Austria
2 Belgium
3 Brazil
4 Canada
...
835 Ukraine
836 UK
837 USA
838 Uzbekistan
839 Others
Name: country , Length: 840, dtype: object
# check type of column
type(country_col)
pandas.core.series.Series
# return data frame instead of series when subsetting from data_df
country_col = data_df[["country "]] # subsetting a list of values from the series
country_col
country
0
Argentina
1
Austria
2
Belgium
3
Brazil
4
Canada
…
…
835
Ukraine
836
UK
837
USA
838
Uzbekistan
839
Others
840 rows × 1 columns
# check type of country_col now
type(country_col)
pandas.core.frame.DataFrame
# dropping column in data frame
data_df.drop(["country "],axis="columns")
year
output
0
2018
466649
1
2018
164900
2
2018
308493
3
2018
2879809
4
2018
2020840
…
…
…
835
1999
1918
836
1999
1973519
837
1999
13024978
838
1999
44433
839
1999
11965
840 rows × 2 columns
# subsetting rows using .loc
data_df.loc[0]
year 2018
country Argentina
output 466649
Name: 0, dtype: object
# subsetting a specific cell using .loc with a row label and a column label;
# a single [row, column] lookup avoids chained indexing and the deprecated
# positional integer access on the label-indexed row Series
data_df.loc[0, "country "]
'Argentina'
# subsetting list of rows, using .loc
data_df.loc[[0,1,3,6,9,10]]
year
country
output
0
2018
Argentina
466649
1
2018
Austria
164900
3
2018
Brazil
2879809
6
2018
Colombia
72800
9
2018
Finland
112104
10
2018
France
2270000
# unlike python lists, -1 will not work with .loc
data_df.loc[-1]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
375 try:
--> 376 return self._range.index(new_key)
377 except ValueError:
ValueError: -1 is not in range
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
<ipython-input-38-404dbc95dcdf> in <module>
1 # unlike python lists, -1 will not work with .loc
----> 2 data_df.loc[-1]
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in __getitem__(self, key)
1422
1423 maybe_callable = com.apply_if_callable(key, self.obj)
-> 1424 return self._getitem_axis(maybe_callable, axis=axis)
1425
1426 def _is_scalar_access(self, key: Tuple):
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _getitem_axis(self, key, axis)
1848 # fall thru to straight lookup
1849 self._validate_key(key, axis)
-> 1850 return self._get_label(key, axis=axis)
1851
1852
~\Anaconda3\lib\site-packages\pandas\core\indexing.py in _get_label(self, label, axis)
158 raise IndexingError("no slices here, handle elsewhere")
159
--> 160 return self.obj._xs(label, axis=axis)
161
162 def _get_loc(self, key: int, axis: int):
~\Anaconda3\lib\site-packages\pandas\core\generic.py in xs(self, key, axis, level, drop_level)
3735 loc, new_index = self.index.get_loc_level(key, drop_level=drop_level)
3736 else:
-> 3737 loc = self.index.get_loc(key)
3738
3739 if isinstance(loc, np.ndarray):
~\Anaconda3\lib\site-packages\pandas\core\indexes\range.py in get_loc(self, key, method, tolerance)
376 return self._range.index(new_key)
377 except ValueError:
--> 378 raise KeyError(key)
379 return super().get_loc(key, method=method, tolerance=tolerance)
380
KeyError: -1
# but, using .iloc we can use -1
data_df.iloc[-1]
year 1999
country Others
output 11965
Name: 839, dtype: object
# above return object is of type series
# below: subsetting such that data frame is returned
data_df.iloc[[-1]]
year
country
output
839
1999
Others
11965
# subsetting data frame, accessing several rows using .iloc
data_df.iloc[[-1,0,1]]
year
country
output
839
1999
Others
11965
0
2018
Argentina
466649
1
2018
Austria
164900
# using slicing notation ":" to access all rows, using .loc
data_df.loc[:,["year"]].head()
year
0
2018
1
2018
2
2018
3
2018
4
2018
# the same as above, using .iloc
data_df.iloc[:,[0]].head()
year
0
2018
1
2018
2
2018
3
2018
4
2018
# filtering when subsetting, using .loc
data_df.loc[data_df["country "] == "Japan"].head()
year
country
output
17
2018
Japan
9728528
56
2017
Japan
9693746
97
2016
Japan
9204590
137
2015
Japan
9278238
177
2014
Japan
9774665
# subsetting and filtering, considering multiple variables and using .loc
data_df.loc[(data_df["country "] == "Japan") & (data_df["year"] == 2014)]
year
country
output
177
2014
Japan
9774665
# using groupby for grouping data frame entries;
# this will subset by that variable (separate data frame for each group)
data_df.groupby("country ")["output "].mean().head() # you could group by multiple variables; in that case use a list
country
Argentina 429045.750000
Argentina 472158.000000
Australia 264759.250000
Austria 140122.571429
Belgium 711430.238095
Name: output , dtype: float64
# a more generic way, using numpy
import numpy
data_df.groupby("country ")["output "].agg(numpy.mean).tail()
country
Turkey 8.993806e+05
UK 1.678415e+06
USA 1.097888e+07
Ukraine 8.167905e+04
Uzbekistan 6.586671e+04
Name: output , dtype: float64
# another example, passing on numpy.std
data_df.groupby("country ")["output "].agg(numpy.std).tail()
country
Turkey 4.669412e+05
UK 1.943838e+05
USA 1.832615e+06
Ukraine 1.201598e+05
Uzbekistan 7.579244e+04
Name: output , dtype: float64
# using .reset_index for returning a regular pandas data frame object
data_df.groupby("country ")["output "].agg(numpy.std).reset_index().tail()
country
output
38
Turkey
4.669412e+05
39
UK
1.943838e+05
40
USA
1.832615e+06
41
Ukraine
1.201598e+05
42
Uzbekistan
7.579244e+04
# pipe in python, using pandas - using parentheses "()"
(data_df
.groupby("country ")["output "]
.agg(numpy.std)
.reset_index()
.tail())
country
output
38
Turkey
4.669412e+05
39
UK
1.943838e+05
40
USA
1.832615e+06
41
Ukraine
1.201598e+05
42
Uzbekistan
7.579244e+04
# pipe in python, using pandas - using backslash "\"
data_df\
.groupby("country ")["output "]\
.agg(numpy.std)\
.reset_index()\
.tail()
country
output
38
Turkey
4.669412e+05
39
UK
1.943838e+05
40
USA
1.832615e+06
41
Ukraine
1.201598e+05
42
Uzbekistan
7.579244e+04
# using the .str accessor
data_df["country "].str.capitalize().head()
0 Argentina
1 Austria
2 Belgium
3 Brazil
4 Canada
Name: country , dtype: object
# using the .str accessor on a pandas data frame column
data_df["country "].str.contains("A").head()
# creating a pandas data frame
# small demo frame: "col1" holds integers, "col2" holds strings
data2_df = pandas.DataFrame(
    dict(
        col1=[1, 2, 3],
        col2=["a", "b", "c"],
    )
)
# show data2_df
data2_df
col1
col2
0
1
a
1
2
b
2
3
c
# writing and defining a simple function called "simple_function"
def simple_function(x):
    """Return ``x`` incremented by one."""
    return x + 1
# applying "simple_function" to all entries in "col1"
data2_df["col1"] = data2_df["col1"].apply(simple_function)
# view data2_df now, after having used .apply method
data2_df
col1
col2
0
2
a
1
3
b
2
4
c
# applying the built-in function "print" to entire data2_df, column by column
data2_df.apply(print)
0 2
1 3
2 4
Name: col1, dtype: object
0 a
1 b
2 c
Name: col2, dtype: object
col1 None
col2 None
dtype: object
# using apply in combination with e.g. numpy.mean(); DataFrame.apply passes each column to the function in turn, so the mean is computed per column
def simple_function2(col):
    """Return the arithmetic mean of ``col`` as computed by ``numpy.mean``."""
    return numpy.mean(col)
# define a third data frame using pandas
# two integer columns, three rows each
data3_df = pandas.DataFrame(
    dict(
        col1=[1, 2, 3],
        col2=[9, 8, 7],
    )
)
# apply simple_function2 to data3_df
data3_df.apply(simple_function2).reset_index()
index
0
0
col1
2.0
1
col2
8.0
# define data4_df, which contains same values as data3_df originally did
# two integer columns, three rows each
data4_df = pandas.DataFrame(
    {"col1": [1, 2, 3], "col2": [9, 8, 7]}
)
# define a function simple_function3 that can process in different columns, without numpy
def simple_function3(col):
    """Return the mean of the first three entries of ``col`` without numpy.

    Only ``col[0]``, ``col[1]`` and ``col[2]`` are read, so ``col`` must
    provide those three keys; any further entries are ignored.
    """
    x = col[0]
    y = col[1]
    z = col[2]
    return (x + y + z) / 3
# apply simple_function3 to data4_df
data4_df.apply(simple_function3).reset_index()
index
0
0
col1
2.0
1
col2
8.0
# simple functions such as mean can be called directly as a pandas Series method
data4_df["col1"].mean()
2.0
# basic mathematical operations can also be conducted e.g. column by column, i.e. column-wise
data4_df["sum"] = data4_df["col1"] + data4_df["col2"]
data4_df
col1
col2
sum
0
1
9
10
1
2
8
10
2
3
7
10
# numpy has a function called vectorize;
# numpy.vectorize() takes a function as input, and creates a vectorized version of that function
# let's define a function that we want to use in vectorized form, based on a pandas data frame
def simple_function4(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The function works on scalars; wrap it with ``numpy.vectorize`` to
    apply it element-wise to whole columns.
    """
    if x == 20:
        # numpy.nan is the canonical spelling; the numpy.NaN alias was
        # removed in NumPy 2.0
        return numpy.nan
    return (x + y) / 2
# create a new data frame with two columns; call it "data5_df"
# two integer columns, three rows each
data5_df = pandas.DataFrame(
    data={"col1": [1, 2, 3], "col2": [9, 8, 7]},
)
# vectorize simple_function4 using numpy.vectorize()
simple_function4VEC = numpy.vectorize(simple_function4)
# now, apply the vectorized function simple_function4VEC to the columns of data5_df
simple_function4VEC(data5_df["col1"],data5_df["col2"])
array([5., 5., 5.])
# instead of passing the function to numpy.vectorize you can use a decorator
@numpy.vectorize
def simple_function5(x, y):
    """Return the mean of ``x`` and ``y``, or NaN when ``x`` equals 20.

    The ``numpy.vectorize`` decorator makes the function accept
    array-like arguments and apply the scalar logic element by element.
    """
    if x == 20:
        # numpy.nan is the canonical spelling; the numpy.NaN alias was
        # removed in NumPy 2.0
        return numpy.nan
    return (x + y) / 2
# try again, using data5_df
simple_function5(data5_df["col1"],data5_df["col2"])
3 thoughts on “Introduction to Pandas in Python”