- read

Python pandas: Quick Tips

Reshama Shaikh 40

About

These are the some of the most commonly used commands I use in pandas, shared here for easy reference. These example uses python version 3.9. and pandas version 1.3.0.

1) Download a `csv` file using `wget`

# import libraries
from pathlib import Path
import pandas as pd
# create directories, if they don't already exist
Path("data_raw").mkdir(parents=True, exist_ok=True)
Path("data_derived").mkdir(parents=True, exist_ok=True)
# assign variable names to the directories
path_datain = "data_raw/"
path_dataout = "data_derived/"
data_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series"
file_name = "time_series_covid19_confirmed_global.csv"
file_url = data_url + "/" + file_name# get the file
!wget -N {file_url} -P {path_datain}
df = pd.read_csv(path_datain + file_name)

Look at dataframe

df.head()

2) Read in a `.csv` file and create a dataframe

df = pd.read_csv(path_datain + file_name)

3) Make a copy of a dataframe

df2 = df.copy()

4) Explore a dataframe

Read in data

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, 6, None],
"country":["India", "Italy", "Kenya", "Unknown"],
"C":[10, 9, 8, 7],
"D":[1, 3, None, 5],
"E":['2020-01-01', '2020-01-02','2020-01-03', 'NaN']})

Get dimensions of dataframe (# rows, # columns)

df.shape()

(5,5)

Get info

df.info()

>output

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 A 4 non-null float64
1 country 5 non-null object
2 C 5 non-null int64
3 D 4 non-null float64
4 E 5 non-null object
dtypes: float64(2), int64(1), object(2)
memory usage: 328.0+ bytes

Get unique occurrences of a variable in a dataframe

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, 6, None],
"country":["India", "Italy", "Kenya", "India"],
"C":[10, 9, 8, 7],
"D":[1, 3, None, 5],
"E":['2020-01-01', '2020-01-02','2020-01-03', 'NaN']})
df['country'].unique()

Get frequency counts for a variable in a dataframe

df['country'].value_counts(normalize=False)

NOTE: Use normalize=True to obtain percents.

5) Keep a subset of variables (columns), using `filter`

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, 6],
"B":[1, 2, 3],
"C":[10, 9, 8],
"D":[1, 3, None]
})
keep_cols = ['A', 'B']df_v2 = df.filter(keep_cols)

6) “Drop” one or multiple variables in a dataframe

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, 6],
"B":[1, 2, 3],
"C":[10, 9, 8],
"D":[1, 3, None]
})
df
drop_cols = ['A', 'B']df.drop(drop_cols, axis=1, inplace=True)df

7) Take subset of rows in a dataframe: using `isin`

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, 6],
"B":[1, 2, 3],
"C":[10, 9, 8],
"D":[1, 3, None]
})
df
filtered_list = [2, 4]check1 = df[df["A"].isin(filtered_list)]check1

8) Re-order columns

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, 6],
"B":[1, 2, 3],
"C":[10, 9, 8],
"D":[1, 3, None]
})
df.head(2)
new_order = ['D', 'A', 'C', 'B']df_new = df.reindex(new_order, axis=1)df_new.head(2)

9) Rename columns in a dataframe

import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[120, 140, 100],
"B":[80, 90, 60],
"C":[10, 9, 8],
"D":[1, 3, None]
})
df
df = df.rename(columns={'A': 'systolic',
'B': 'diastolic'
})
df

10) Transpose a dataframe (pivot table); Use `melt`

# importing pandas as pd
import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"country":["India", "USA", "Mongolia"],
"year": [2019, 2020, 2021],
"A":[10, 9, 8],
"B":[1, 3, None],
"C":['2020-01-01', '2018-01-01', 'NaN']})
df
df_long = df.melt(id_vars=['country', 'year'])df_long

11) Write out a dataframe to a `.csv` file

# Takes dataframe "df" and writes it out a csv file named "report.csv"df.to_csv('data_derived/report.csv')

12) Using `mask` to assign new data values

# importing pandas as pd
import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[2, 4, None],
"country":["India", "USA", "Unknown"],
"C":[10, 9, 8],
"D":[1, 3, None],
"event_date":['2020-01-01', '2018-01-01', 'NaN']})
df

Changing an existing column

# IF the event_date='2018-01-01', then assign country (existing column in df) to be "United States"df['country'].mask(df['event_date'] == '2018-01-01', 'United States', inplace=True)df

Creating a new column

# IF the event_date='2018-01-01', then assign chapter_status (a new column) to be "unbegun"# NOTE: the new column needs to be initializeddf['chapter_status'] = ""df['chapter_status'].mask(df['event_date'] == '2018-01-01', 'unbegun', inplace=True)df

13) Convert from character to integer

# importing pandas as pd
import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[12, 4, 5, None],
"B":[7, 2, 54, 3],
"C":[20, 16, 11, 3],
"D":[14.3, 3.5, 5.0, 6.8]})
df.head()
df['D_n']= df['D'].astype(int)df.head()

14) Replace “NaN” with another value

# importing pandas as pd
import pandas as pd

# Creating the dataframe
df = pd.DataFrame({"A":[12, 4, 5, None, 1],
"B":[7, 2, 54, 3, None],
"C":[20, 16, 11, 3, 8],
"D":[14, 3, None, 2, 6]})
df.head()
table of numbers with 5 columns and 4 rows, 3 places contain a N-a-N value
# replace the NaN values with 1000df.mask(df.isna(), 1000)
# NOTE: use inplace=True, so it is permanently replaced in the dataframe. df.mask(df.isna(), 1000, inplace=True)