numpy
, and pandas
‘Fit’ Together”Theme: Data Analysis & Engineering
Topic: How Python, numpy
, and pandas
‘Fit’ Together
Presenter: James Powell james@dutc.io
Date: Monday, October 30, 2020
Time: 12 PM PST
Keywords: Python, data anaylsis, data engineering, numpy, pandas
# GIVEN: given a DataFrame that looks like this…
# a b c d e # a - is a label, drawn from "aa" ~ "zz"
# zz 1 .2 .4 .1 # b - is drawn randomly from [0, 1, 2]
# zz 0 .3 .2 .3 # with 50% of a 1 and 25% of a 0 or 2
# yb 2 .1 .7 .6 # c, d, e - is drawn from a normal
# # distribution with a mean of 0.5
# # and std of 2.5
from pandas import DataFrame
from numpy import arange
from numpy.random import choice, normal
from random import randint
from string import ascii_lowercase
from numpy import column_stack
def random_hostname():
st = choice(['fsw', 'rsw', 'ssw'])
return f'{st}{randint(0, 10)}.p{randint(0, 10)}.fbnet'
SIZE = 1_000
df = DataFrame({
#'a': choice([*ascii_lowercase], size=((SIZE:=1_000), 2)).view('<U2').ravel(),
'a': [random_hostname() for _ in range(SIZE)],
'b': choice(arange(3), p=[.25, .5, .25], size=SIZE),
'c': normal(0.5, 2.5, size=SIZE),
'd': normal(0.5, 2.5, size=SIZE),
'e': normal(0.5, 2.5, size=SIZE),
})
# df['a'] = df['a'].str.repeat(2)
# TASK: convert a into a Categorical value
from pandas import Categorical
df['a'] = Categorical(df['a'])
# TASK: print the number of times the various a-labels appear.
# i.e., xxx - 10 times; yyy - 8 times, zzz - 2 times
df['a'].value_counts()
df['a'].str.extract('([a-z]+)[0-9]+\.(.+)', expand=True)[0].value_counts()
# TASK: (i) print the average of column c and the variance of column d
# (iia) print the average of column c and the variance of column d
# for rows where a is after "gg"
# (iib) print the average of column c and the variance of column d
# for rows where c <= d < e
# (iii) print the average of column c and the variance of column d…
# … PER value of a
df["c"].mean()
df["d"].var()
df["c"].median()
df.set_index('a', inplace=True)
from numpy import array
from datetime import timedelta, datetime
from pandas import to_datetime
times = array([timedelta(seconds=randint(0, 60)) for _ in range(len(df))]).cumsum() + datetime.now()
df.set_index(times, append=True, inplace=True)
key = "fsw8.p1.fbnet"
try:
df.loc[key]["c"].mean()
df[df["b"] > 1].loc[key].mean()
except KeyError:
# print(f'Device {key} not in dataset')
pass
# TASK: group the above by the the `a` value WITHOUT making it the index
# and WITHOUT sorting it
df.reset_index(1, drop=True, inplace=True)
df.set_index('b', append=True, inplace=True)
df.groupby(level=(0, 1))[['c', 'd']].sum().unstack().sample(3)
df.pivot_table(index='a', columns='b', values=['c', 'd'], aggfunc='sum').sample(3)
# minor axis (minor index) `.columns` / column index
# -------
# |
# | data: row data, column data
# |
# |
# major axis
# majox index
# `.index`
# row index
df['f'] = df['c'] - df['d']
print(df.sample(3))
# TASK: compute a pivot table where the left index is the values of `a`
# tabulated against the values of `b`, where the entries are the values of
# the means of the difference of `c` and `d`
# col col col
# a
# b
# c
# TASK: group by `a` and `b` and show the sum of the `c` values only
# pivot by `a` and `b` and show the sum of the `c` values
# make these two DataFrames look equivalent
# TASK: investigate .groupby.apply and .groupby.transform
# TASK: group by `a` and apply a transformation to compute the mean
# of each of `c`, `d`, and `e` and the sum of `b`
# use `groupby(…).transform`
# TASK: try to repeat the above using `.groupby(…).apply`