seminars.fb

Data Analysis & Engineering → “How Python, numpy, and pandas ‘Fit’ Together”

Theme: Data Analysis & Engineering

Topic: How Python, numpy, and pandas ‘Fit’ Together

Presenter: James Powell james@dutc.io

Date: Monday, October 30, 2020

Time: 12 PM PST

Keywords: Python, data anaylsis, data engineering, numpy, pandas

# GIVEN: given a DataFrame that looks like this…

#     a   b  c  d  e  # a - is a label, drawn from "aa" ~ "zz"
#     zz  1 .2 .4 .1  # b - is drawn randomly from [0, 1, 2]
#     zz  0 .3 .2 .3  #        with 50% of a 1 and 25% of a 0 or 2
#     yb  2 .1 .7 .6  # c, d, e - is drawn from a normal
#                     #           distribution with a mean of 0.5
#                     #           and std of 2.5

from pandas import DataFrame
from numpy import arange
from numpy.random import choice, normal
from random import randint
from string import ascii_lowercase
from numpy import column_stack

def random_hostname():
    st = choice(['fsw', 'rsw', 'ssw'])
    return f'{st}{randint(0, 10)}.p{randint(0, 10)}.fbnet'

SIZE = 1_000
df = DataFrame({
#'a': choice([*ascii_lowercase], size=((SIZE:=1_000), 2)).view('<U2').ravel(),
    'a': [random_hostname() for _ in range(SIZE)],
    'b': choice(arange(3), p=[.25, .5, .25], size=SIZE),
    'c': normal(0.5, 2.5, size=SIZE),
    'd': normal(0.5, 2.5, size=SIZE),
    'e': normal(0.5, 2.5, size=SIZE),
})
# df['a'] = df['a'].str.repeat(2)

# TASK: convert a into a Categorical value
from pandas import Categorical
df['a'] = Categorical(df['a'])

# TASK: print the number of times the various a-labels appear.
#       i.e., xxx - 10 times; yyy - 8 times, zzz - 2 times
df['a'].value_counts()
df['a'].str.extract('([a-z]+)[0-9]+\.(.+)', expand=True)[0].value_counts()

# TASK: (i) print the average of column c and the variance of column d
#       (iia) print the average of column c and the variance of column d
#             for rows where a is after "gg"
#       (iib) print the average of column c and the variance of column d
#             for rows where c <= d < e
#       (iii) print the average of column c and the variance of column d…
#             … PER value of a
df["c"].mean()
df["d"].var()
df["c"].median()
df.set_index('a', inplace=True)
from numpy import array
from datetime import timedelta, datetime
from pandas import to_datetime
times = array([timedelta(seconds=randint(0, 60)) for _ in range(len(df))]).cumsum() + datetime.now()
df.set_index(times, append=True, inplace=True)
key = "fsw8.p1.fbnet"
try:
    df.loc[key]["c"].mean()
    df[df["b"] > 1].loc[key].mean()
except KeyError:
    #  print(f'Device {key} not in dataset')
    pass

# TASK: group the above by the the `a` value WITHOUT making it the index
#       and WITHOUT sorting it

df.reset_index(1, drop=True, inplace=True)
df.set_index('b', append=True, inplace=True)

df.groupby(level=(0, 1))[['c', 'd']].sum().unstack().sample(3)
df.pivot_table(index='a', columns='b', values=['c', 'd'], aggfunc='sum').sample(3)

#   minor axis (minor index) `.columns` / column index
#   -------
# |
# |    data: row data, column data
# |
# |
# major axis
# majox index
# `.index`
# row index

df['f'] = df['c'] - df['d']
print(df.sample(3))

# TASK: compute a pivot table where the left index is the values of `a`
#       tabulated against the values of `b`, where the entries are the values of
#       the means of the difference of `c` and `d`

#      col     col     col
# a
# b
# c

# TASK: group by `a` and `b` and show the sum of the `c` values only
#       pivot by `a` and `b` and show the sum of the `c` values
#       make these two DataFrames look equivalent

# TASK: investigate .groupby.apply and .groupby.transform
# TASK: group by `a` and apply a transformation to compute the mean
#       of each of `c`, `d`, and `e` and the sum of `b`
#       use `groupby(…).transform`
# TASK: try to repeat the above using `.groupby(…).apply`