Title | “Data Analysis”
Topic | hard problems in data analysis
Date | Fri Oct 28
Time | 1pm–3pm PST
Keywords | pandas
In this advanced workshop, we will share with you some of the most valuable and most complex problems we have had to solve in pandas as part of our work with the Hardware Insights team.
In this workshop, we’ll work through these problems together, to see how we can make very sophisticated analyses straightforward and obvious.
print("Let's take a look!")
The below data set contains events related to devices that you manage. Devices follow a simple lifecycle:
provisioned → (deployed → in use) → (under maintenance → in use → …) → decommissioned
Note that a device isn’t guaranteed to be deployed or go into use, and that a device may or may not go under maintenance multiple times.
Which phase (provisioned → deployed, deployed → in use, in use → decommissioned) has the highest median length? How does the median length of these phases change on a week-by-week basis?
from datetime import timedelta
from random import Random
from enum import Enum
from string import ascii_lowercase, digits
from pandas import DataFrame, to_datetime, concat
from numpy.random import default_rng
rng = default_rng(0)
rnd = Random(0)
State = Enum('State', 'Provisioned Deployed InUse Maintenance Decommissioned')
def simulate():
    td = lambda days: timedelta(seconds=rnd.randrange(24 * 60 * 60 * days))
    yield State.Provisioned, td(days=90)
    if rnd.choices([True, False], weights=[.90, .10])[0]:
        yield State.Deployed, td(days=30)
        if rnd.choices([True, False], weights=[.95, .05])[0]:
            for _ in range(max(0, int(rnd.gauss(mu=2, sigma=1)))):
                yield State.InUse, td(days=14)
                yield State.Maintenance, td(days=14)
            if rnd.choices([True, False], weights=[.95, .05])[0]:
                yield State.InUse, td(days=14)
    if rnd.choices([True, False], weights=[.50, .50])[0]:
        yield State.Decommissioned, td(days=30)

if __name__ == '__main__':
    devices = rng.choice([*ascii_lowercase, *digits], size=(10, 9))
    devices[:, 4] = '-'
    devices = devices.view('<U9').ravel()
    lifecycles = (
        DataFrame(simulate(), columns='state time'.split())
        .assign(device=d)
        .assign(state=lambda df: df['state'].apply(lambda x: x.name))
        .set_index(['device', 'state'])
        .cumsum()
        .pipe(lambda df: df + to_datetime('2020-01-01'))
        .squeeze(axis='columns')
        for d in devices
    )
    lifecycles = concat(lifecycles, axis='index')
    print(lifecycles)
    lifecycles.to_pickle('lifecycles.pkl')
Background & Theory
from pandas import read_pickle
from pandas import date_range
s = read_pickle('lifecycles.pkl')
states = {dev: [] for dev in s.index.get_level_values('device').unique()}
for day_num, dt in enumerate(date_range(s.min().floor('D'), s.max().floor('D'), freq='D')):
    # record any state changes that occurred on this day
    for (dev, new_st), _ in s[lambda s: dt == s.dt.floor('D')].items():
        states[dev].append((dt, new_st))
    # devices with no event today carry their previous state forward
    for dev in states:
        if states[dev] and len(states[dev]) <= day_num:
            states[dev].append((dt, states[dev][-1][-1]))
for dev, sts in states.items():
    print(dev)
    for st in sts:
        print(st)
    break
from pandas import read_pickle
from pandas import date_range
s = read_pickle('lifecycles.pkl')
# for _, x in s.groupby([s.index.get_level_values('device'), s.dt.floor('D')]):
# print(x.index.get_level_values('state')[-1])
# # break
from pandas import Series, date_range
s = Series([1, 2, 3], index=date_range('2020-01-01', periods=3))
print(s)
from pandas import read_pickle, IndexSlice, Series, date_range, MultiIndex
from pandas import date_range, set_option
set_option('display.max_rows', None)
s = read_pickle('lifecycles.pkl')
# print(s)
end_of_day = (
    s
    .reset_index('state', drop=False)
    .assign(
        time=lambda df: df['time'].dt.floor('D')
    )
    .set_index('time', append=True)
    .sort_index()
    .groupby(['device', 'time'])
    .tail(1)  # .head(1)
    .squeeze(axis='columns')
)
all_devices = end_of_day.index.get_level_values('device').unique()
all_days = date_range(
    end_of_day.index.get_level_values('time').min(),
    end_of_day.index.get_level_values('time').max(),
    freq='D',
)
states = Series(
    index=MultiIndex.from_product([all_devices, all_days], names=['device', 'time']),
    dtype=object,
    name='state',
)
states.loc[end_of_day.index] = end_of_day.values
states = states.ffill()
print(
    # end_of_day.loc[IndexSlice[:, '2020-04', :]],
    # end_of_day,
    # all_devices,
    # all_days,
    # every_device_every_day,
    # len(states.loc[end_of_day.index]),
    # len(end_of_day.values),
    # states.loc['29ny-x4yz'],
    # states.groupby('time').value_counts(),
    # states.index.names,
    # states.head(3),
    # states.groupby('time').value_counts().unstack('state').plot(),
    states.groupby('time').value_counts().plot(),
    sep='\n',
)
from matplotlib.pyplot import show
show()
from pandas import Series, date_range, DataFrame
from numpy.random import default_rng
rng = default_rng(0)
power = Series(
    data=rng.random(size=100),
    index=date_range('2020-01-01', periods=100),
)
temperature = Series(
    data=rng.random(size=95),
    index=date_range('2020-01-03', periods=95),
)
print(
    DataFrame({
        'power': power,
        'temperature': temperature,
    }).stack()
)
from pandas import Series, date_range
s1 = Series([1, 2, 3], index=date_range('2020-01-01', periods=3))
s2 = Series([40, 50, 60], index=date_range('2020-01-02', periods=3))
print(
    # s1,
    # s2,
    s1 + s2,
    sep='\n',
)
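To avoid the NaNs that alignment introduces at non-overlapping labels, the binary-operator methods accept a fill_value. A small sketch using the s1/s2 just defined:
print(
    s1.add(s2, fill_value=0),  # treat missing labels as 0 before adding
    sep='\n',
)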
Answer
from pandas import read_pickle
s = read_pickle('lifecycles.pkl')
print(
    s.dt.floor('D').loc[lambda s: s == '2020-04-24']
)
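From here, one route to the full answer (a sketch; not necessarily the exact path taken in the session) is to measure each phase as the gap to the device's next event, then aggregate medians, overall and per week:
from pandas import read_pickle
s = read_pickle('lifecycles.pkl')
# a phase's length is the time until the device's next event; it is labelled by
# the state it starts in (e.g., provisioned → deployed is time spent Provisioned)
phase_lengths = -s.groupby('device').diff(-1)  # NaT for each device's final event
print(phase_lengths.groupby('state').median())
# week-by-week: bucket each phase by the week in which it began
print(phase_lengths.groupby(['state', s.dt.to_period('W')]).median())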
Given MultiIndexes with overlapping levels, how do we perform index-aligned operations to solve complex nested problems without explicit Python-level loops?
print("Let's take a look!")
Given the data below, which represents trades done by a trading desk across multiple books, analyze what would happen (on a per-book level) to the total market value and total profit under all the various combinations of the different scenarios:
abc: market price increases or decreases 2%
def: market price increases or decreases 5%
xyz: market price increases or decreases 10%
from pandas import period_range, date_range, MultiIndex, Series, DataFrame, to_timedelta
from numpy import tile
from numpy.random import default_rng
from string import ascii_lowercase
rng = default_rng(0)
assets = ['abc', 'def', 'ghi', 'jkl', 'xyz']
dates = period_range('2020-01-01', periods=90, freq='D')
prices = Series(
    index=(idx := MultiIndex.from_product([dates, assets], names=['date', 'asset'])),
    data=(
        rng.normal(loc=100, scale=50, size=len(assets)).clip(10, 150)
        * rng.normal(loc=1, scale=.05, size=(len(dates), len(assets))).cumprod(axis=0)
    ).ravel()
).round(2)
books = ['Alice', 'Bob', 'Charlie', 'Dana', 'Evan']
dates = date_range('2020-01-01', periods=3)
trades = (
    DataFrame(
        index=(idx := MultiIndex.from_product(
            [tile(dates, 5), books, assets],
            names=['date', 'book', 'asset'],
        )),
        data={
            'volume': rng.integers(-100_000, +100_000, size=len(idx)).round(-2),
            'price': (
                prices.loc[
                    MultiIndex.from_arrays([
                        idx.get_level_values('date').to_period('D'),
                        idx.get_level_values('asset'),
                    ])
                ] * rng.normal(loc=1, scale=.1, size=len(idx)).clip(.75, 1.25)
            ).values,
        },
    )
    .loc[lambda df: abs(df['volume']) > 0]
    .sample(frac=.25, random_state=rng)
    .pipe(lambda df: df
        .set_axis(MultiIndex.from_arrays([
            df.index.get_level_values('book'),
            df.index.get_level_values('asset'),
            df.index.get_level_values('date') + to_timedelta(rng.integers(60 * 60, size=len(df)), unit='s'),
        ], names=['book', 'asset', 'date']))
    )
    .sort_index()
)
print(
    trades.head(3)
)
Background & Theory
Answer
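A sketch of one possible approach (assumptions of this sketch: the shocked assets are abc, def, and xyz as listed above, all other assets are unchanged, and positions are valued at each asset's final simulated price):
from itertools import product
from pandas import Series
last_price = prices.groupby('asset').last()
shocks = {'abc': .02, 'def': .05, 'xyz': .10}
book_lvl = trades.index.get_level_values('book')
asset_lvl = trades.index.get_level_values('asset')
for signs in product([+1, -1], repeat=len(shocks)):
    # per-asset multiplier: 1 ± shock for the shocked assets, 1 elsewhere
    factors = Series(1.0, index=last_price.index)
    for asset, sign in zip(shocks, signs):
        factors[asset] = 1 + sign * shocks[asset]
    scenario_px = (last_price * factors).reindex(asset_lvl).values
    market_value = (trades['volume'] * scenario_px).groupby(book_lvl).sum()
    profit = (trades['volume'] * (scenario_px - trades['price'])).groupby(book_lvl).sum()
    print(dict(zip(shocks, signs)), market_value, profit, sep='\n')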
print("Let's take a look!")
from datetime import timedelta
from random import Random
from enum import Enum
from string import ascii_lowercase, digits
from pandas import DataFrame, to_datetime, concat
from numpy.random import default_rng
rng = default_rng(0)
rnd = Random(0)
State = Enum('State', 'Provisioned Deployed InUse Maintenance Decommissioned')
def simulate():
    td = lambda days: timedelta(seconds=rnd.randrange(24 * 60 * 60 * days))
    yield State.Provisioned, td(days=90)
    if rnd.choices([True, False], weights=[.90, .10])[0]:
        yield State.Deployed, td(days=30)
        if rnd.choices([True, False], weights=[.95, .05])[0]:
            for _ in range(max(0, int(rnd.gauss(mu=2, sigma=1)))):
                yield State.InUse, td(days=14)
                yield State.Maintenance, td(days=14)
            if rnd.choices([True, False], weights=[.95, .05])[0]:
                yield State.InUse, td(days=14)
    if rnd.choices([True, False], weights=[.50, .50])[0]:
        yield State.Decommissioned, td(days=30)

if __name__ == '__main__':
    devices = rng.choice([*ascii_lowercase, *digits], size=(10, 9))
    devices[:, 4] = '-'
    devices = devices.view('<U9').ravel()
    lifecycles = (
        DataFrame(simulate(), columns='state time'.split())
        .assign(device=d)
        .assign(state=lambda df: df['state'].apply(lambda x: x.name))
        .set_index(['device', 'state'])
        .cumsum()
        .pipe(lambda df: df + to_datetime('2020-01-01'))
        .squeeze(axis='columns')
        for d in devices
    )
    lifecycles = concat(lifecycles, axis='index')
    print(lifecycles)
    lifecycles.to_pickle('lifecycles.pkl')
Background & Theory
from pandas import read_pickle, date_range, IndexSlice, DataFrame, MultiIndex, Series
s = read_pickle('lifecycles.pkl')
end_of_day = (
    s
    .reset_index('state', drop=False)
    .assign(
        time=lambda df: df['time'].dt.floor('D')
    )
    .set_index('time', append=True)
    .sort_index()
    .groupby(['device', 'time'])
    .tail(1)  # .head(1)
    .squeeze(axis='columns')
)
all_devices = end_of_day.index.get_level_values('device').unique()
all_days = date_range(
    end_of_day.index.get_level_values('time').min(),
    end_of_day.index.get_level_values('time').max(),
    freq='D',
)
states = Series(
    index=MultiIndex.from_product([all_devices, all_days], names=['device', 'time']),
    dtype=object,
    name='state',
)
states.loc[end_of_day.index] = end_of_day.values
states = states.ffill().dropna()
print(states)
from pandas import Series, date_range, MultiIndex
from numpy.random import default_rng
from string import ascii_lowercase
rng = default_rng(0)
entities = rng.choice([*ascii_lowercase], size=(5, 9))
entities[:, 4] = '-'
entities = entities.view('<U9').ravel()
s = Series(
    index=(idx := MultiIndex.from_product([
        entities,
        date_range('2020-01-01', periods=14)
    ], names=['entity', 'date'])),
    data=rng.integers(-10, +10, size=len(idx)),
    name='signal',
)
regions = (
    (s > 0)
    .pipe(
        lambda s: (s != s.shift()).groupby('entity').cumsum(),
    )
)
largest_region = regions.pipe(
    lambda s: s.groupby('entity').agg(
        lambda g: g.groupby(g).count().idxmax()
    )
)
print(
    regions
    .reset_index('date', drop=False)
    .set_index('signal', append=True)
    .loc[
        MultiIndex.from_arrays([
            largest_region.index.get_level_values('entity'),
            largest_region.values,
        ], names=['entity', 'signal'])
    ]
    .droplevel('signal')
    .set_index('date', append=True)
    .pipe(
        lambda idx: s.loc[idx.index]
    ),
    # s,
    # True + False,
    # True == 1,
    # False == 0,
    # .loc['chmk-aada'],
    # s.loc['chmk-aada'],
    # s.groupby('entity').max(),
    # s.groupby('entity').agg(
    #     lambda g: g.groupby(g > 0).count().max()
    # ),
    # s.loc['wqnh-bbae'],
    # s.groupby('entity').transform(
    #     lambda g: (g > 0).rolling(3).agg(lambda x: x.all())
    # ),
    # s.loc['chmk-aada'],
    # s.loc['wqnh-bbae'],
    # s.groupby(),
    # s.groupby(s == 2).count(),
    # aggregations
    # s.groupby('entity').mean(),
    # s.groupby('date').mean(),
    # s.groupby(['entity', s.index.get_level_values('date').to_period('M')]).mean(),
    # s.groupby('entity').agg(
    #     lambda g: g.loc[g>0].sum()
    # ),
    # transformations
    # s.groupby('entity').cumsum(),
    # s.groupby('entity').transform(
    #     lambda g: g.rolling(3).mean()
    # ),
    # arbitrary (“applications”)
    sep='\n',
)
Answer
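A compact run-length sketch (an assumption of this sketch: “the signal appears” means value > 0, counted by number of measurements):
positive = s > 0
# a new run starts wherever the boolean flips within an entity
run_id = (positive != positive.groupby('entity').shift()).groupby('entity').cumsum()
# summing booleans per run counts positive measurements; negative runs sum to 0
print(positive.groupby(['entity', run_id]).sum().groupby('entity').max())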
Given a pandas.Series or pandas.DataFrame with a rolling window operation, what do we do if the window sizes are so small that our “user-defined functions” (“UDFs”) are effectively operating at Python speed? How do we drop down into raw numpy to restore lost performance when the operation can be pushed into the “restricted computation domain” (and what limits pandas from being amenable to these optimisations)?
print("Let's take a look!")
The below data represents website telemetry data. Who were the top-10 users, measured by number of days on the website? What was the average time these users spent on each page (assuming that any sequence of page views <1h in duration constitutes a single “visit” or “session”)?
from pandas import date_range, MultiIndex, to_datetime, to_timedelta, DataFrame, CategoricalDtype, Categorical
from numpy.random import default_rng
from numpy import unique, repeat, arange
rng = default_rng(0)
ips = [f'{a}.{b}.{c}.{d}' for a, b, c, d in unique(rng.integers(256, size=(100_000, 4)), axis=0)]
pages = CategoricalDtype(['index.html', 'about.html', 'contact.html'])
df = DataFrame(
    index=(idx := rng.choice(ips, size=10_000_000)),
    data={
        'date':
            to_datetime('2020-01-01')
            + to_timedelta(rng.integers(90 * 24 * 60 * 60, size=len(idx)), unit='s'),
        'page': Categorical.from_codes(
            rng.choice(arange(len(pages.categories)), size=len(idx)),
            dtype=pages,
        ),
    },
).sample(frac=.25, random_state=rng).sort_index()
print(df)
Background & Theory
Answer
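A sketch of one possible answer (assumptions of this sketch: the df above, session boundaries at gaps of ≥1h, and “time on a page” approximated as session duration divided by the pages viewed in it):
from pandas import Timedelta
# top-10 users by number of distinct days seen on the site
top10 = df['date'].dt.floor('D').groupby(level=0).nunique().nlargest(10)
views = df[df.index.isin(top10.index)].sort_values('date')
gap = views.groupby(level=0)['date'].diff()
session = (gap >= Timedelta('1h')).groupby(level=0).cumsum()  # new session id at each ≥1h gap
per_session = views['date'].groupby([views.index, session]).agg(['min', 'max', 'size'])
print(
    ((per_session['max'] - per_session['min']) / per_session['size'])
    .groupby(level=0).mean()
)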
How do we sort and select in pandas using a computed key (equivalent to the use of the key= argument in the Python builtins max, min, and sorted)?
The below data set represents temperature and precipitation measurements for some regions.
Use it to answer the following questions:
from pandas import DataFrame, period_range, MultiIndex
from numpy.random import default_rng
from scipy.stats import skewnorm
rng = default_rng(0)
dates = period_range('2020-01-01', periods=90, freq='D')
regions = ['NY', 'CA', 'FL', 'TX']
df = DataFrame(
    index=(idx := MultiIndex.from_product([dates, regions], names=['date', 'region'])),
    data={
        'humidity': rng.uniform(0, 1, size=len(idx)),
        'precipitation': skewnorm(a=4).rvs(size=len(idx), random_state=rng).clip(0),
        'temperature': rng.normal(loc=75, scale=15, size=len(idx)),
    },
).round(2).sort_index()
print(df)
Background & Theory
Answer
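A small sketch of key-based operations (assuming the df above; Series.sort_values has accepted a key= callable since pandas 1.1, mirroring the builtins):
# days whose temperature sits farthest from 75°F, via a computed sort key
print(
    df['temperature']
    .sort_values(key=lambda s: (s - 75).abs(), ascending=False)
    .head()
)
# idxmax returns the label of the maximum, much like max(..., key=...)
print(df['precipitation'].idxmax())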
How do we perform pandas.DataFrame.groupby operations that require operating on multiple columns without using pandas.DataFrame.groupby.apply? (e.g., how do we compute weighted averages without reaching for pandas.DataFrame.groupby.apply?)
print("Let's take a look!")
Using the below trading data, compute the weighted average buy and sell traded price of each asset using the trade volumes as weights.
from numpy.random import default_rng
from pandas import period_range, MultiIndex, Series, date_range, DataFrame
from numpy import tile
from string import ascii_lowercase
rng = default_rng(0)
portfolios = ['Alice', 'Bob', 'Charlie']
assets = rng.choice([*ascii_lowercase], size=(4, 4)).view('<U4').ravel()
dates = date_range('2020-01', periods=90)
idx = MultiIndex.from_product([
    dates,
    assets,
], names=['date', 'asset'])
prices = DataFrame(
    index=idx,
    data={
        'bid': (bid := (
            rng.normal(loc=100, scale=20, size=len(assets))
            * rng.normal(loc=1, scale=.05, size=(len(dates), len(assets))).cumprod(axis=0)
        ).ravel()),
        'ask': bid * rng.normal(.95, scale=.02, size=len(bid)).clip(0, .995),
    },
).round(2).rename_axis('direction', axis='columns')
idx = MultiIndex.from_product([
    date_range('2020-01-01', periods=90),
    portfolios,
    tile(assets, 5),
], names=['date', 'portfolio', 'asset'])
trades = DataFrame({
    'volume': (volumes := rng.integers(-1_000_000, +1_000_000, size=len(idx)).round(-2)),
    'price': (
        (prices['bid'] - prices['ask']).loc[
            px_idx := MultiIndex.from_arrays([
                idx.get_level_values('date').floor('D'),
                idx.get_level_values('asset'),
            ])
        ].values * rng.normal(loc=1, scale=.20, size=len(idx))
    ).round(2)
    + prices.loc[px_idx, 'ask'].values,
}, index=idx).sample(random_state=rng, frac=.25).sort_index()
print(
    trades.head(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Background & Theory
Answer
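A sketch that avoids .apply entirely (assuming the trades above, with positive volume meaning buy and negative meaning sell): a weighted average is just a ratio of two grouped sums, sum(w*x) / sum(w).
from numpy import where
print(
    trades
    .assign(
        direction=lambda df: where(df['volume'] > 0, 'buy', 'sell'),
        w=lambda df: df['volume'].abs(),
        wx=lambda df: df['volume'].abs() * df['price'],
    )
    .groupby(['asset', 'direction'])[['w', 'wx']].sum()
    .pipe(lambda df: df['wx'] / df['w'])
)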
How do we write chained pandas operations using .loc[lambda x: x], .pipe, .assign, and PEP-572 assignment expressions (:=)? Why might we actually choose to do this?
print("Let's take a look!")
The following data represents some measurements you have taken in the field. Given the below data, in one chained operation: bin the values into the intervals [0, 2), [2, 5), and [5, 10], and find their mean.
from pandas import DataFrame, date_range, to_datetime, to_timedelta
from numpy.random import default_rng
rng = default_rng(0)
df = DataFrame({
    'date': to_datetime('2020-01-01')
        + to_timedelta(rng.integers(90 * 24 * 60 * 60, size=(sz := 100)), unit='s'),
    'entity': rng.choice(['abc', 'def', 'xyz'], size=sz),
    'value': rng.integers(-10, +10, size=sz),
})
print(df)
Background & Theory
Answer
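One such chained sketch (assuming the df above; because the values are integers, the closed interval [5, 10] can be expressed as [5, 11) for cut's half-open bins):
from pandas import cut
print(
    df
    .loc[lambda d: d['value'] >= 0]
    .assign(bucket=lambda d: cut(d['value'], bins=[0, 2, 5, 11], right=False))
    .groupby('bucket', observed=True)['value']
    .mean()
)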
What are some common pandas pitfalls, such as the SettingWithCopyWarning?
print("Let's take a look!")
Why does the pandas SettingWithCopyWarning come up? How do we avoid it?
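A minimal illustration (a sketch, not taken from the session): chained indexing leaves pandas unsure whether the assignment targets a view or a copy of the original; an explicit .copy() (or a single .loc assignment on the original) removes the ambiguity.
from pandas import DataFrame
frame = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
sub = frame[frame['a'] > 1]
sub['b'] = 0                        # may emit SettingWithCopyWarning: view or copy?
sub = frame[frame['a'] > 1].copy()
sub['b'] = 0                        # explicit copy: unambiguous, no warning
frame.loc[frame['a'] > 1, 'b'] = 0  # or write through .loc on the original directly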
The following data represents some measurements you have taken in the field. Take the below data, and for every week in February, take the total number of flags that occurred in that week and subtract it from the negative values or add it to the positive values.
from pandas import DataFrame, date_range, to_datetime, to_timedelta, MultiIndex
from numpy.random import default_rng
rng = default_rng(0)
dates = to_datetime('2020-01-01') + to_timedelta(rng.integers(90 * 24 * 60 * 60, size=(sz := 100)), unit='s')
entities = rng.choice(['abc', 'def', 'xyz'], size=sz)
idx = MultiIndex.from_product([dates, entities], names=['date', 'entity'])
df = DataFrame({
    'flag': rng.choice([True, False], size=len(idx)),
    'value': rng.integers(-10, +10, size=len(idx)),
}, index=idx)
print(df)
Background & Theory
Answer
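A sketch (assuming the df above; adding the weekly total to positive values and subtracting it from negative values is the same as adding the total multiplied by each value's sign):
from numpy import sign
feb = df.loc[lambda d: d.index.get_level_values('date').month == 2]
weekly_total = (
    feb['flag']
    .groupby(feb.index.get_level_values('date').to_period('W'))
    .transform('sum')
)
print(feb['value'] + sign(feb['value']) * weekly_total)  # zeros are left unchanged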
Given a pandas.Series containing a signal, how do we determine the maximum number of times the signal consecutively appears (by number of measurements, or by distance from minimum to maximum index value)?
Below is code for a simple genetic algorithm with the following mechanics. In this simulation, we:
create a population of chromosomes, each a sequence of 0 or 1 values
score each chromosome's fitness by the number of 1 values it contains
breed pairs of chromosomes, selected in proportion to their fitness, to produce 1 offspring chromosome
Using the simulation code below, identify how tools like numpy, pandas, xarray, and features in pure Python “fit together.”
from numpy import arange
from numpy.random import default_rng
from dataclasses import dataclass, replace
from matplotlib.pyplot import plot, legend, show
from pandas import DataFrame
from xarray import DataArray
from itertools import islice
@dataclass
class Step:
    population : DataArray
    rng : object

    @classmethod
    def from_random(cls, size, length, *, rng=None):
        if rng is None:
            rng = default_rng(0)
        population = DataArray(
            rng.integers(0, 2, size=(size, length)),
            dims=['size', 'length'],
        )
        return cls(population, rng=rng)

    @property
    def fitness(self):
        return self.population.mean().item()

    def simulate(self, *, mutation_rate=1e-4):
        rng = self.rng
        population = self.population
        size, length = population.sizes['size'], population.sizes['length']
        yield self
        while True:
            p = population.mean(dim='length')
            p /= p.sum()
            # TODO: choose only non-same mating pairs
            parents = rng.choice(arange(size), size=(size, 2), p=p)
            left, right = population[parents[:, 0]], population[parents[:, 1]]
            xover_mask = rng.integers(0, 2, size=(size, length), dtype=bool)
            xover_mask.sort(axis=1)
            mutation_mask = rng.choice(
                [True, False], p=(mutation_rate, 1 - mutation_rate),
                size=(size, length)
            )
            population = ((left * xover_mask) + (right * ~xover_mask)) ^ mutation_mask
            yield replace(self, population=population)

if __name__ == '__main__':
    scenarios = {
        length: Step.from_random(size=(SIZE := 250), length=length).simulate()
        for length in {5, 10, 25, 50, 100}
    }
    results = DataFrame({
        sc: [st.fitness for st in islice(sim, 150)]
        for sc, sim in sorted(scenarios.items())
    })
    results.columns.name, results.index.name = 'length', 'step'
    print(' {} '.format('Results').center(40, '\N{box drawings light horizontal}'))
    print(results)
Further Explanation
In this genetic algorithm, we “breed” two chromosomes to produce an offspring chromosome.
Over time, we would expect repeated breeding to produce “fitter” and “fitter” offspring, but this requires that mates—the parents of each generation—be selected according to their fitness.
Our fitness function is the number of ones in the chromosome. [0, 0, 0, 0, 0] has a fitness of zero (or, as a percentage: 0%); [1, 1, 1, 1, 1] has a perfect fitness of five (or, as a percentage: 100%).
“Fitness-proportional selection” means: at each stage when breeding the next generation, the parents should be selected randomly with weighting given to parents that score higher in our fitness function.
from random import choices
population = [
    [0, 0, 0, 0, 0],  # fitness 0%: no probability of being selected for mating
    [0, 0, 0, 0, 1],  # fitness 20%: low probability of being selected for mating
    [1, 1, 1, 0, 0],  # fitness 60%: medium probability of being selected for mating
    [0, 1, 1, 1, 0],  # fitness 60%: medium probability of being selected for mating
    [1, 1, 1, 0, 1],  # fitness 80%: high probability of being selected for mating
]
fitnesses = [sum(chromo)/len(chromo) for chromo in population]
total_fitness = sum(fitnesses)
weights = [fit/total_fitness for fit in fitnesses]
print(f'{choices(population, weights=weights, k=2) = }')
The overall fitness of the population can further grow via some randomisation factors. We will implement two:
When breeding two parents x and y to create a child z, we construct the child by lining up x and y and finding a random point at which to stitch the two together. This is what we mean by “one-point crossover.”
# the parents...
parents = x, y = \
    [0, 0, 1, 0, 1], \
    [1, 1, 1, 1, 0],
#          ##
# stitch here, taking the left of x and the right of y,
# or taking the left of y and the right of x
# the child...
child = z = [0, 0] + [1, 1, 0]  # possibility I: left-of-x + right-of-y
child = z = [1, 1] + [1, 0, 1]  # possibility II: left-of-y + right-of-x
Once you have performed crossover to construct the child, you can further add in randomness by performing random, single-bit flip mutations. This is what we mean by “bit-flip mutation”: randomly select bits and flip them.
from random import choices
child = [0, 0, 1, 1, 0]
print(f'Before: {child = }')
child = [bit ^ choices([True, False], weights=((MUTATION_RATE := .5), 1 - MUTATION_RATE))[0]
         for bit in child]
print(f'After: {child = }')
Background & Theory
Answer
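For a visual comparison, the results DataFrame from the simulation above can be plotted directly, one line per chromosome length:
from matplotlib.pyplot import show
results.plot(title='mean fitness per generation')
show()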
How do we work in pandas on intervals of arbitrary convention (i.e., open/open, closed/closed, or open/closed) using pandas.cut?
print("Let's take a look")
Given the below time series measurements, select all the values on the following intervals:
[Jan-2, Jan-4]
[Jan-2, Jan-4)
(Jan-2, Jan-4]
(Jan-2, Jan-4)
In the above interval notation, a [ or ] means “include this endpoint” and a ( or ) means “do not include this endpoint.”
from pandas import Series, to_datetime, to_timedelta
from numpy.random import default_rng
rng = default_rng(0)
s = Series(
    index=(idx :=
        to_datetime('2020-01-01')
        + to_timedelta(rng.normal(loc=5, scale=2, size=100_000).clip(0).round().cumsum(), unit='s')
    ),
    data=rng.choice([True, False], size=len(idx)),
).sort_index()
print(
    s,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
Background & Theory
.loc vs .iloc
Implicitly hierarchical indices…
pandas.cut…
Answer
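A sketch of the four selections (treating the endpoints as the literal midnight timestamps Jan-2 00:00:00 and Jan-4 00:00:00):
from pandas import to_datetime
a, b = to_datetime('2020-01-02'), to_datetime('2020-01-04')
closed_closed = s.loc[a:b]                             # [a, b]: .loc label slices include both ends
closed_open   = s.loc[(s.index >= a) & (s.index < b)]  # [a, b)
open_closed   = s.loc[(s.index > a) & (s.index <= b)]  # (a, b]
open_open     = s.loc[(s.index > a) & (s.index < b)]   # (a, b)
print(len(closed_closed), len(closed_open), len(open_closed), len(open_open))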
How do we slice a pandas.Series or pandas.DataFrame with a MultiIndex using pandas.IndexSlice? How do we perform exact entity selections with .loc and constructed MultiIndexes?
print("Let's take a look")
Given the below time series measurements, align them so that they match on the nearest time entry. Try rounding the time entries to the nearest day, then matching. (In the case of multiple entries per day, either match every entry against every entry, or match against a grouped or rolling aggregate.)
from pandas import Series, to_datetime, to_timedelta, MultiIndex
from numpy.random import default_rng
from string import ascii_lowercase
rng = default_rng(0)
idx = MultiIndex.from_arrays([
    to_datetime('2020-01-01') + to_timedelta(rng.integers(24*60*60, size=100).cumsum(), unit='s'),
    rng.choice([*ascii_lowercase[:5]], size=100),
])
s1 = Series(rng.normal(size=len(idx)).round(2), index=idx)
idx = MultiIndex.from_arrays([
    to_datetime('2020-01-01') + to_timedelta(rng.integers(24*60*60, size=100).cumsum(), unit='s'),
    rng.choice([*ascii_lowercase[:5]], size=100),
])
s2 = Series(rng.integers(-10, +10, size=len(idx)), index=idx)
print(
    s1.head(2),
    s2.head(2),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40)
)
Background & Theory
Answer
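A sketch of the day-rounding approach (assuming the s1/s2 above, and aggregating multiple same-day entries with a mean before aligning):
def daily(s):
    # floor the timestamp level to the day, then aggregate duplicate (day, entity) pairs
    return s.groupby([
        s.index.get_level_values(0).floor('D'),
        s.index.get_level_values(1),
    ]).mean()
print(daily(s1) + daily(s2))  # index-aligned on (day, entity); non-overlapping pairs become NaN
For true nearest-entry matching, rather than rounding, pandas.merge_asof with direction='nearest' is the usual tool, though it operates on sorted, single-level time keys.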