Title | “Time Series Analysis with Python” |
Topic | analyzing common time series data using Python |
Date | Fri Sep 16 |
Time | 10am–11am PST |
Keywords | Python, pandas, time series analysis |
These sessions are designed for a broad audience: non-software engineers and software programmers alike, of all backgrounds and skill levels.
Our expected audience should comprise attendees with a…
During this session, we will endeavor to guide our audience toward developing…
In previous seminars, we have seen the use of pandas for analysis of columnar data, and the importance of understanding core NumPy and pandas concepts to make these analyses flexible, efficient, and fluent.
In this seminar, we will take a closer look at analyses involving time data. We’ll construct a sample dataset very similar to actual data that you might work with from Netpipeline, SeRF, ODS, or similar. We’ll investigate a number of specific, actionable analyses (i.e., analyses that tie directly to a business outcome or business decision) and see how to construct them, step by step, in pandas.
Did you enjoy this seminar? Did you learn something new that will help you as you write your own pandas analyses?
In a future seminar, we can go deeper into advanced pandas features and mechanics, such as the use of MultiIndex; transformation operations like .stack, .unstack, .melt, .pivot, and .pivot_table; grouping and windowing analyses using .groupby, .rolling, and .expanding; efficiency with large data sets (and the occasional need to drop down to the NumPy layer); or other topics that would help you make better use of this powerful analytical tool!
If you’re interested in any of these topics, please let us know! Send us an e-mail at learning@dutc.io or contact us over Workplace with your feedback!
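As a small taste of the grouping and windowing topics above, here is a minimal sketch (the entity names and values are made up for illustration) of a per-entity trailing mean computed with .groupby and .rolling:

from pandas import DataFrame, date_range
from numpy.random import default_rng

rng = default_rng(0)

# a tiny, made-up panel: three days of measurements for two entities
df = DataFrame({
    'date': [*date_range('2020-01-01', periods=3)] * 2,
    'entity': ['a'] * 3 + ['b'] * 3,
    'value': rng.integers(-10, +10, size=6),
})

print(
    df.set_index('date')
      .groupby('entity')['value']
      .rolling(2, min_periods=1)  # two-observation trailing window, per entity
      .mean()
)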
The data set below contains events related to devices that you manage. Devices follow a simple lifecycle:
provisioned → (deployed → in use) → (under maintenance → in use → …) → decommissioned
Note that a device isn’t guaranteed to be deployed or to go into use, and that a device may go under maintenance multiple times, or not at all.
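To make the lifecycle concrete, here is a minimal sketch of the state machine as a transition table. (The table and helper below are purely illustrative; they are not part of the data set or of the generator that follows.)

# every state may also simply be a device's last observed state
TRANSITIONS = {
    'Provisioned':    {'Deployed', 'Decommissioned'},
    'Deployed':       {'InUse', 'Decommissioned'},
    'InUse':          {'Maintenance', 'Decommissioned'},
    'Maintenance':    {'InUse', 'Decommissioned'},
    'Decommissioned': set(),
}

def is_valid(states):
    # check each consecutive pair of events against the table
    return all(b in TRANSITIONS[a] for a, b in zip(states, states[1:]))

print(is_valid(['Provisioned', 'Deployed', 'InUse', 'Decommissioned']))  # True
print(is_valid(['Provisioned', 'Maintenance']))                          # False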
print("Let's take a look!")
from datetime import timedelta
from random import Random
from enum import Enum
from string import ascii_lowercase, digits
from tempfile import mkdtemp
from pathlib import Path
from pandas import DataFrame, to_datetime, concat
from numpy import unique
from numpy.random import default_rng
rng = default_rng(0)
rnd = Random(0)
State = Enum('State', 'Provisioned Deployed InUse Maintenance Decommissioned')
Vendor = Enum('Vendor', 'Arista Cisco Juniper')
DeviceType = Enum('DeviceType', 'RackSwitch SpineSwitch FabricSwitch')
def simulate():
    # random duration of up to `days` days, at one-second resolution
    td = lambda days: timedelta(seconds=rnd.randrange(24 * 60 * 60 * days))
    yield State.Provisioned, td(days=90)
    # ~90% of provisioned devices are deployed…
    if rnd.choices([True, False], weights=[.90, .10])[0]:
        yield State.Deployed, td(days=30)
        # …and ~95% of those go into use, alternating with maintenance
        if rnd.choices([True, False], weights=[.95, .05])[0]:
            for _ in range(max(0, int(rnd.gauss(mu=2, sigma=1)))):
                yield State.InUse, td(days=14)
                yield State.Maintenance, td(days=14)
            # ~95% return to use after their last maintenance
            if rnd.choices([True, False], weights=[.95, .05])[0]:
                yield State.InUse, td(days=14)
    # ~50% of all devices are eventually decommissioned
    if rnd.choices([True, False], weights=[.50, .50])[0]:
        yield State.Decommissioned, td(days=60)
if __name__ == '__main__':
    # generate up to 50,000 unique, 9-character device names like 'xxxx-xxxx'
    devices = rng.choice([*ascii_lowercase, *digits], size=(50_000, 9))
    devices[:, 4] = '-'
    devices = unique(devices.view('<U9').ravel())

    # one lifecycle per device: durations are accumulated, then anchored
    # at 2020-01-01 to produce event timestamps
    lifecycles = (
        DataFrame(simulate(), columns='state time'.split())
        .assign(device=d)
        .set_index(['device', 'state'])
        .cumsum()
        .pipe(lambda df: df + to_datetime('2020-01-01'))
        .squeeze(axis='columns')
        for d in devices
    )
    lifecycles = concat(lifecycles, axis='index')

    device_infos = DataFrame(
        index=(idx := devices),
        data={
            'type': rng.choice([*DeviceType], size=len(idx)),
            'vendor': rng.choice([*Vendor], size=len(idx)),
        },
    )

    # write into a fresh temporary directory, exposed via a 'data' symlink
    data_dir, temp_dir = Path('data'), Path(mkdtemp())
    data_dir.unlink(missing_ok=True)
    data_dir.symlink_to(temp_dir)

    (
        lifecycles
        .reset_index()
        .assign(
            device=lambda df: df['device'].astype('category'),
            state=lambda df: df['state'].apply(lambda x: x.name).astype('category'),
        )
        .to_pickle(data_dir / 'lifecycles.pkl')
    )
    (
        device_infos
        .reset_index()
        .assign(
            vendor=lambda df: df['vendor'].apply(lambda x: x.name).astype('category'),
            type=lambda df: df['type'].apply(lambda x: x.name).astype('category'),
        )
        .to_pickle(data_dir / 'devices.pkl')
    )

    print(
        lifecycles.reset_index().head(3),
        device_infos.reset_index().head(3),
        sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
    )
from pandas import read_pickle
from pathlib import Path

data_dir = Path('data')
df = read_pickle(data_dir / 'lifecycles.pkl')

print(
    df,
    df['device'].nunique(),
    df['time'].agg(['min', 'max']),
    df['state'].unique(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
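Before analyzing, it’s also worth a quick sanity check (a sketch; the exact output depends on the generated data) that the categorical and datetime dtypes survived the pickle round-trip:

print(
    df.dtypes,                   # device/state: category; time: datetime64[ns]
    df.memory_usage(deep=True),  # categoricals keep the footprint modest
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)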
from pandas import read_pickle
from pathlib import Path

data_dir = Path('data')
df = read_pickle(data_dir / 'devices.pkl')

print(
    df,
    df['type'].unique(),
    df['vendor'].unique(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import Series, date_range, MultiIndex
from numpy.random import default_rng

rng = default_rng(0)

s1 = Series(
    # index=(idx := range(4)),
    # index=(idx := [*'abcd']),
    # index=(idx := date_range('2020-01-01', periods=3, name='date')),
    index=(idx := MultiIndex.from_product([
        date_range('2020-01-01', periods=3),
        [*'abcd'],
    ], names=['date', 'entity'])),
    data=rng.integers(-10, +10, size=len(idx)),
)
s2 = Series(
    # index=(idx := range(4)),
    # index=(idx := [*'bace']),
    # index=(idx := [*'abcd']),
    index=(idx := date_range('2020-01-02', periods=3, name='date')),
    # index=(idx := MultiIndex.from_product([
    #     date_range('2020-01-02', periods=3),
    #     [*'abce'],
    # ], names=['date', 'entity'])),
    data=rng.integers(-10, +10, size=len(idx)),
)

print(
    s1,
    s2,
    (s1 + s2),  # .loc['2020-01-03':],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
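The sum above aligns the two indexes before computing, leaving NaN wherever a label appears on only one side. Here is a minimal, flat-indexed sketch (toy labels, ours) of that alignment rule, with two common ways to handle the non-overlap:

from pandas import Series

a = Series({'x': 1, 'y': 2})
b = Series({'y': 10, 'z': 20})

print(
    a + b,                   # NaN for 'x' and 'z': present on only one side
    a.add(b, fill_value=0),  # treat missing labels as 0 instead
    (a + b).dropna(),        # or keep only the overlapping labels
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)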
print("Let's take a look!")
from pandas import read_pickle, Grouper
from pathlib import Path

data_dir = Path('data')
df = (
    read_pickle(data_dir / 'lifecycles.pkl')
    # .set_index(['state', 'device'])
    .set_index(['state', 'time'])
)

print(
    # df[(df['state'] == 'Maintenance') & (df['state'] == 'Deployed')],
    # df.loc['Maintenance'],
    # df.loc['Maintenance'].index.get_level_values('device').unique(),

    # per month: unique devices under maintenance as a share of unique
    # devices deployed
    df.loc['Maintenance'].groupby(
        Grouper(level='time', freq='M')
    ).nunique() /
    df.loc['Deployed'].groupby(
        Grouper(level='time', freq='M')
    ).nunique(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
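As an aside, Grouper(level='time', freq='M') plays the same bucketing role that a .to_period('M') grouping key would (the next example uses .to_period directly). A minimal sketch of the correspondence on a toy series, with made-up values:

from pandas import Series, Grouper, date_range
from numpy.random import default_rng

rng = default_rng(0)
s = Series(
    index=(idx := date_range('2020-01-15', periods=60)),
    data=rng.integers(0, 10, size=len(idx)),
)

print(
    s.groupby(Grouper(freq='M')).sum(),       # bucket by calendar month…
    s.groupby(s.index.to_period('M')).sum(),  # …same totals, Period labels
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)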
print("Let's take a look!")
from pandas import read_pickle, IndexSlice
from pathlib import Path

data_dir = Path('data')
df = (
    read_pickle(data_dir / 'lifecycles.pkl')
    .set_index(['time'])
)

print(
    # df.head(3),

    # state counts per (month, ISO week), narrowed to February's
    # maintenance events
    df.groupby([
        df.index.get_level_values('time').to_period('M'),
        df.index.get_level_values('time').isocalendar().week,
    ])['state'].value_counts()
    .loc[IndexSlice['2020-02', :, 'Maintenance']]
    # .iloc[-2:]
    ,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
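A quick note on IndexSlice, since the selector above can look opaque: it is only sugar for building the tuple of per-level selectors that .loc expects. A minimal illustration:

from pandas import IndexSlice

# IndexSlice[...] simply returns whatever is between the brackets,
# so the selector above is just a (label, slice, label) tuple
print(IndexSlice['2020-02', :, 'Maintenance'])
# ('2020-02', slice(None, None, None), 'Maintenance')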
print("Let's take a look!")
from pandas import read_pickle
from pathlib import Path

data_dir = Path('data')
df = (
    read_pickle(data_dir / 'lifecycles.pkl')
    .set_index(['device'])
)
info = (
    read_pickle(data_dir / 'devices.pkl')
    .set_axis(['device', 'type', 'vendor'], axis='columns')
    .set_index(['device'])
)

print(
    # df,
    # info,
    # df.join(info),

    # lifespan, in days, of 'problematic' devices: more than two
    # maintenance events, and eventually decommissioned
    df.assign(
        problematic=lambda df:
            df[df['state'] == 'Maintenance'].groupby('device')['state'].count() > 2
    ).loc[
        lambda df: df['problematic'] == True
    ].loc[lambda df:
        df.assign(
            is_decommission=lambda df: df['state'] == 'Decommissioned'
        ).groupby('device')['is_decommission'].any()
        .loc[lambda x: x].index
    ].groupby(['device'])['time'].agg(['min', 'max'])
    .pipe(
        lambda df: df['max'] - df['min']
    ).dt.days,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
from pandas import read_pickle, IndexSlice
from pathlib import Path

data_dir = Path('data')
df = (
    read_pickle(data_dir / 'lifecycles.pkl')
    .set_index(['device', 'state'])
)
info = (
    read_pickle(data_dir / 'devices.pkl')
    .set_axis(['device', 'type', 'vendor'], axis='columns')
    .set_index(['device'])
)

# sample = df.loc[IndexSlice['000k-3p2a', ['InUse', 'Maintenance']], :].sort_values('time')

print(
    # df,
    # sample.assign(
    #     maintenance_cycle=(
    #         sample.index.get_level_values('state') == 'Maintenance'
    #     ).cumsum()
    # )[lambda df: df['maintenance_cycle'] > 0]
    # .groupby(['maintenance_cycle'])['time'].agg(['min', 'max'])
    # .pipe(lambda df: df['max'] - df['min'])
    # .agg(['sum', 'median', 'mean', 'count'])
    # ,

    # per vendor: how much time do devices spend in maintenance cycles?
    df
    .loc[IndexSlice[:, ['InUse', 'Maintenance']], :]
    .sort_values('time')
    # number each device's maintenance cycles (0 → not yet maintained);
    # assign positionally (.to_numpy()) to avoid aligning on the
    # non-unique (device, state) index
    .pipe(lambda df: df.assign(
        maintenance_cycle=df.assign(
            is_maintenance=df.index.get_level_values('state') == 'Maintenance'
        ).groupby('device')['is_maintenance'].cumsum().to_numpy()
    ))
    [lambda df: df['maintenance_cycle'] > 0]
    .groupby(['device', 'maintenance_cycle'])['time'].agg(['min', 'max'])
    .pipe(lambda df: df['max'] - df['min'])
    .groupby(['device']).agg(['sum', 'median', 'mean', 'count'])
    .join(info)
    .groupby(['vendor'])[['sum', 'median', 'mean', 'count']].agg({
        'sum':    'sum',
        'median': 'median',
        'mean':   'mean',
        'count':  'sum',
    }),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
from pandas import read_pickle
from pathlib import Path

data_dir = Path('data')
df = read_pickle(data_dir / 'lifecycles.pkl')

print(
    df,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
print("Let's take a look!")
from pandas import read_pickle, IndexSlice
from pathlib import Path
data_dir = Path('data')
df = (
read_pickle(data_dir / 'lifecycles.pkl')
)
print(
df,
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)