.index
Title | Don’t Forget About the .index! |
Topic | Fluent Use of pandas & core concepts |
Date | Fri Dec 17 |
Keywords | pandas.Index , MultiIndex |
These sessions are designed for a broad audience of non-software engineers and software programmers of all backgrounds and skill-levels.
Our expected audience should comprise attendees with a…
pandas
for columnar data analysis. During this session, we will endeavour to guide our audience to developing…
pandas.Series
and pandas.DataFrame
objects.pandas.Index
.In a previous seminar, we discussed data analysis of tabular/columnar data sets using pandas
.
In this seminar, we will take a close look at what makes pandas
special: the
index! We’ll look at how a pandas.Series
can be conceptualised as a
mathematical “restricted computation domain” similar to the numpy.ndarray
with some business information attached—a coördinate system or indexing system.
We’ll talk about how a pandas.DataFrame
is not a two-dimensional data structure
but a doubly-indexed collection of index-aligned one-dimensional data. We’ll discuss
the mechanisms employed by the Index as part of .loc
operations, and core concepts
of implicit/explicit hierarchy and monotonicity.
Agenda:
pandas.Index
and .get_loc
, indexing, monotonicity; int64
vs Int64
).index
vs .columns
.T
, .reindex
, .set_index
, .reset_index
, .rename
, .swapaxes
, .reorder_levels
, .swaplevel
, .droplevel
MultiIndex
, explicit vs implicit hierarchy; .melt
, .pivot
Did you enjoy this seminar? Did you learn something new that will help you
use pandas
more effectively, as you do more analysis in your work?
In a future seminar, we may dive deeper into the mechanics of pandas
. We can:
pandas.DataFrame
s)pandas
, such as subclassing pandas.Series
and pandas.DataFrame
and defining registered accessorspandas.DataFrame
and see how all operations can be described in-terms of manipulations of the .index
, the .columns
, and the ._data
# Kick off the session with a greeting.
message = "Let's get started!"
print(message)
from itertools import islice

# Peek at the raw file: echo the first five lines of data.csv verbatim.
with open('data.csv') as f:
    head = islice(f, 5)
    for line in head:
        print(f'{line = }')
from itertools import islice
from csv import reader

# Parse data.csv by hand into two parallel lists of ints.
# islice(f, 1, None) skips the header row before csv.reader sees the lines.
bytes_in = []
bytes_out = []
with open('data.csv') as f:
    for fields in reader(islice(f, 1, None)):
        # columns 4 and 5 are the byte counters; other columns are ignored
        bytes_in.append(int(fields[4]))
        bytes_out.append(int(fields[5]))
print(
    # uncomment one line at a time during the demo:
    # f'{bytes_in[:5] = }…',
    # f'{bytes_out[:5] = }…',
    # f'{max(bytes_in) = :,}',
    # f'{max(bytes_out) = :,}',
    # f'{max(x - y for x, y in zip(bytes_out, bytes_in)) = :,}',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import reader
from datetime import datetime

# Parse every row into a plain tuple. Field order matters: tuples compare
# element-wise, so max(entries) below effectively means "latest date".
entries = []
with open('data.csv') as f:
    next(f)  # skip the header row
    for fields in reader(f):
        # positions 2–3 (datacenter/building, per the later pandas cells) unused here
        date, entity, _, _, bytes_in, bytes_out, temp_board, temp_case = fields
        date = datetime.fromisoformat(date)
        bytes_in, bytes_out = int(bytes_in), int(bytes_out)
        temp_board, temp_case = float(temp_board), float(temp_case)
        ent = date, entity, bytes_in, bytes_out, temp_board, temp_case
        entries.append(ent)
print(
    # f'{entries[0] = }',
    # f'{max(entries) = }',
    # f'{max(entries, key=lambda ent: ent[2]) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import reader
from datetime import datetime
from collections import namedtuple

class Entry(namedtuple('Entry', 'date entity bytes_in bytes_out temp_board temp_case')):
    """One row of data.csv as a named, typed record (fields accessible by name)."""
    @classmethod
    def from_csv(cls, fields):
        """Build an Entry from one raw csv.reader row (a list of strings)."""
        # positions 2–3 (datacenter/building, per the later pandas cells) are dropped
        date, entity, _, _, bytes_in, bytes_out, temp_board, temp_case = fields
        date = datetime.fromisoformat(date)
        bytes_in, bytes_out = int(bytes_in), int(bytes_out)
        temp_board, temp_case = float(temp_board), float(temp_case)
        return cls(date, entity, bytes_in, bytes_out, temp_board, temp_case)

entries = []
with open('data.csv') as f:
    next(f)  # skip the header row
    for fields in reader(f):
        entries.append(Entry.from_csv(fields))
print(
    # f'{entries[0] = }',
    # f'{max(entries) = }',
    # f'{max(entries, key=lambda ent: ent.bytes_in) = }',
    # f'{max(entries, key=lambda ent: ent.bytes_out) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import DictReader
from datetime import datetime

# Same parse, but csv.DictReader consumes the header itself and yields each
# row as a dict keyed by column name.
entries = []
with open('data.csv') as f:
    for ent in DictReader(f):
        # NOTE: inner loop variable renamed from `f` — the original shadowed
        # the open file handle (harmless here since DictReader already holds
        # its own reference, but needlessly confusing).
        for field in 'bytes_in bytes_out'.split():
            ent[field] = int(ent[field])
        for field in 'temp_board temp_case'.split():
            ent[field] = float(ent[field])
        for field in {'date'}:
            ent[field] = datetime.fromisoformat(ent[field])
        entries.append(ent)
print(
    # f'{entries[0] = }',
    # f'{max(entries, key=lambda ent: ent["bytes_in"]) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import DictReader
from datetime import datetime, date
from collections import defaultdict

# Parse rows, then group bytes_in by calendar day with a defaultdict(list).
entries = []
with open('data.csv') as f:
    for ent in DictReader(f):
        # NOTE: inner loop variable renamed from `f` — the original shadowed
        # the open file handle.
        for field in 'bytes_in bytes_out'.split():
            ent[field] = int(ent[field])
        for field in 'temp_board temp_case'.split():
            ent[field] = float(ent[field])
        for field in {'date'}:
            ent[field] = datetime.fromisoformat(ent[field])
        entries.append(ent)

by_day = defaultdict(list)
for ent in entries:
    # .date() truncates the timestamp to the day — the grouping key
    by_day[ent['date'].date()].append(ent['bytes_in'])
print(
    # f'{by_day[date(2020, 1, 2)] = }',
    # f'{({k: max(v) for k, v in by_day.items()}) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import DictReader
from datetime import datetime, date
from collections import defaultdict, Counter

# Parse rows, then accumulate per-day, per-entity byte totals in Counters —
# which makes day-over-day comparison a single `-` (see next cell).
entries = []
with open('data.csv') as f:
    for ent in DictReader(f):
        # NOTE: inner loop variable renamed from `f` — the original shadowed
        # the open file handle.
        for field in 'bytes_in bytes_out'.split():
            ent[field] = int(ent[field])
        for field in 'temp_board temp_case'.split():
            ent[field] = float(ent[field])
        for field in {'date'}:
            ent[field] = datetime.fromisoformat(ent[field])
        entries.append(ent)

by_day = defaultdict(Counter)
for ent in entries:
    by_day[ent['date'].date()][ent['entity']] += ent['bytes_in']
print(
    # f'{by_day[date(2020, 1, 2)] = }',
    # f'{by_day[date(2020, 1, 2)] - by_day[date(2020, 1, 1)] = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from collections import Counter

# Three days of per-entity byte totals. Counter arithmetic: `-` keeps only
# positive results (day3 has no 'xyz', so it vanishes from day3 - day2);
# `+` merges counts.
day1, day2, day3 = (
    Counter(totals)
    for totals in (
        {'abc': 10, 'def': 200, 'xyz': 3_000},
        {'abc': 13, 'def': 234, 'xyz': 3_123},
        {'abc': 17, 'def': 256},
    )
)
print(
    # f'{day2 - day1 = }',
    # f'{day3 - day2 = }',
    # f'{day2 + day3 = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
numpy.ndarray
from numpy import load, argmax

# The same measurements, pre-extracted into .npy files: dense, homogeneous,
# vectorisable — but the "business" labels now live in a separate array that
# must be kept positionally aligned by hand.
bytes_in = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')
entity = load('entity.npy', allow_pickle=True)  # presumably object dtype (strings) — hence allow_pickle
print(
    # f'{bytes_in.shape[0] = :,}',
    # f'{bytes_out.shape[0] = :,}',
    # f'{bytes_in - bytes_out = :}',
    # f'{(bytes_in - bytes_out).max() = :,}',
    # f'{(bytes_in - bytes_out).min() = :,}',
    # f'{entity[argmax(bytes_out - bytes_in)] = :}',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
xarray.DataArray
from numpy import load, vstack, subtract

from xarray import DataArray

# Wrap the stacked 2-D array in a DataArray with no dims/coords yet —
# axes are still only addressable positionally (data[0], data[1]).
bytes_in = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')
data = DataArray(
    data=vstack([bytes_in, bytes_out]),  # row 0: bytes_in, row 1: bytes_out
)
print(
    # data,
    # subtract(data[0], data[1]),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load, vstack, subtract

from xarray import DataArray

# Now the axes get *names*: reductions can say dim='entity' instead of the
# error-prone positional axis=1.
bytes_in = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')
data = DataArray(
    data=vstack([bytes_in, bytes_out]),
    dims=['measurement', 'entity'],
)
print(
    # data,
    # data.mean(axis=1),
    # data.mean(dim='entity'),
    # data.mean(dim='measurement'),
    # data.mean(dim='measurement').max(dim='entity'),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load, vstack, subtract

from xarray import DataArray

# Finally attach coordinate labels to each named axis: selection becomes
# label-based (.sel) and argmax can answer with a label (.idxmax), not a position.
bytes_in = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')
entity = load('entity.npy', allow_pickle=True)
data = DataArray(
    data=vstack([bytes_in, bytes_out]),
    dims=['measurement', 'entity'],
    coords={
        'measurement': ['bytes_in', 'bytes_out'],
        'entity': entity,
    },
)
print(
    # data,
    # data.sel(measurement='bytes_in'),
    # data.sel(measurement='bytes_in').argmax(),
    # data.sel(measurement='bytes_in').idxmax(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
pandas.DataFrame
from pandas import read_csv, set_option

set_option('display.max_rows', 4)  # keep demo output short
# pandas reads and types the whole file in one call; columns come out
# index-aligned, so arithmetic and idxmax need no manual bookkeeping.
df = read_csv('data.csv')
print(
    df,
    # df['bytes_in'],
    # df['bytes_in'].values,
    # df['bytes_in'] - df['bytes_out'],
    # (df['bytes_in'] - df['bytes_out']).max(),
    # df['entity'][(df['bytes_in'] - df['bytes_out']).idxmax()],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import read_csv, IndexSlice, set_option

set_option('display.max_rows', 4)
# Uncomment set_index/sort_index to move the business columns into a
# (sorted) MultiIndex — the .loc slices below only work once that's done.
df = (
    read_csv('data.csv', parse_dates=['date'])
    # .set_index(['date', 'datacenter', 'building', 'entity'])
    # .sort_index()
)
print(
    df,
    # df.loc['2020-01-03 9:00':'2020-01-03 10:00'],
    # df.loc[IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', ['wqn'], :]],
    # df.loc[IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import read_csv, IndexSlice, MultiIndex, DataFrame, set_option

set_option('display.max_rows', 4)
# Uncomment the pipeline steps to build a MultiIndex on *both* axes:
# rows = (date, datacenter, building, entity); columns = the original names
# split on '_' (e.g. bytes_in -> ('bytes', 'in')), plus a joined 'packets' pair.
df = (
    read_csv('data.csv', parse_dates=['date'])
    # .set_index(['date', 'datacenter', 'building', 'entity'])
    # .pipe(lambda df: df
    #     .set_axis(
    #         df.columns.str.split('_', expand=True),
    #         axis='columns'
    #     )
    # )
    # .pipe(lambda df: df.join(
    #     DataFrame({('packets', 'in'): 100, ('packets', 'out'): 100}, index=df.index)
    # ))
    # .sort_index()
)
print(
    df,
    # df.loc[
    #     IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]
    # ]['bytes'],
    # df.loc[
    #     IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]
    # ]['temp'],
    # df.loc[
    #     IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :],
    #     IndexSlice[:, 'in'],
    # ],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
“Alignment” of operations:
from collections import Counter
from numpy import arange
from pandas import Series, DataFrame

# What does `+` "align" on, across the type ladder?
# NOTE: uncomment exactly one x, y pair before running — otherwise the
# print below raises NameError (x and y are deliberately left undefined).
# x, y = 1, 2
# x, y = 1+2j, 3+4j
# x, y = Counter('aaaabbc'), Counter('aaaaabd')
# x, y = arange(3), arange(3)
# x, y = arange(3), arange(6).reshape(2, 3)
# x, y = Series({'a': 1, 'b': 20}), Series({'b': 21, 'c': 300})
# x = DataFrame({'a': [1, 2], 'b': [20, 21]}, index=[*'xy'])
# y = DataFrame({'b': [22, 24], 'c': [300, 301]}, index=[*'yz'])
# x = Series({'a': 10, 'b': 200})
# y = DataFrame({'a': [1, 2], 'b': [3, 4]}, index=[*'xy'])
print(
    'x'.center(10).center(40, '\N{box drawings light horizontal}'),
    f'{x}',
    'y'.center(10).center(40, '\N{box drawings light horizontal}'),
    f'{y}',
    'x + y'.center(10).center(40, '\N{box drawings light horizontal}'),
    f'{x + y}',
    sep='\n',
)
from pandas import read_csv, IndexSlice, MultiIndex, DataFrame, Series, set_option

set_option('display.max_rows', 4)
# Full MultiIndex on both axes (same construction as the previous cell).
df = (
    read_csv('data.csv', parse_dates=['date'])
    .set_index(['date', 'datacenter', 'building', 'entity'])
    .pipe(lambda df: df
        .set_axis(
            df.columns.str.split('_', expand=True),
            axis='columns'
        )
    )
    .pipe(lambda df: df.join(
        DataFrame({('packets', 'in'): 100, ('packets', 'out'): 100}, index=df.index)
    ))
    .sort_index()
)
# A Series keyed by column tuples aligns on *columns*; the DataFrame variant
# aligns on columns *and* its 'building' index level simultaneously.
# factor = Series({
#     ('bytes', 'in'): .5,
#     ('bytes', 'out'): .75,
#     ('packets', 'in'): .5,
#     ('packets', 'out'): .75,
# })
# factor = DataFrame({
#     ('bytes', 'in'): [ .5, .65],
#     ('bytes', 'out'): [.75, .85],
#     ('packets', 'in'): [ .5, .65],
#     ('packets', 'out'): [.75, .85],
# }, index=['wqn0', 'hib3']).rename_axis('building')
print(
    df,
    # factor,
    # df.loc[
    #     IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]
    # ][['bytes', 'packets']] * factor,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
.index
from pandas import read_pickle, Grouper, IndexSlice, set_option

# FIX: removed a stray '!' that had fused onto this cell's import from the
# preceding heading — it made the cell a SyntaxError as extracted.
set_option('display.max_rows', 4)
# Capstone: the pickled dataset with the full (date, datacenter, building,
# entity) row MultiIndex and split ('bytes'|'temp', 'in'|'out'|...) columns.
data = (
    read_pickle('data.pickle')
    .set_index(['date', 'datacenter', 'building', 'entity'])
    .pipe(lambda df: df
        .set_axis(
            df.columns.str.split('_', expand=True),
            axis='columns'
        )
    )
    .sort_index()
)
print(
    # # data
    # # ['bytes'].sum(axis='columns')
    # # [lambda s: s > s.mean() + s.std()/2]
    # ,
    # look at only WQN
    # (wqn_data :=
    #     data
    #     # .loc[IndexSlice[:, ['wqn'], :]]
    # )
    # .pipe(lambda df: df
    #     .pipe(lambda df: df
    #         # # compute the daily total throughput (bytes in + bytes out)
    #         # # ['bytes'].sum(axis='columns')
    #         # # .groupby(Grouper(level='date', freq='1D')).sum()
    #         # # .to_frame(name='throughput')
    #     )
    #     # .pipe(lambda df:
    #     #     # # identify the maximum sequence of consecutive days for which
    #     #     # # throughput is > μ + ½σ
    #     #     # (~(df['throughput'] > df['throughput'].mean() + df['throughput'].std()/2))
    #     #     # .cumsum()
    #     #     # .pipe(lambda s: s.groupby(s).transform(lambda g: g.size))
    #     #     # .pipe(lambda s: s[s == s.max()])
    #     #     # .pipe(lambda s:
    #     #     #     wqn_data[wqn_data.index.get_level_values('date').round('1D').isin(s.index)]
    #     #     # )
    #     # )
    # )
    # .pipe(lambda df: df
    #     # # identify which buildings see the greatest mean temperature
    #     # # within each 6H block
    #     # ['temp', 'case']
    #     # .groupby(grp := Grouper(level='date', freq='6H')).agg(
    #     #     lambda g: g.groupby(['building']).mean().idxmax()
    #     # )
    #     .pipe(lambda s: df
    #         # .groupby([grp, 'building'])
    #         # .filter(
    #         #     lambda g:
    #         #         s.loc[g.index.get_level_values('date')[0].round('6H')]
    #         #         == g.index.get_level_values('building')[0]
    #         # )
    #     )
    # )
    # .pipe(lambda df: df
    #     # # for each 1H window, find all the devices
    #     # # (restricted only to target dates/building) where
    #     # # case min temperature > 105% of board max temperature
    #     # # (FIX: comment said 110%, but the code multiplies by 1.05)
    #     # ['temp']
    #     # .groupby(Grouper(level='date', freq='1H'))
    #     # .apply(lambda g:
    #     #     {*
    #     #         g.groupby('entity').filter(
    #     #             lambda g: g['case'].min() > 1.05 * g['board'].max()
    #     #         ).index.get_level_values('entity')
    #     #     }
    #     # )
    # )
    # ,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)