seminars.fb

Seminar (Fri Dec 17): “Don’t Forget About the .index!”

Title    Don’t Forget About the .index!
Topic    Fluent Use of pandas & core concepts
Date     Fri Dec 17
Keywords pandas.Index, MultiIndex

Contents

Audience

These sessions are designed for a broad audience of non-software-engineers and software programmers of all backgrounds and skill levels.

Our expected audience should comprise attendees with a…

During this session, we will endeavour to guide our audience to developing…

Abstract

In a previous seminar, we discussed data analysis of tabular/columnar data sets using pandas.

In this seminar, we will take a close look at what makes pandas special: the index! We’ll look at how a pandas.Series can be conceptualised as a mathematical “restricted computation domain” similar to the numpy.ndarray, with some business information attached: a coördinate system or indexing system. We’ll talk about how a pandas.DataFrame is not a two-dimensional data structure but a doubly-indexed collection of index-aligned one-dimensional data. We’ll discuss the mechanisms employed by the Index as part of .loc operations, and the core concepts of implicit/explicit hierarchy and monotonicity.
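
As a minimal sketch of that framing (with made-up values):

from pandas import Series, DataFrame

# a Series is (conceptually) a one-dimensional ndarray plus a labelling Index
s = Series([10, 20, 30], index=['a', 'b', 'c'])
print(s.values)          # the underlying one-dimensional data
print(s.index)           # the attached coördinate system
print(s.loc['b'])        # label-based lookup via the Index: 20

# a DataFrame is a doubly-indexed collection of index-aligned Series
df = DataFrame({'x': s, 'y': s * 2})
print(df.loc['b', 'y'])  # row Index and column Index together: 40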

Agenda:

What’s Next?

Did you enjoy this seminar? Did you learn something new that will help you use pandas more effectively as you do more analysis in your work?

In a future seminar, we may dive deeper into the mechanics of pandas. We can:

Notes

print("Let's get started!")

pure Python

from itertools import islice

with open('data.csv') as f:
    for line in islice(f, 5):
        print(f'{line = }')
from itertools import islice
from csv import reader

bytes_in = []
bytes_out = []
with open('data.csv') as f:
    for fields in reader(islice(f, 1, None)):  # islice skips the header row
        bytes_in.append(int(fields[4]))
        bytes_out.append(int(fields[5]))

print(
    #  f'{bytes_in[:5]   = }…',
    #  f'{bytes_out[:5]  = }…',
    #  f'{max(bytes_in)  = :,}',
    #  f'{max(bytes_out) = :,}',
    #  f'{max(x - y for x, y in zip(bytes_out, bytes_in)) = :,}',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import reader
from datetime import datetime

entries = []
with open('data.csv') as f:
    next(f)  # skip the header row
    for fields in reader(f):
        date, entity, _, _, bytes_in, bytes_out, temp_board, temp_case = fields
        date = datetime.fromisoformat(date)
        bytes_in, bytes_out = int(bytes_in), int(bytes_out) 
        temp_board, temp_case = float(temp_board), float(temp_case)
        ent = date, entity, bytes_in, bytes_out, temp_board, temp_case
        entries.append(ent) 

print(
    #  f'{entries[0]   = }',
    #  f'{max(entries) = }',
    #  f'{max(entries, key=lambda ent: ent[2]) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import reader
from datetime import datetime
from collections import namedtuple


class Entry(namedtuple('Entry', 'date entity bytes_in bytes_out temp_board temp_case')):
    @classmethod
    def from_csv(cls, fields):
        date, entity, _, _, bytes_in, bytes_out, temp_board, temp_case = fields
        date = datetime.fromisoformat(date)
        bytes_in, bytes_out = int(bytes_in), int(bytes_out) 
        temp_board, temp_case = float(temp_board), float(temp_case)
        return cls(date, entity, bytes_in, bytes_out, temp_board, temp_case)

entries = []
with open('data.csv') as f:
    next(f)  # skip the header row
    for fields in reader(f):
        entries.append(Entry.from_csv(fields)) 

print(
    #  f'{entries[0]   = }',
    #  f'{max(entries) = }',
    #  f'{max(entries, key=lambda ent: ent.bytes_in)  = }',
    #  f'{max(entries, key=lambda ent: ent.bytes_out) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import DictReader
from datetime import datetime

entries = []
with open('data.csv') as f:
    for ent in DictReader(f):
        for field in 'bytes_in bytes_out'.split():
            ent[field] = int(ent[field])
        for field in 'temp_board temp_case'.split():
            ent[field] = float(ent[field])
        for field in {'date'}:
            ent[field] = datetime.fromisoformat(ent[field])
        entries.append(ent) 

print(
    #  f'{entries[0] = }',
    #  f'{max(entries, key=lambda ent: ent["bytes_in"]) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import DictReader
from datetime import datetime, date
from collections import defaultdict

entries = []
with open('data.csv') as f:
    for ent in DictReader(f):
        for field in 'bytes_in bytes_out'.split():
            ent[field] = int(ent[field])
        for field in 'temp_board temp_case'.split():
            ent[field] = float(ent[field])
        for field in {'date'}:
            ent[field] = datetime.fromisoformat(ent[field])
        entries.append(ent) 

by_day = defaultdict(list)
for ent in entries:
    by_day[ent['date'].date()].append(ent['bytes_in'])

print(
    #  f'{by_day[date(2020, 1, 2)] = }',
    #  f'{({k: max(v) for k, v in by_day.items()}) = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from csv import DictReader
from datetime import datetime, date
from collections import defaultdict, Counter

entries = []
with open('data.csv') as f:
    for ent in DictReader(f):
        for field in 'bytes_in bytes_out'.split():
            ent[field] = int(ent[field])
        for field in 'temp_board temp_case'.split():
            ent[field] = float(ent[field])
        for field in {'date'}:
            ent[field] = datetime.fromisoformat(ent[field])
        entries.append(ent) 

by_day = defaultdict(Counter)
for ent in entries:
    by_day[ent['date'].date()][ent['entity']] += ent['bytes_in']

print(
    #  f'{by_day[date(2020, 1, 2)] = }',
    #  f'{by_day[date(2020, 1, 2)] - by_day[date(2020, 1, 1)] = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from collections import Counter
day1 = Counter({'abc': 10, 'def': 200, 'xyz': 3_000})
day2 = Counter({'abc': 13, 'def': 234, 'xyz': 3_123})
day3 = Counter({'abc': 17, 'def': 256})

print(
    #  f'{day2 - day1 = }',
    #  f'{day3 - day2 = }',
    #  f'{day2 + day3 = }',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
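
An aside on the semantics at play here: Counter’s binary subtraction drops zero and negative counts, which is why an entity missing from the newer day (like 'xyz' in day3 above) silently disappears from day3 - day2; the in-place .subtract method keeps non-positive counts instead.

from collections import Counter

day2 = Counter({'abc': 13, 'xyz': 3_123})
day3 = Counter({'abc': 17})

print(day3 - day2)   # Counter({'abc': 4}); the negative 'xyz' count is dropped

d = Counter(day3)
d.subtract(day2)     # in-place subtraction keeps non-positive counts
print(d)             # Counter({'abc': 4, 'xyz': -3123})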

the numpy.ndarray

from numpy import load, argmax

bytes_in  = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')
entity = load('entity.npy', allow_pickle=True)

print(
    #  f'{bytes_in.shape[0]    = :,}',
    #  f'{bytes_out.shape[0]   = :,}',
    #  f'{bytes_in - bytes_out = :}',
    #  f'{(bytes_in - bytes_out).max() = :,}',
    #  f'{(bytes_in - bytes_out).min() = :,}',
    #  f'{entity[argmax(bytes_out - bytes_in)] = :}',
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
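
One caveat that motivates everything that follows: ndarray operations align purely by position, so nothing protects us if the two arrays come from differently-ordered sources. A small sketch with made-up values:

from numpy import array

bytes_in  = array([10, 20, 30])   # ordered: dev-a, dev-b, dev-c
bytes_out = array([31, 11, 21])   # ordered: dev-c, dev-a, dev-b (oops!)

# numpy subtracts element-by-element; the misalignment is silent
print(bytes_in - bytes_out)       # [-21   9   9]: meaningless values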

the xarray.DataArray

from numpy import load, vstack, subtract
from xarray import DataArray

bytes_in  = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')

data = DataArray(
    data=vstack([bytes_in, bytes_out]),
)

print(
    #  data,
    #  subtract(data[0], data[1]),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load, vstack, subtract
from xarray import DataArray

bytes_in  = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')

data = DataArray(
    data=vstack([bytes_in, bytes_out]),
    dims=['measurement', 'entity'],
)

print(
    #  data,
    #  data.mean(axis=1),
    #  data.mean(dim='entity'),
    #  data.mean(dim='measurement'),
    #  data.mean(dim='measurement').max(dim='entity'),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load, vstack, subtract
from xarray import DataArray

bytes_in  = load('bytes_in.npy')
bytes_out = load('bytes_out.npy')
entity = load('entity.npy', allow_pickle=True)

data = DataArray(
    data=vstack([bytes_in, bytes_out]),
    dims=['measurement', 'entity'],
    coords={
        'measurement': ['bytes_in', 'bytes_out'],
        'entity': entity,
    },
)

print(
    #  data,
    #  data.sel(measurement='bytes_in'),
    #  data.sel(measurement='bytes_in').argmax(),
    #  data.sel(measurement='bytes_in').idxmax(),
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
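
For reference, .sel selects by coordinate label while .isel selects by integer position; a minimal sketch with made-up data:

from xarray import DataArray

da = DataArray(
    data=[[1, 2, 3], [4, 5, 6]],
    dims=['measurement', 'entity'],
    coords={'measurement': ['in', 'out'], 'entity': ['a', 'b', 'c']},
)

print(da.sel(measurement='in', entity='b').item())   # label-based: 2
print(da.isel(measurement=0, entity=1).item())       # position-based: 2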

the pandas.DataFrame

from pandas import read_csv, set_option
set_option('display.max_rows', 4)

df = read_csv('data.csv')

print(
    df,
    #  df['bytes_in'],
    #  df['bytes_in'].values,
    #  df['bytes_in'] - df['bytes_out'],
    #  (df['bytes_in'] - df['bytes_out']).max(),
    #  df['entity'][(df['bytes_in'] - df['bytes_out']).idxmax()],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
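
A detail worth pausing on: Series.idxmax returns the label from the .index (which is what makes the df['entity'][...] lookup above work), whereas positional approaches return an offset. A minimal sketch with made-up values:

from pandas import Series

s = Series([5, 50, 7], index=[100, 200, 300])

print(s.idxmax())         # 200: the *label* of the maximum
print(s.values.argmax())  # 1:   the *position* of the maximum
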
from pandas import read_csv, IndexSlice, set_option
set_option('display.max_rows', 4)

df = (
    read_csv('data.csv', parse_dates=['date'])
   #  .set_index(['date', 'datacenter', 'building', 'entity'])
   #  .sort_index()
)

print(
    df,
    #  df.loc['2020-01-03 9:00':'2020-01-03 10:00'],
    #  df.loc[IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', ['wqn'], :]],
    #  df.loc[IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
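
Note that the .sort_index() above is not cosmetic: label-based slicing on a MultiIndex generally requires the index to be lexicographically sorted, and pandas raises UnsortedIndexError (a subclass of KeyError) otherwise. A minimal sketch with made-up data:

from pandas import MultiIndex, Series

idx = MultiIndex.from_tuples([('b', 2), ('a', 1), ('a', 2)])
s = Series([1, 2, 3], index=idx)

print(s.index.is_monotonic_increasing)   # False: slicing will complain

try:
    print(s.loc['a':'b'])
except KeyError as e:
    print(f'cannot slice: {e}')

print(s.sort_index().loc['a':'b'])       # monotonic index: slicing works
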
from pandas import read_csv, IndexSlice, MultiIndex, DataFrame, set_option
set_option('display.max_rows', 4)

df = (
    read_csv('data.csv', parse_dates=['date'])
  #   .set_index(['date', 'datacenter', 'building', 'entity'])
  #   .pipe(lambda df: df
  #       .set_axis(
  #            df.columns.str.split('_', expand=True),
  #            axis='columns'
  #        )
  #    )
  #   .pipe(lambda df: df.join(
  #       DataFrame({('packets', 'in'): 100, ('packets', 'out'): 100}, index=df.index)
  #  ))
  #   .sort_index()
)

print(
    df,
    #  df.loc[
    #      IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]
    #  ]['bytes'],
    #  df.loc[
    #      IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]
    #  ]['temp'],
    #  df.loc[
    #      IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :],
    #      IndexSlice[:, 'in'],
    #  ],
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)

“Alignment” of operations:

from collections import Counter
from numpy import arange
from pandas import Series, DataFrame

#  uncomment one (x, y) pair at a time:
x, y = 1, 2

#  x, y = 1+2j, 3+4j

#  x, y = Counter('aaaabbc'), Counter('aaaaabd')

#  x, y = arange(3), arange(3)
#  x, y = arange(3), arange(6).reshape(2, 3)

#  x, y = Series({'a': 1, 'b': 20}), Series({'b': 21, 'c': 300})

#  x = DataFrame({'a': [1, 2], 'b': [20, 21]}, index=[*'xy'])
#  y = DataFrame({'b': [22, 24], 'c': [300, 301]}, index=[*'yz'])

#  x = Series({'a': 10, 'b': 200})
#  y = DataFrame({'a': [1, 2], 'b': [3, 4]}, index=[*'xy'])

print(
    'x'.center(10).center(40, '\N{box drawings light horizontal}'),
    f'{x}',
    'y'.center(10).center(40, '\N{box drawings light horizontal}'),
    f'{y}',
    'x + y'.center(10).center(40, '\N{box drawings light horizontal}'),
    f'{x + y}',
    sep='\n',
)
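
What the pandas cases above will show, in summary: binary operations align on labels first, taking the union of the two indices and producing NaN wherever a label is present on only one side; the method forms (.add, .sub, …) accept a fill_value to substitute a default instead. A small sketch:

from pandas import Series

x = Series({'a': 1, 'b': 20})
y = Series({'b': 21, 'c': 300})

print(x + y)                   # a: NaN, b: 41.0, c: NaN (union of labels)
print(x.add(y, fill_value=0))  # a: 1.0, b: 41.0, c: 300.0
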
from pandas import read_csv, IndexSlice, MultiIndex, DataFrame, Series, set_option
set_option('display.max_rows', 4)

df = (
    read_csv('data.csv', parse_dates=['date'])
   .set_index(['date', 'datacenter', 'building', 'entity'])
   .pipe(lambda df: df
       .set_axis(
            df.columns.str.split('_', expand=True),
            axis='columns'
        )
    )
   .pipe(lambda df: df.join(
       DataFrame({('packets', 'in'): 100, ('packets', 'out'): 100}, index=df.index)
  ))
   .sort_index()
)

#  factor = Series({
#      ('bytes',    'in'):  .5,
#      ('bytes',   'out'): .75,
#      ('packets',  'in'):  .5,
#      ('packets', 'out'): .75,
#  })

#  factor = DataFrame({
#      ('bytes',    'in'): [ .5, .65],
#      ('bytes',   'out'): [.75, .85],
#      ('packets',  'in'): [ .5, .65],
#      ('packets', 'out'): [.75, .85],
#  }, index=['wqn0', 'hib3']).rename_axis('building')

print(
    df,
    #  factor,
    #  df.loc[
    #      IndexSlice['2020-01-03 9:00':'2020-01-03 10:00', :, :, ['bvali.net', 'cbtjw.net'], :]
    #  ][['bytes', 'packets']] * factor,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
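
The Series version of the factor above works because binary operations between a DataFrame and a Series align the Series’ index against the DataFrame’s columns (broadcasting down the rows); to align against the row index instead, pass axis='index' to the method form. A minimal sketch with made-up values:

from pandas import DataFrame, Series

df = DataFrame({'a': [1, 2], 'b': [10, 20]}, index=['x', 'y'])

col_factor = Series({'a': 2, 'b': 3})
print(df * col_factor)                    # aligns on the columns

row_factor = Series({'x': 10, 'y': 100})
print(df.mul(row_factor, axis='index'))   # aligns on the row index instead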

don’t forget about the .index!

from pandas import read_pickle, Grouper, IndexSlice, set_option
set_option('display.max_rows', 4)

data = (
    read_pickle('data.pickle')
   .set_index(['date', 'datacenter', 'building', 'entity'])
   .pipe(lambda df: df
       .set_axis(
            df.columns.str.split('_', expand=True),
            axis='columns'
        )
    )
   .sort_index()
)

print(
    #  #  data
    #  #  ['bytes'].sum(axis='columns')
    #  #  [lambda s: s > s.mean() + s.std()/2]
    #  ,

    # look at only WQN
    #  (wqn_data :=
    #      data
    #      #  .loc[IndexSlice[:, ['wqn'], :]]
    #  )
    #  .pipe(lambda df: df
    #      .pipe(lambda df: df
    #          #  # compute the daily total throughput (bytes in + bytes out)
    #          #  ['bytes'].sum(axis='columns')
    #          #  .groupby(Grouper(level='date', freq='1D')).sum()
    #          #  .to_frame(name='throughput')
    #      )
    #      #  .pipe(lambda df:
    #      #      #  # identify the maximum sequence of consecutive days for which
    #      #      #  #   throughput is > μ + ½σ
    #      #      #  (~(df['throughput'] > df['throughput'].mean() + df['throughput'].std()/2))
    #      #      #  .cumsum()
    #      #      #  .pipe(lambda s: s.groupby(s).transform(lambda g: g.size))
    #      #      #  .pipe(lambda s: s[s == s.max()])
    #      #      #  .pipe(lambda s:
    #      #      #      wqn_data[wqn_data.index.get_level_values('date').round('1D').isin(s.index)]
    #      #      #  )
    #      #  )
    #  )
    #  .pipe(lambda df: df
    #      #  # identify which buildings see the greatest mean temperature
    #      #  #   within each 6H block
    #      #  ['temp', 'case']
    #      #  .groupby(grp := Grouper(level='date', freq='6H')).agg(
    #      #      lambda g: g.groupby(['building']).mean().idxmax()
    #      #  )
    #      .pipe(lambda s: df
    #          #  .groupby([grp, 'building'])
    #          #  .filter(
    #          #      lambda g:
    #          #       s.loc[g.index.get_level_values('date')[0].round('6H')]
    #          #          == g.index.get_level_values('building')[0]
    #          #  )
    #      )
    #  )
    #  .pipe(lambda df: df
    #      #  # for each 1H window, find all the devices
    #      #  #   (restricted only to target dates/building) where
    #      #  #   case min temperature > 105% of board max temperature
    #      #  ['temp']
    #      #  .groupby(Grouper(level='date', freq='1H'))
    #      #  .apply(lambda g:
    #      #      {*
    #      #          g.groupby('entity').filter(
    #      #              lambda g: g['case'].min() > 1.05 * g['board'].max()
    #      #          ).index.get_level_values('entity')
    #      #      }
    #      #  )
    #  )
    #  ,
    sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
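
One trick in the middle of the pipeline above deserves an isolated look: cumulatively summing the negated condition assigns a shared label to each run of consecutive True values, which a groupby can then measure (the pipeline uses the run sizes; here, as a variant, we count the True values per run). A minimal sketch with made-up values:

from pandas import Series

cond = Series([False, True, True, False, True, True, True])

run_id = (~cond).cumsum()   # increments at every False, so each run of
                            #   consecutive True values shares one label
print(run_id.tolist())      # [1, 1, 1, 2, 2, 2, 2]

# count the True values within each labelled run
print(cond.groupby(run_id).sum().max())   # 3: the longest consecutive run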