Theme: Data Analysis & Engineering
Topic: “How Python, numpy, and pandas ‘Fit’ Together”
Date: Monday, October 12, 2020
Problem statement: process traces.jsonl (one JSON trace record per line).
from json import loads
from collections import defaultdict, namedtuple
from datetime import datetime
class Cpu(namedtuple('CpuBase', 'avg1min avg5min avg10min')):
    """CPU load-average sample: 1-, 5-, and 10-minute averages."""
    @classmethod
    def from_json(cls, json):
        """Build a Cpu from a 3-element JSON sequence [1min, 5min, 10min]."""
        fields = list(json)
        return cls(*fields)
class Mem(namedtuple('MemBase', 'used total')):
    """Physical-memory sample: amount used vs. total capacity."""
    @property
    def pct(self):
        # Fraction of memory in use; mirrors Swap.pct for consistency
        # (original class lacked it, leaving the two siblings asymmetric).
        # NOTE: raises ZeroDivisionError if total == 0.
        return self.used / self.total
    @classmethod
    def from_json(cls, json):
        """Build a Mem from a {'used': ..., 'total': ...} mapping.

        Raises ValueError if used exceeds total (corrupt record).
        """
        used = json['used']
        total = json['total']
        if used > total:
            # Was a bare ValueError(); include context for debuggability.
            raise ValueError(f'mem used ({used}) exceeds total ({total})')
        return cls(used, total)
class Swap(namedtuple('SwapBase', 'used total')):
    """Swap-space sample: amount used vs. total capacity."""
    @property
    def pct(self):
        """Fraction of swap currently in use."""
        return self.used / self.total
    @classmethod
    def from_json(cls, json):
        """Build a Swap from a {'used': ..., 'total': ...} mapping.

        Raises ValueError if used exceeds total.
        """
        if json['used'] > json['total']:
            raise ValueError()
        return cls(json['used'], json['total'])
class Trace(namedtuple('TraceBase', 'timestamp node cpu mem swap')):
    """One monitoring sample for one node at one timestamp."""
    @classmethod
    def from_json(cls, json):
        """Parse one decoded JSON record into a Trace.

        Expects keys 'timestamp' (ISO-8601 string), 'node', 'cpu',
        'mem', 'swap'. Raises ValueError on malformed mem/swap data.
        """
        timestamp = datetime.fromisoformat(json['timestamp'])
        node = json['node']
        cpu = Cpu.from_json(json['cpu'])
        # BUG FIX: was `mem = json['mem']`, storing the raw dict —
        # inconsistent with cpu/swap, which are parsed into their types.
        mem = Mem.from_json(json['mem'])
        swap = Swap.from_json(json['swap'])
        return cls(timestamp, node, cpu, mem, swap)
# Group traces by node name: node -> list of Trace records.
nodes = defaultdict(list)
with open('traces.jsonl') as f:
    for line in f:
        trace = Trace.from_json(loads(line))
        # BUG FIX: the original loop printed the first trace and broke
        # out immediately, so `nodes` was never populated and the
        # per-node loops below iterated an empty dict.
        nodes[trace.node].append(trace)
# Per node, verify samples arrive on a ~60-second cadence: report any
# consecutive pair more than 61s or less than 59s apart, then show the
# node's observed time range.
for n, traces in sorted(nodes.items()):
    sorted_traces = sorted(traces, key=lambda t: t.timestamp)
    for earlier, later in zip(sorted_traces, sorted_traces[1:]):
        diff = later.timestamp - earlier.timestamp
        # Both out-of-band branches printed the same message, so they
        # collapse into one range test.
        if not 59 <= diff.total_seconds() <= 61:
            print(f'{diff.total_seconds()= }')
    print(f'{n = } {sorted_traces[0].timestamp} ~ {sorted_traces[-1].timestamp}')
from itertools import tee, islice
def nwise(g, n=2):
    """Yield overlapping n-tuples from iterable *g* (pairwise generalized).

    nwise('abcd') -> ('a','b'), ('b','c'), ('c','d')
    nwise(xs, 3)  -> sliding windows of width 3.
    Yields nothing when g has fewer than n items.
    """
    # Was a lambda bound to a name (PEP 8 E731) whose inner genexp
    # shadowed the parameter `g`; a def with distinct names is clearer.
    copies = tee(g, n)
    return zip(*(islice(it, i, None) for i, it in enumerate(copies)))
# Per node, flag any run of 5 consecutive samples where swap usage
# stayed above 75%, printing the window's start and end timestamps.
for n, traces in sorted(nodes.items()):
    ordered = sorted(traces, key=lambda t: t.timestamp)
    for window in nwise(ordered, 5):
        if all(t.swap.pct > .75 for t in window):
            print(f'{n = } {window[0].timestamp} {window[-1].timestamp}')
Problem statement:
traces.jsonl
# Demo: build 10 million random ints in a plain Python list and total
# them with the built-in sum() — the pure-Python baseline for the numpy
# comparison below.
from random import randint
xs = [randint(0, 100) for _ in range(10_000_000)]
print(f'{sum(xs) = }')
# Demo: the same data as a numpy ndarray.
from numpy import array
xs = array(xs)  # converts the list built by the preceding randint demo
# Built-in sum() still iterates the array element by element in Python;
# ndarray.sum() performs the reduction inside numpy.
print(f'{sum(xs) = }')
print(f'{xs.sum() = }')
# Demo: elementwise arithmetic and fixed-width integer wraparound.
from numpy import array
xs = array([1, 2, 3])
# Broadcasting: the scalar 2 multiplies every element.
print(f'{xs * 2 = }')
# FIX: array([1, 2, 160], dtype='int8') raises OverflowError on
# NumPy >= 2.0 because 160 exceeds int8's max of 127. astype('int8')
# performs the same wrapping cast explicitly (160 -> -96), keeping the
# intended overflow demonstration working on modern NumPy.
xs = array([1, 2, 160]).astype('int8')
ys = array([4, 5, 160]).astype('int8')
# -96 + -96 wraps again: -192 mod 256 == 64.
print(f'{xs + ys = }')
print(f'{xs.dtype = }')
# Collect node 'a' 1-minute CPU load averages from traces.jsonl and
# summarize them with numpy's vectorized statistics.
from json import loads
from numpy import array
with open('traces.jsonl') as f:
    # Walrus binds each decoded record once, filtering to node 'a'.
    cpu_1min_avg = array([
        record['cpu'][0]
        for line in f
        if (record := loads(line))['node'] == 'a'
    ])
print(f'{cpu_1min_avg.mean() = }')
print(f'{cpu_1min_avg.var() = }')
print(f'{cpu_1min_avg.std() = }')
print(f'{cpu_1min_avg[:-1] - cpu_1min_avg[1:] = }')
# from pandas import array
from pandas import DataFrame, to_datetime
from json import loads
from numpy import array
from collections import defaultdict
# Build per-node columns: node -> list of 1-minute CPU load averages,
# plus node -> list of raw timestamp strings.
cpu_1min_avg = defaultdict(list)
timestamps = defaultdict(list)
with open('traces.jsonl') as f:
    for raw in f:
        record = loads(raw)
        key = record['node']
        timestamps[key].append(record['timestamp'])
        cpu_1min_avg[key].append(record['cpu'][0])
# One DataFrame column per node; requires equal-length lists per node.
cpu_1min_avg = DataFrame(cpu_1min_avg)
print(cpu_1min_avg)