Theme: Data Analysis & Engineering
Topic: “How Python, numpy, and pandas ‘Fit’ Together”
Date: Monday, October 12, 2020
Problem statement: process traces.jsonl (one JSON trace record per line).
from json import loads
from collections import defaultdict, namedtuple
from datetime import datetime
class Cpu(namedtuple('CpuBase', 'avg1min avg5min avg10min')):
    """CPU load-average sample: 1-, 5-, and 10-minute averages."""
    @classmethod
    def from_json(cls, json):
        """Build a Cpu from a 3-element JSON sequence [1min, 5min, 10min]."""
        fields = list(json)
        return cls(*fields)
class Mem(namedtuple('MemBase', 'used total')):
    """Physical-memory sample: amount used vs. total capacity."""
    @property
    def pct(self):
        # Fraction of memory in use; mirrors Swap.pct for consistency
        # (original class lacked it, leaving the two siblings asymmetric).
        # NOTE: raises ZeroDivisionError if total == 0.
        return self.used / self.total
    @classmethod
    def from_json(cls, json):
        """Build a Mem from a {'used': ..., 'total': ...} mapping.

        Raises ValueError if used exceeds total (corrupt record).
        """
        used = json['used']
        total = json['total']
        if used > total:
            # Was a bare ValueError(); include context for debuggability.
            raise ValueError(f'mem used ({used}) exceeds total ({total})')
        return cls(used, total)
class Swap(namedtuple('SwapBase', 'used total')):
    """Swap-space sample: amount used vs. total capacity."""
    @property
    def pct(self):
        """Fraction of swap currently in use."""
        return self.used / self.total
    @classmethod
    def from_json(cls, json):
        """Build a Swap from a {'used': ..., 'total': ...} mapping.

        Raises ValueError if used exceeds total.
        """
        if json['used'] > json['total']:
            raise ValueError()
        return cls(json['used'], json['total'])
class Trace(namedtuple('TraceBase', 'timestamp node cpu mem swap')):
    """One monitoring sample for one node at one timestamp."""
    @classmethod
    def from_json(cls, json):
        """Parse one decoded JSON record into a Trace.

        Expects keys 'timestamp' (ISO-8601 string), 'node', 'cpu',
        'mem', 'swap'. Raises ValueError on malformed mem/swap data.
        """
        timestamp = datetime.fromisoformat(json['timestamp'])
        node = json['node']
        cpu = Cpu.from_json(json['cpu'])
        # BUG FIX: was `mem = json['mem']`, storing the raw dict —
        # inconsistent with cpu/swap, which are parsed into their types.
        mem = Mem.from_json(json['mem'])
        swap = Swap.from_json(json['swap'])
        return cls(timestamp, node, cpu, mem, swap)
# Group traces by node name: node -> list of Trace records.
nodes = defaultdict(list)
with open('traces.jsonl') as f:
    for line in f:
        trace = Trace.from_json(loads(line))
        # BUG FIX: the original loop printed the first trace and broke
        # out immediately, so `nodes` was never populated and the
        # per-node loops below iterated an empty dict.
        nodes[trace.node].append(trace)
# Per node, verify samples arrive on a ~60-second cadence: report any
# consecutive pair more than 61s or less than 59s apart, then show the
# node's observed time range.
for n, traces in sorted(nodes.items()):
    sorted_traces = sorted(traces, key=lambda t: t.timestamp)
    for earlier, later in zip(sorted_traces, sorted_traces[1:]):
        diff = later.timestamp - earlier.timestamp
        # Both out-of-band branches printed the same message, so they
        # collapse into one range test.
        if not 59 <= diff.total_seconds() <= 61:
            print(f'{diff.total_seconds()= }')
    print(f'{n = } {sorted_traces[0].timestamp} ~ {sorted_traces[-1].timestamp}')
from itertools import tee, islice
def nwise(g, n=2):
    """Yield overlapping n-tuples from iterable *g* (pairwise generalized).

    nwise('abcd') -> ('a','b'), ('b','c'), ('c','d')
    nwise(xs, 3)  -> sliding windows of width 3.
    Yields nothing when g has fewer than n items.
    """
    # Was a lambda bound to a name (PEP 8 E731) whose inner genexp
    # shadowed the parameter `g`; a def with distinct names is clearer.
    copies = tee(g, n)
    return zip(*(islice(it, i, None) for i, it in enumerate(copies)))
# Per node, flag any run of 5 consecutive samples where swap usage
# stayed above 75%, printing the window's start and end timestamps.
for n, traces in sorted(nodes.items()):
    ordered = sorted(traces, key=lambda t: t.timestamp)
    for window in nwise(ordered, 5):
        if all(t.swap.pct > .75 for t in window):
            print(f'{n = } {window[0].timestamp} {window[-1].timestamp}')
Problem statement:
traces.jsonl
# Demo: build 10 million random ints in a plain Python list and total
# them with the built-in sum() — the pure-Python baseline for the numpy
# comparison below.
from random import randint
xs = [randint(0, 100) for _ in range(10_000_000)]
print(f'{sum(xs) = }')
# Demo: the same data as a numpy ndarray.
from numpy import array
xs = array(xs)  # converts the list built by the preceding randint demo
# Built-in sum() still iterates the array element by element in Python;
# ndarray.sum() performs the reduction inside numpy.
print(f'{sum(xs) = }')
print(f'{xs.sum() = }')
# Demo: elementwise arithmetic and fixed-width integer wraparound.
from numpy import array
xs = array([1, 2, 3])
# Broadcasting: the scalar 2 multiplies every element.
print(f'{xs * 2 = }')
# FIX: array([1, 2, 160], dtype='int8') raises OverflowError on
# NumPy >= 2.0 because 160 exceeds int8's max of 127. astype('int8')
# performs the same wrapping cast explicitly (160 -> -96), keeping the
# intended overflow demonstration working on modern NumPy.
xs = array([1, 2, 160]).astype('int8')
ys = array([4, 5, 160]).astype('int8')
# -96 + -96 wraps again: -192 mod 256 == 64.
print(f'{xs + ys = }')
print(f'{xs.dtype = }')
# Collect node 'a' 1-minute CPU load averages from traces.jsonl and
# summarize them with numpy's vectorized statistics.
from json import loads
from numpy import array
with open('traces.jsonl') as f:
    # Walrus binds each decoded record once, filtering to node 'a'.
    cpu_1min_avg = array([
        record['cpu'][0]
        for line in f
        if (record := loads(line))['node'] == 'a'
    ])
print(f'{cpu_1min_avg.mean() = }')
print(f'{cpu_1min_avg.var() = }')
print(f'{cpu_1min_avg.std() = }')
print(f'{cpu_1min_avg[:-1] - cpu_1min_avg[1:] = }')
# from pandas import array
from pandas import DataFrame, to_datetime
from json import loads
from numpy import array
from collections import defaultdict
# Build per-node columns: node -> list of 1-minute CPU load averages,
# plus node -> list of raw timestamp strings.
cpu_1min_avg = defaultdict(list)
timestamps = defaultdict(list)
with open('traces.jsonl') as f:
    for raw in f:
        record = loads(raw)
        key = record['node']
        timestamps[key].append(record['timestamp'])
        cpu_1min_avg[key].append(record['cpu'][0])
# One DataFrame column per node; requires equal-length lists per node.
cpu_1min_avg = DataFrame(cpu_1min_avg)
print(cpu_1min_avg)