pandas
and statsmodels
for statistical analysis of large data sets)Title | “Give me your data analysis, stat!” |
Topic | using scipy , numpy , and statsmodels for statistical analysis of large data sets |
Date | Fri, Feb 11 2022 |
Time | 10am~11am PST |
Keywords | descriptive stats, inferential statistics, distributions, probability, modelling, statsmodels , scipy , numpy , pandas |
These sessions are designed for a broad audience of non-software engineers and software programmers of all backgrounds and skill-levels.
Our expected audience should comprise attendees with a…
numpy
, pandas
, scipy
usage)During this session, we will endeavor to guide our audience to developing…
In previous seminars, we have transformed, visualised, and analysed large data sets using pandas
. These analyses have been presented with a focus on the syntax and structure of the computational tools, rather than on the underlying analytical tasks.
In this seminar, we will focus on the analysis itself, and show how to do basic descriptive and inferential statistics tasks in Python using pandas
, numpy
, scipy
, and statsmodels
. We will discuss the motivation for this use of statistics, how to formulate and answer hypotheses, and how to do basic correleation and prediction.
Sample Agenda:
Did you enjoy this seminar? Did you learn something new that will help you as you analyse larger and larger data sets?
In a future seminar, we can go into greater depth on statistical modeling, inference, and prediction. We can also discuss how to approach statistics from the Bayesian approach, or tie our statistical & probability knowledge to topics in information theory.
We can discuss…
print("Let's get started!")
Question: “Can you give me the bottom line?” (Concisely describing a dataset with descriptive statistics: mean, median, mode, variance, standard deviation, skew, kurtosis.)
print("Let's take a look!")
from numpy import load
from pathlib import Path
data_dir = Path('data')
temperatures = load(data_dir / 'temperatures.npy')
print(
f'{temperatures[:3] = }',
f'{len(temperatures) = :,}',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load, median
from pathlib import Path
from scipy.stats import mode
data_dir = Path('data')
temps = load(data_dir / 'temps.npy')
print(
f'{temps.mean() = :.2f}',
f'{median(temps) = :.2f}',
f'{mode(temps) = :}',
sep='\n',
)
The mean, median, and mode are all measures of aggregate discrepancy from a central value.
from numpy import load, linspace, argmin, argmax, newaxis, median
from scipy.stats import mode
from pathlib import Path
data_dir = Path('data')
temps = load(data_dir / 'temps.npy')
m = linspace(70, 80, 1_000+1)
print(
'\n'.join([
f'{m[argmax((temps == m[..., newaxis]).sum(axis=-1))] = }',
f'{m[argmin((temps != m[..., newaxis]).sum(axis=-1))] = }',
f'{mode(temps) = :}',
]),
'\n'.join([
f'{m[argmin(abs(temps - m[..., newaxis]).sum(axis=-1))] = }',
f'{median(temps) = :.2f}',
]),
'\n'.join([
f'{m[argmin(((temps - m[..., newaxis]) ** 2).sum(axis=-1))] = }',
f'{temps.mean() = :.2f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import read_pickle, DataFrame
from matplotlib.pyplot import subplot_mosaic, show
from pathlib import Path
data_dir = Path('data')
i = read_pickle(data_dir / 'anscombe.i.pickle')
ii = read_pickle(data_dir / 'anscombe.ii.pickle')
iii = read_pickle(data_dir / 'anscombe.iii.pickle')
iv = read_pickle(data_dir / 'anscombe.iv.pickle')
_, axes = subplot_mosaic([['i', 'ii'], ['iii', 'iv']])
axes['i'].scatter(i.index, i)
axes['ii'].scatter(ii.index, ii)
axes['iii'].scatter(iii.index, iii)
axes['iv'].scatter(iv.index, iv)
show()
print(
DataFrame({
'i': i.apply(m := ['mean', 'std', 'skew', 'kurtosis']),
'ii': ii.apply(m),
'iii': iii.apply(m),
'iv': iv.apply(m),
}),
sep='\n',
)
Hausdorff’s Moment Problem.
from numpy import load
from pathlib import Path
from scipy.stats import skew, kurtosis
from pint import UnitRegistry
data_dir = Path('data')
temps = load(data_dir / 'temps.npy')
# temps = (temps * UnitRegistry().degF).to('kelvin')
print(
f'{temps.mean() = :.2f}',
'\n'.join([
f'{temps.var() = :.2f}',
f'{((temps - temps.mean())**2).mean() = :.2f}',
f'{temps.std() = :.2f}',
]),
'\n'.join([
f'{skew(temps) = :.2f}',
f'{kurtosis(temps, fisher=False) = :.2f}',
f'{((temps - temps.mean()) ** 3).mean() / temps.std() ** 3 = :.2f}',
f'{((temps - temps.mean()) ** 4).mean() / temps.std() ** 4 = :.2f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load
from pathlib import Path
from scipy.stats import skew, kurtosis
# (
# ((xs_side - xs.mean()) / xs.std()) ** n
# ).mean()
# *
# len(xs_side)/len(xs)
# lft: xs[xs < xs.mean()]
# rgt: xs[xs > xs.mean()]
# if n is even…
# lft_mmt + rgt_mmt
# abs(lft_mmt) + abs(rgt_mmt)
# if n is odd…
# lft_mmt + rgt_mmt
# -abs(lft_mmt) + abs(rgt_mmt)
# for n, odd: abs(rgt_mmt) - abs(lft_mmt)
# for n, even: abs(rgt_mmt) + abs(lft_mmt)
data_dir = Path('data')
temps = load(data_dir / 'temps.npy')
temps_lft = temps[temps < temps.mean()]
temps_rgt = temps[temps > temps.mean()]
print(
f'{temps.mean() = :.2f}',
'\n'.join([
f'{skew(temps) = :.2f}',
f'{kurtosis(temps, fisher=False) = :.2f}',
f'''skew … = {
-abs(
((temps_lft - temps.mean()) / temps.std()) ** 3
).mean() * len(temps_lft)/len(temps)
+
abs(
((temps_rgt - temps.mean()) / temps.std()) ** 3
).mean() * len(temps_rgt)/len(temps)
:>.2f}''',
f'''kurtosis … = {
abs(
((temps_lft - temps.mean()) / temps.std()) ** 4
).mean() * len(temps_lft)/len(temps)
+
abs(
((temps_rgt - temps.mean()) / temps.std()) ** 4
).mean() * len(temps_rgt)/len(temps)
:>.2f}''',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load
from numpy.random import default_rng
from pathlib import Path
from pandas import DataFrame, Series
rng = default_rng(0)
data_dir = Path('data')
temps_population = load(data_dir / 'temps.npy')
temps_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
print(
DataFrame({
'population': Series(temps_population)
.apply(stats := ['mean', 'var', 'skew', 'kurt', 'count'])
,
'sample': Series(temps_sample)
.apply(stats)
,
}).round(2),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Question: “So what am I looking at here?” (Visually describing a dataset with descriptive statistics: PDF, CDF, CMF, quantiles/per centiles.)
Follow-up:
print("Let's take a look!")
from numpy import load
from pathlib import Path
from matplotlib.pyplot import subplot_mosaic, show
data_dir = Path('data')
temps_before = load(data_dir / 'temps1.npy')
temps_after = load(data_dir / 'temps2.npy')
_, axes = subplot_mosaic([['before'], ['after']])
axes['before'].plot(temps_before)
axes['after'].plot(temps_after)
for title, ax in axes.items(): ax.set_title(title)
show()
from numpy import load
from pathlib import Path
from matplotlib.pyplot import subplot_mosaic, show
from pandas import DataFrame, Series
data_dir = Path('data')
temps_before = load(data_dir / 'temps1.npy')
temps_after = load(data_dir / 'temps2.npy')
_, axes = subplot_mosaic([['before'], ['after']])
axes['before'].hist(temps_before)
axes['after'].hist(temps_after)
for title, ax in axes.items(): ax.set_title(title)
show()
print(
DataFrame({
'before': Series(temps_before)
.apply(stats := ['mean', 'var', 'skew'])
,
'after': Series(temps_after)
.apply(stats)
,
}).round(2),
)
Probability Mass Function.
from numpy import load, unique
from pathlib import Path
from matplotlib.pyplot import subplot_mosaic, show
data_dir = Path('data')
temps_before = load(data_dir / 'temps1.npy')
temps_after = load(data_dir / 'temps2.npy')
before_values, before_counts = unique(temps_before, return_counts=True)
after_values, after_counts = unique(temps_after, return_counts=True)
_, axes = subplot_mosaic([['before'], ['after']])
axes['before'].plot(before_values, before_counts / before_counts.sum())
axes['after'].plot(after_values, after_counts / after_counts.sum())
for title, ax in axes.items(): ax.set_title(title)
show()
Probability Density Function.
from numpy import load, linspace
from pathlib import Path
from scipy.stats import gaussian_kde
from matplotlib.pyplot import subplot_mosaic, show
data_dir = Path('data')
temps_before = load(data_dir / 'temps1.npy')
temps_after = load(data_dir / 'temps2.npy')
_, axes = subplot_mosaic([['before'], ['after']])
axes['before'].plot(
sp := linspace(min(temps_before), max(temps_before), 100),
gaussian_kde(temps_before).evaluate(sp),
)
axes['after'].plot(
sp := linspace(min(temps_after), max(temps_after), 100),
gaussian_kde(temps_after).evaluate(sp),
)
for title, ax in axes.items(): ax.set_title(title)
show()
Cumulative Distribution Function.
from numpy import load, linspace, quantile
from pathlib import Path
from matplotlib.pyplot import subplot_mosaic, show
data_dir = Path('data')
temps_before = load(data_dir / 'temps1.npy')
temps_after = load(data_dir / 'temps2.npy')
_, axes = subplot_mosaic([['before'], ['after']])
axes['before'].hist(temps_before, density=True, cumulative=True, bins=100)
axes['after'].hist(temps_after, density=True, cumulative=True, bins=100)
axes['before'].axvline(quantile(temps_before, .95), color='red')
axes['before'].axvline(quantile(temps_before, .99), color='red')
axes['after'].axvline(quantile(temps_after, .95), color='red')
axes['after'].axvline(quantile(temps_after, .99), color='red')
for title, ax in axes.items(): ax.set_title(title)
print(
'\n'.join([
f'{quantile(temps_before, .95) = :.2f}',
f'{quantile(temps_before, .99) = :.2f}',
]),
'\n'.join([
f'{quantile(temps_after, .95) = :.2f}',
f'{quantile(temps_after, .99) = :.2f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
show()
Question: “How do I know I made a difference?” (Formulating & testing stastical hypotheses: p-values, Z-test, T-test.)
Follow-up:
print("Let's take a look!")
from numpy import load
from numpy.random import default_rng
from pathlib import Path
from pandas import DataFrame, Series
rng = default_rng(0)
data_dir = Path('data')
temps_population = load(data_dir / 'temps.npy')
temps_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
print(
DataFrame({
'population': Series(temps_population)
.apply(stats := ['mean', 'var', 'std', 'skew', 'kurt', 'count'])
,
'sample': Series(temps_sample)
.apply(stats)
,
}).round(2),
)
Z-Test:
from numpy import load, isclose, sqrt
from numpy.random import default_rng
from pathlib import Path
from scipy.stats import norm
from statsmodels.stats.weightstats import ztest
rng = default_rng(0)
data_dir = Path('data')
population = load(data_dir / 'temps.npy')
sample = rng.choice(
population,
size=round(population.size * (frac := 0.01)),
replace=False,
)
# Hypothesis
# - H₀: isclose(population.mean(), 76)
# - H₁: not isclose(population.mean(), 76)
# Assume we know the population std…
assert isclose(population.std(), 9.95, rtol=0.01)
print(
f'{sample.mean() = :.2f}',
'\n'.join([
# f'{(test_stat := abs(sample.mean() - 76) / (population.std() / sqrt(sample.size))) = :.4f}',
f'{(test_stat := abs(sample.mean() - 76) / (sample.std(ddof=1) / sqrt(sample.size))) = :.4f}',
f'{2 * norm().cdf(-test_stat) = :.4f}',
]),
'\n'.join([
f'{ztest(sample, value=76)[0] = :.4f}',
f'{ztest(sample, value=76)[1] = :.4f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from matplotlib.pyplot import subplots, show
from numpy import linspace
from scipy.stats import norm
fig, ax = subplots()
ax.plot(
sp := linspace(-3, 3, 100),
norm().pdf(sp),
)
ax.axvline(+2.0058)
ax.axvline(-2.0058)
show()
Type I Error: mistakenly reject null hypothesis (“false positive.”) Type II Error: mistakenly accept null hypothesis (“false negative.”)
α: level of significance.
p-value: assuming the null hypothesis is true (“there’s nothing there,”) probability of observing the given test results.
Test statistic: determined from the sample data and decides whether H₀ is rejected. Rejection region: the values of the test statistic for which H₀ is rejected.
Effect size: the magnitude of the effect.
“Cohen’s d”: µ₀ - µ₁ / σ
Effect size | d |
---|---|
Very small | 0.01 |
Small | 0.20 |
Medium | 0.50 |
Large | 0.80 |
Very large | 1.20 |
Huge | 2.00 |
from numpy import load, sqrt
from numpy.random import default_rng
from pathlib import Path
from scipy.stats import norm, t, ttest_1samp
rng = default_rng(0)
data_dir = Path('data')
population = load(data_dir / 'temps.npy')
sample = rng.choice(
population,
size=round(population.size * (frac := 0.01)),
replace=False,
)
# Hypothesis
# - H₀: isclose(population.mean(), 76)
# - H₁: not isclose(population.mean(), 76)
print(
f'{sample.mean() = :.2f}',
'\n'.join([
f'{(test_stat := abs(sample.mean() - 75) / (sample.std(ddof=1) / sqrt(sample.size))) = :.4f}',
f'{2 * norm().cdf(-test_stat) = :.4f}',
f'{2 * t(sample.size - 1).cdf(-test_stat) = :.4f}',
]),
'\n'.join([
f'{ttest_1samp(sample, popmean=75).statistic = :.4f}',
f'{ttest_1samp(sample, popmean=75).pvalue = :.4f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load
from numpy.random import default_rng
from pathlib import Path
from scipy.stats import norm, t, ttest_1samp
rng = default_rng(0)
data_dir = Path('data')
population = load(data_dir / 'temps.npy')
sample = rng.choice(
population,
size=round(population.size * (frac := 0.01)),
replace=False,
)
# Hypothesis
# - H₀: population.mean() ≥ 80
# - H₁: population.mean() < 80
print(
f'{sample.mean() = :.2f}',
'\n'.join([
f'{ttest_1samp(sample, popmean=80, alternative="less").statistic = :.4f}',
f'{ttest_1samp(sample, popmean=80, alternative="less").pvalue = :.4f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import load, arange
from numpy.random import default_rng
from pathlib import Path
from scipy.stats import norm, t, ttest_rel
rng = default_rng(0)
data_dir = Path('data')
population_before = load(data_dir / 'temps.before.npy')
population_after = load(data_dir / 'temps.after.npy')
sample = rng.choice(
arange(len(population_before)),
size=round(population_before.size * (frac := 0.1)),
replace=False,
)
sample_before = population_before[sample]
sample_after = population_after[sample]
# Hypothesis
# - H₀: isclose(population_before.mean(), population_after.mean())
# - H₁: not isclose(population_before.mean(), population_after.mean())
print(
'\n'.join([
f'{ttest_rel(sample_before, sample_after).statistic = :,.4f}',
f'{ttest_rel(sample_before, sample_after).pvalue = :.4f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Question: “How can I be more confident?” (Frequentist-vs-Bayesian frameworks.)
Follow-up:
print("Let's take a look!")
The classical approach…
H₀: “null hypothesis” H₁: “alternate hypothesis” α: “significance criterion” p-value: probability that H₀ is true given the data
An alternate approach…
Bayes’ Theorem: P(H₁ | data) = P(data | H₁)P(H₁) / P(data) |
P(data) = P(data | H₁)P(H₁) + P(data | H₀)P(H₀) |
P(H₁|data) → posterior probability (we want to determine) P(H₁) → prior probability (we have to provide) P(data|H₁) → likelihood (we have to compute) P(data) → model evidence
Question: “Are these related?” (Quantifying statistical relationships: correlations and covariances.)
Follow-up:
print("Let's take a look!")
from numpy import load, cov, diag, fliplr
from numpy.random import default_rng
from pathlib import Path
rng = default_rng(0)
data_dir = Path('data')
temps_population = load(data_dir / 'temperatures.npy')
temps_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
cpu_load_population = load(data_dir / 'cpu-load.npy')
cpu_load_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
print(
f'{((temps_sample - temps_sample.mean()) * (cpu_load_sample - cpu_load_sample.mean())).mean() = :.4f}',
f'{diag(fliplr(cov([temps_sample, cpu_load_sample], ddof=0)))[0] = :.4f}',
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Covariance:
Pearson correlation coëfficient; Spearman’s rank correlation.
from numpy import load, corrcoef
from numpy.random import default_rng
from pathlib import Path
from textwrap import dedent
rng = default_rng(0)
data_dir = Path('data')
temps_population = load(data_dir / 'temperatures.npy')
temps_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
cpu_load_population = load(data_dir / 'cpu-load.npy')
cpu_load_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
print(
dedent(f'''{
(
(temps_sample - temps_sample.mean()) / temps_sample.std()
* (cpu_load_sample - cpu_load_sample.mean()) / cpu_load_sample.std()
).mean() = :.6f}''').strip(),
corrcoef([temps_sample, cpu_load_sample]).round(6), # Pearson product-moment correlation coefficient
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Pearson Correlation Coefficient:
from numpy import load, corrcoef, polyval
from numpy.random import default_rng
from pathlib import Path
from itertools import product
rng = default_rng(0)
data_dir = Path('data')
data0 = rng.normal(size=5_000)
data1 = data0
data2 = 2 * data0 + 3
data3 = 4 * data0 + (5 * rng.random(size=data0.size))
data4 = polyval([6, 7, 8], data0)
data5 = polyval([9, 10, 11, 12, 13], data0)
print(
corrcoef([data0, data1]).round(4),
corrcoef([data0, data2]).round(4),
corrcoef([data0, data3]).round(4),
corrcoef([data0, data4]).round(4),
corrcoef([data0, data5]).round(4),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pathlib import Path
from pandas import read_pickle, DataFrame
from matplotlib.pyplot import subplot_mosaic, show
from numpy import corrcoef
data_dir = Path('data')
i = read_pickle(data_dir / 'anscombe.i.pickle')
ii = read_pickle(data_dir / 'anscombe.ii.pickle')
iii = read_pickle(data_dir / 'anscombe.iii.pickle')
iv = read_pickle(data_dir / 'anscombe.iv.pickle')
_, axes = subplot_mosaic([['i', 'ii'], ['iii', 'iv']])
for title, ax in axes.items(): ax.set_title(title)
axes['i'].scatter(i.index, i)
axes['ii'].scatter(ii.index, ii)
axes['iii'].scatter(iii.index, iii)
axes['iv'].scatter(iv.index, iv)
# show()
print(
DataFrame({
'i': i.apply(m := ['mean', 'std', 'skew', 'kurtosis']),
'ii': ii.apply(m),
'iii': iii.apply(m),
'iv': iv.apply(m),
}),
corrcoef(i.index, i).round(6),
corrcoef(ii.index, ii).round(6),
corrcoef(iii.index, iii).round(6),
corrcoef(iv.index, iv).round(6),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Spearman’s Rank Correlation:
from numpy import load, corrcoef, sqrt
from numpy.random import default_rng
from pathlib import Path
from textwrap import dedent
from scipy.stats import spearmanr, t, rankdata
rng = default_rng(0)
data_dir = Path('data')
temps_population = load(data_dir / 'temperatures.npy')
temps_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
cpu_load_population = load(data_dir / 'cpu-load.npy')
cpu_load_sample = rng.choice(
temps_population,
size=round(temps_population.size * (frac := 0.01)),
replace=False,
)
print(
# rankdata(temps_sample),
corrcoef([rankdata(temps_sample), rankdata(cpu_load_sample)]).round(6),
'\n'.join([
f'{spearmanr(temps_sample, cpu_load_sample).correlation = :>7.4f}',
f'{spearmanr(temps_sample, cpu_load_sample).pvalue = :>7.4f}',
]),
'\n'.join([
f'{(df := temps_sample.size - 2) = :>7.4f}',
f'{(corr := spearmanr(temps_sample, cpu_load_sample).correlation) = :>7.4f}',
f'{(stat := corr * sqrt(df) / sqrt(1 - corr**2)) = :>7.4f}',
f'{2 * t(df=df).sf(abs(stat)) = :>7.4f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Question: “What’s coming next?” (Estimation with simple statistical models: linear regression, multiple linear regression, Ridge/LASSO regression.)
Follow-up:
print("Let's take a look!")
y = α + βx
ε = (α + βx) - y
from pathlib import Path
from numpy import load, arange, cov, diag, allclose, meshgrid, array, newaxis, argmin, ravel
from numpy.random import default_rng
from scipy.stats import linregress
from textwrap import dedent
rng = default_rng(0)
data_dir = Path('data')
xs = load(data_dir / 'temperatures.npy')[:100]
ys = 2 * xs + 3
# ys = (2 + rng.random(size=xs.size)/10) * xs + (3 + rng.random(size=xs.size))
alphas = arange(0, 5, .01)
betas = arange(0, 5, .01)
candidates = array([*map(ravel, meshgrid(betas, alphas))])
print(
dedent(f'''
{candidates[:,
argmin(((
(candidates[0][..., newaxis] * xs)
+
candidates[-1][..., newaxis]
-
ys
)**2).sum(axis=-1))
] = }
''').strip(),
'\n'.join([
f'{(beta := diag(cov(xs, ys, ddof=0), -1)[-1] / xs.var()) = :.2f}',
f'{(alpha := ys.mean() - beta * xs.mean()) = :.2f}',
]),
f'{allclose(alpha + beta * xs, ys) = }',
'\n'.join([
f'{linregress(xs, ys).slope = :.2f}',
f'{linregress(xs, ys).intercept = :.2f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
Coëfficient of determination.
from pathlib import Path
from numpy import load, sqrt, corrcoef, diag
from numpy.random import default_rng
from scipy.stats import linregress, pearsonr, t
rng = default_rng(0)
data_dir = Path('data')
xs = load(data_dir / 'temperatures.npy')
ys = (2 + rng.random(size=xs.size)/10) * xs + (3 + rng.random(size=xs.size))
zs = rng.random(size=xs.size)
print(
'\n'.join([
f'{(beta := linregress(xs, ys).slope) = :.2f}',
f'{(alpha := linregress(xs, ys).intercept) = :.2f}',
f'{linregress(xs, ys).rvalue = :.6f}', # correlation coëfficient
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from pandas import DataFrame, Series
from pathlib import Path
from numpy import load, array, ones_like, newaxis
from numpy.linalg import lstsq, pinv
from numpy.random import default_rng
from scipy.stats import linregress, f_oneway
from statsmodels.formula.api import ols
from sklearn.linear_model import LinearRegression
from textwrap import dedent
rng = default_rng(0)
data_dir = Path('data')
xs0 = load(data_dir / 'measurement1.npy')
xs1 = load(data_dir / 'measurement2.npy')
xs2 = load(data_dir / 'measurement3.npy')
ys = (
(5 + (rng.random(size=(3, xs0.size)) - .5) / 10)
*
array([xs0, xs1, xs2])
+
(5 + rng.random(size=xs0.size))
).sum(axis=0)
print(
DataFrame({
('ys', name): Series(linregress(data, ys)._asdict())
for name, data in {'xs0': xs0, 'xs1': xs1, 'xs2': xs2}.items()
}).round(4),
# β₀x₀ + β₁x₁ + β₂x₂ + α = y
# [ x₀ x₁ x₂ 1 ] [ β₀ ] = [ y ]
# … … … … · [ β₁ ] …
# … … … … [ β₂ ] …
# … … … … [ α ] …
dedent(f'''
{(pinv(
array([xs0, xs1, xs2, ones_like(xs0)]).T,
) @ ys).round(4) = }
''').strip(),
dedent(f'''
{lstsq(
array([xs0, xs1, xs2, ones_like(xs0)]).T,
ys,
rcond=None,
)[0].round(4) = }
''').strip(),
# dedent(f'''
# {(lr := ols(
# formula='ys ~ xs0 + xs1 + xs2',
# data=DataFrame({'xs0': xs0, 'xs1': xs1, 'xs2': xs2, 'ys': ys}),
# ).fit())}
# ''').strip(),
# lr.params.round(4),
# f'{lr.rsquared.round(6) = }',
f'{(lr := LinearRegression().fit(array([xs0, xs1, xs2]).T, ys)) = }',
Series({
'Intercept': lr.intercept_,
**dict(zip('xs0 xs1 xs2'.split(), lr.coef_))
}).round(4),
'\n'.join([
f'{(data := array([xs0, xs1, xs2]).T)[0, :3] = }',
f'{(lr := LinearRegression().fit(data, ys)) = }',
f'{(guess := (lr.coef_ * data).sum(axis=-1) + lr.intercept_)[:3] = }',
f'{(actual := ys)[:3] = }',
f'{(ssto := ((actual - ys.mean())**2).sum()) = }', # total deviation
f'{(ssr := ((guess - ys.mean())**2).sum()) = }', # regression sum of squares
f'{(sse := ((actual - guess)**2).sum()) = }', # residual sum of squares
f'{(rsquared := ssr / ssto) = :.6f}',
f'{(rsquared := 1 - sse / ssto) = :.6f}',
]),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)
from numpy import zeros
from numpy.random import default_rng
from scipy.optimize import minimize
from sklearn.linear_model import LinearRegression, Ridge, Lasso
rng = default_rng(0)
# y = β₀x₀ + β₁x₁ …
# guess = coeffs @ source
# Linear Regression:
# - minimise cost function
# - cost function: ((actual - guess)**2).sum()
# ((actual - coeffs @ source)**2).sum()
# Ridge Regression:
# - minimise cost function
# - cost function: ((actual - guess)**2).sum() + factor * (coeffs ** 2).sum()
# - factor = 1 ⇒ same as Linear Regression
# “shrinkage parameter”/“tuning parameter”
# Lasso Regression:
# - minimise cost function
# - cost function: ((actual - guess)**2).sum() + factor * abs(coeffs)
xs = rng.normal(size=(10, 100))
ws = rng.uniform(-10, 10, size=xs.shape[0])
ws[3] = 100
ws[4] = 0
ys = ws @ xs
print(
# '\n'.join([
# f'{(lr := LinearRegression(fit_intercept=False).fit(xs.T, ys)) = }',
# f'{lr.coef_.round(2)}',
# ]),
minimize(
fun=lambda ws, xs, ys: ((ys - ws @ xs)**2).sum(),
x0=zeros(shape=xs.shape[0]),
args=(xs, ys),
).x.round(2),
'\n'.join([
f'{(lr := Ridge(fit_intercept=False, alpha=1).fit(xs.T, ys)) = }',
f'{lr.coef_.round(2)}',
]),
minimize(
fun=lambda ws, xs, ys, alpha: ((ys - ws @ xs)**2).sum() + alpha * (ws**2).sum(),
x0=zeros(shape=xs.shape[0]),
args=(xs, ys, 1),
).x.round(2),
'\n'.join([
f'{(lr := Lasso(fit_intercept=False, alpha=1).fit(xs.T, ys)) = }',
f'{lr.coef_.round(2)}',
]),
# minimize(
# fun=lambda ws, xs, ys, alpha: ((ys - ws @ xs)**2).sum() + alpha * abs(ws).sum(),
# x0=zeros(shape=xs.shape[0]),
# args=(xs, ys, 1),
# ).x.round(2),
# minimize(
# fun=lambda ws, xs, ys, alpha: 1 / (2 * xs.shape[-1]) * ((ys - ws @ xs)**2).sum() + alpha * abs(ws).sum(),
# x0=zeros(shape=xs.shape[0]),
# args=(xs, ys, 1),
# ).x.round(2),
sep='\n{}\n'.format('\N{box drawings light horizontal}' * 40),
)