Statistical Analysis with NumPy
Duration: 8 min
Descriptive Statistics
import numpy as np
from scipy import stats

# Small integer sample used for every descriptive statistic below
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# Mean (average)
mean = np.mean(data) # 5.5
# Median (middle value; for an even count, the average of the two middle values)
median = np.median(data) # 5.5
# Mode (most frequent) - NumPy has no mode function, so use scipy.
# Every value here appears exactly once, so all ten are tied for the mode;
# SciPy reports only a single one of the tied values.
mode = stats.mode(data).mode # 1
# Standard deviation (population form, ddof=0; pass ddof=1 for the sample estimate)
std = np.std(data) # ≈ 2.87
# Variance (population form, ddof=0; equals std ** 2)
var = np.var(data) # 8.25
# Min and max
min_val = np.min(data) # 1
max_val = np.max(data) # 10
# Range (difference between the largest and smallest value)
range_val = max_val - min_val # 9
Percentiles and Quantiles
# Draw 1000 values from a normal distribution with mean 100 and std dev 15
data = np.random.normal(100, 15, 1000)
# Percentiles: cut points given on a 0-100 scale, all computed in one call
p25, p50, p75 = np.percentile(data, [25, 50, 75]) # p50 is the median
# Quantiles: the same cut points given as fractions between 0 and 1
q1, q2, q3 = np.quantile(data, [0.25, 0.50, 0.75])
# Interquartile range (width of the middle 50% of the data)
iqr = q3 - q1
Hypothesis Testing Basics
from scipy.stats import ttest_ind

# Two independent samples of 100 draws each; the underlying means differ by 5
sample1 = np.random.normal(loc=100, scale=15, size=100)
sample2 = np.random.normal(loc=105, scale=15, size=100)
# Independent two-sample t-test (SciPy's default assumes equal variances)
result = ttest_ind(sample1, sample2)
t_stat = result.statistic
p_value = result.pvalue
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")
# If p-value < 0.05, the sample means differ significantly (at the 5% level)
Correlation Analysis
from scipy.stats import pearsonr

# Build correlated data: y is a linear function of x plus standard-normal noise
x = np.random.randn(100)
noise = np.random.randn(100)
y = 2 * x + noise
# Pearson correlation coefficient together with its p-value
corr, p_value = pearsonr(x, y)
print(f"Correlation: {corr}") # ≈ 0.89 in expectation (2/√5); varies from run to run
print(f"p-value: {p_value}")
Regression Analysis
# Simple linear regression on five (x, y) observations
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])
# Least-squares fit of the line y = mx + b.
# polyfit lists coefficients from the highest power down: [slope, intercept]
coefficients = np.polyfit(x, y, deg=1)
m = coefficients[0]
b = coefficients[1]
print(f"Slope: {m}")
print(f"Intercept: {b}")
# Predictions: evaluate the fitted line at each x
y_pred = np.polyval(coefficients, x)
# R-squared: 1 minus (unexplained variation / total variation around the mean)
residuals = y - y_pred
deviations = y - y.mean()
ss_res = np.square(residuals).sum()
ss_tot = np.square(deviations).sum()
r_squared = 1 - ss_res / ss_tot
print(f"R²: {r_squared}")
Distribution Fitting
from scipy.stats import kstest, norm

# Generate data from normal distribution
data = np.random.normal(loc=100, scale=15, size=1000)
# Fit normal distribution (maximum-likelihood estimates of the mean and std dev)
mu, sigma = norm.fit(data)
print(f"Mean: {mu}")
print(f"Std Dev: {sigma}")
# Test goodness of fit with the one-sample Kolmogorov-Smirnov test.
# CAUTION: mu and sigma were estimated from the very data being tested, so the
# standard KS p-value is biased upward (the test is too lenient). Treat it as a
# rough diagnostic only; for a valid p-value use a Lilliefors-type correction
# or test against parameters that were fixed independently of this sample.
ks_stat, p_value = kstest(data, 'norm', args=(mu, sigma))
print(f"KS statistic: {ks_stat}")
print(f"p-value: {p_value}")
❓ What does np.std() compute?