Statistical Analysis with NumPy

Duration: 8 min

Try it in Google Colab:

Descriptive Statistics

import numpy as np

data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Mean (average)
mean = np.mean(data)  # 5.5

# Median (middle value; with an even count, the average of the two middle values)
median = np.median(data)  # 5.5

# Mode (most frequent) - NumPy has no mode function, so use scipy
from scipy import stats
# When several values tie for most frequent, scipy returns the smallest of them
mode = stats.mode(data).mode  # 1 (every value appears once, so the smallest wins)

# Standard deviation (population form, ddof=0; pass ddof=1 for the sample estimate)
std = np.std(data)  # ≈ 2.87

# Variance (population form, ddof=0; pass ddof=1 for the sample estimate)
var = np.var(data)  # 8.25

# Min and max
min_val = np.min(data)  # 1
max_val = np.max(data)  # 10

# Range (spread between the largest and smallest value)
range_val = max_val - min_val  # 9

Percentiles and Quantiles

# Seeded generator so the numbers printed or inspected here repeat on every run
rng = np.random.default_rng(42)
data = rng.normal(100, 15, 1000)  # 1000 draws from a normal with mean 100, sd 15

# Percentiles (q is given on a 0-100 scale)
p25 = np.percentile(data, 25)  # 25th percentile
p50 = np.percentile(data, 50)  # 50th percentile (median)
p75 = np.percentile(data, 75)  # 75th percentile

# Quantiles (same statistic, but q is given on a 0-1 scale, so q1 matches p25, etc.)
q1 = np.quantile(data, 0.25)
q2 = np.quantile(data, 0.50)
q3 = np.quantile(data, 0.75)

# Interquartile range (width of the middle 50% of the data)
iqr = q3 - q1

Hypothesis Testing Basics

# Compare two samples drawn from normal populations whose means differ by 5
# (seeded generator so the printed statistics repeat on every run)
rng = np.random.default_rng(42)
sample1 = rng.normal(100, 15, 100)
sample2 = rng.normal(105, 15, 100)

# T-test for two independent samples. By default it assumes equal population
# variances, which holds here because both samples are drawn with sd 15.
from scipy.stats import ttest_ind
t_stat, p_value = ttest_ind(sample1, sample2)

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")

# If p-value < 0.05, reject the null hypothesis that the two population means
# are equal (at the 5% significance level). A p-value >= 0.05 only means there
# is not enough evidence of a difference, not that the means are equal.

Correlation Analysis

# Generate correlated data: y is a linear function of x plus unit-variance noise
# (seeded generator so the printed values repeat on every run)
rng = np.random.default_rng(42)
x = rng.standard_normal(100)
y = 2 * x + rng.standard_normal(100)

# Pearson correlation (strength of the linear relationship, between -1 and 1)
from scipy.stats import pearsonr
corr, p_value = pearsonr(x, y)

# Population value is cov(x, y) / (sd(x) * sd(y)) = 2 / sqrt(5) ≈ 0.894;
# the sample estimate from 100 points lands close to that.
print(f"Correlation: {corr}")  # ≈ 0.89
print(f"p-value: {p_value}")

Regression Analysis

# Simple linear regression on five observations
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])

# Least-squares fit of a degree-1 polynomial, y = mx + b.
# polyfit returns the coefficients highest power first: [slope, intercept].
coefficients = np.polyfit(x, y, 1)
m = coefficients[0]
b = coefficients[1]

# Predictions: fitted values on the regression line
y_pred = m * x + b

# R-squared: share of the variation in y that the line accounts for
residuals = y - y_pred
deviations = y - y.mean()
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum(deviations ** 2)
r_squared = 1 - ss_res / ss_tot

print(f"Slope: {m}")
print(f"Intercept: {b}")
print(f"R²: {r_squared}")

Distribution Fitting

# Generate data from normal distribution
# (seeded generator so the printed estimates repeat on every run)
rng = np.random.default_rng(42)
data = rng.normal(loc=100, scale=15, size=1000)

# Fit normal distribution (maximum-likelihood estimates of mean and std dev)
from scipy.stats import norm
mu, sigma = norm.fit(data)

print(f"Mean: {mu}")
print(f"Std Dev: {sigma}")

# Test goodness of fit with a one-sample Kolmogorov-Smirnov test.
# CAVEAT: mu and sigma were estimated from this same sample, so the fitted
# curve is pulled toward the data and the reported p-value is biased upward
# (the test is conservative). Treat it as a rough sanity check; for a
# calibrated p-value use a method that accounts for the estimation step,
# e.g. a Monte Carlo test such as scipy.stats.goodness_of_fit.
from scipy.stats import kstest
ks_stat, p_value = kstest(data, 'norm', args=(mu, sigma))

print(f"KS statistic: {ks_stat}")
print(f"p-value: {p_value}")

❓ What does np.std() compute?

A) The average value   B) The variance   C) The standard deviation   D) The median

Statistical Analysis with NumPy

Descriptive Statistics

Percentiles and Quantiles

Hypothesis Testing Basics

Correlation Analysis

Regression Analysis

Distribution Fitting

Related Courses