Module 5 of 6 · Applied Maths with NumPy · Beginner

Statistical Analysis with NumPy

Duration: 8 min

Try it in Google Colab: Open in Colab

Descriptive Statistics

import numpy as np

data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

# Mean (average)
mean = np.mean(data)  # 5.5

# Median (middle value; with an even count, the average of the two middle values)
median = np.median(data)  # 5.5

# Mode (most frequent) - use scipy
from scipy import stats
# Every value in this data appears exactly once, so there is no meaningful
# mode; SciPy still reports a single value for the tie (1 for this data).
mode = stats.mode(data).mode  # 1

# Standard deviation
# np.std defaults to ddof=0 (population formula, divide by N).
# Pass ddof=1 for the sample estimate (≈ 3.03 for this data).
std = np.std(data)  # ≈ 2.87

# Variance (same ddof=0 default; ddof=1 would give ≈ 9.17)
var = np.var(data)  # 8.25

# Min and max
min_val = np.min(data)  # 1
max_val = np.max(data)  # 10

# Range (spread between the largest and smallest value)
range_val = max_val - min_val  # 9

Percentiles and Quantiles

# 1000 draws from a normal distribution with mean 100 and std dev 15
data = np.random.normal(loc=100, scale=15, size=1000)

# Percentiles take ranks on a 0-100 scale; several can be requested at once
# (25th, 50th = median, 75th)
p25, p50, p75 = np.percentile(data, [25, 50, 75])

# Quantiles are the same cut points expressed on a 0-1 scale
q1, q2, q3 = np.quantile(data, [0.25, 0.50, 0.75])

# Interquartile range: width of the middle 50% of the data
iqr = q3 - q1

Hypothesis Testing Basics

# Two independent samples of 100 values; the second is drawn around a mean of
# 105 instead of 100, with the same spread
sample1 = np.random.normal(loc=100, scale=15, size=100)
sample2 = np.random.normal(loc=105, scale=15, size=100)

# Independent two-sample t-test on the sample means
from scipy.stats import ttest_ind
test_result = ttest_ind(sample1, sample2)
t_stat = test_result.statistic
p_value = test_result.pvalue

print(f"t-statistic: {t_stat}")
print(f"p-value: {p_value}")

# A p-value below 0.05 is the conventional threshold for calling the
# difference between the two sample means statistically significant

Correlation Analysis

# Generate correlated data: y is a linear function of x plus independent
# standard-normal noise
x = np.random.randn(100)
noise = np.random.randn(100)
y = 2 * x + noise

# Pearson correlation
from scipy.stats import pearsonr
corr, p_value = pearsonr(x, y)

# For this model the population correlation is 2 / sqrt(5) ≈ 0.89
# (Var(y) = 4 + 1 = 5, Cov(x, y) = 2). The printed sample value will differ
# slightly from run to run unless a random seed is set.
print(f"Correlation: {corr}")  # ≈ 0.89
print(f"p-value: {p_value}")

Regression Analysis

# Simple linear regression on five (x, y) observations
x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 5, 4, 5])

# Least-squares fit of a degree-1 polynomial, y = mx + b;
# the coefficients come back as [slope, intercept]
coefficients = np.polyfit(x, y, deg=1)
m, b = coefficients

print(f"Slope: {m}")
print(f"Intercept: {b}")

# Fitted values at the observed x positions
y_pred = b + m * x

# R-squared = 1 - (residual sum of squares / total sum of squares)
residuals = y - y_pred
deviations = y - y.mean()
ss_res = np.square(residuals).sum()
ss_tot = np.square(deviations).sum()
r_squared = 1 - ss_res / ss_tot

print(f"R²: {r_squared}")

Distribution Fitting

# Draw 1000 values from a normal distribution (mean 100, std dev 15)
data = np.random.normal(loc=100, scale=15, size=1000)

# Estimate the normal distribution's parameters from the sample
from scipy.stats import norm
fitted_params = norm.fit(data)
mu, sigma = fitted_params

print(f"Mean: {mu}")
print(f"Std Dev: {sigma}")

# Kolmogorov-Smirnov goodness-of-fit test against the fitted normal.
# NOTE: mu and sigma were estimated from this same sample, so the reported
# p-value is not exact (it tends to be too large); treat it as illustrative.
# A Lilliefors-type correction is the usual remedy.
from scipy.stats import kstest
ks_result = kstest(data, 'norm', args=fitted_params)
ks_stat = ks_result.statistic
p_value = ks_result.pvalue

print(f"KS statistic: {ks_stat}")
print(f"p-value: {p_value}")

❓ What does np.std() compute?

← Previous Continue interactively → Next →

Related Courses