Loading HuggingFace Datasets
Duration: 5 min
The datasets library lets you load any of HuggingFace's 50,000+ datasets in one line, with automatic caching, streaming for large datasets, and easy conversion to pandas.
Loading a dataset
from datasets import load_dataset
# Fetch the IMDB sentiment dataset (a cached copy is reused on later runs)
dataset = load_dataset('imdb')
print(dataset)
# Abbreviated output:
# DatasetDict({
#     train: Dataset({features: ['text', 'label'], num_rows: 25000})
#     test: Dataset({features: ['text', 'label'], num_rows: 25000})
# })
# NOTE: the Hub copy of IMDB may also list an 'unsupervised' split — confirm on your install.
# Pick out one split by name, then look at its first record
train = dataset['train']
first_example = train[0]
print(first_example)  # first example
print(train.features)  # column types
Filtering and converting to pandas
from datasets import load_dataset
import pandas as pd
dataset = load_dataset('imdb')
train = dataset['train']
# Keep only the rows labelled 1 (the positive reviews)
positive = train.filter(lambda row: row['label'] == 1)
print(f'Positive reviews: {len(positive)}')
# Hand the split to pandas for familiar analysis
df = train.to_pandas()
label_counts = df['label'].value_counts()
print(label_counts)
print(df['text'].str.len().describe())
Streaming large datasets
from datasets import load_dataset
# Stream a huge dataset without downloading it all
# The English Wikipedia dump is many gigabytes — streaming makes it usable
dataset = load_dataset('wikipedia', '20220301.en', streaming=True)
# Iterate over examples one at a time; each is fetched lazily as the loop advances
for i, example in enumerate(dataset['train']):
    print(example['title'])
    # Stop after the first five examples so the rest is never fetched
    if i >= 4:
        break
# Algebra
# Anthropology
# Arithmetic
# Art
# Astronomy
💡 Tip: Use streaming=True for any dataset over a few GB. Examples are fetched on the fly as you iterate, so you can start processing without ever downloading the full thing.
❓ When should you use streaming=True with load_dataset()?