NLP & Language Models
1. Text Processing & Tokenization
Learn to clean and prepare text data for NLP models.
# Text preprocessing with NLTK
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the data files used below. word_tokenize needs the Punkt tokenizer
# data ("punkt" on older NLTK releases, "punkt_tab" on newer ones) and
# stopwords.words needs the "stopwords" corpus; without them the calls raise
# LookupError. Downloads are skipped when the data is already up to date.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource, quiet=True)

# Tokenization (lower-cased so tokens compare equal to the lower-case stopword list)
text = "Natural Language Processing is fascinating!"
tokens = word_tokenize(text.lower())

# Remove stopwords (a set gives O(1) membership tests)
# NOTE: punctuation tokens such as "!" are not stopwords, so they are kept.
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w not in stop_words]

# Stemming: reduce each remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered_tokens]
2. Sentiment Analysis
Use pre-trained models to analyze emotions and opinions in text.
# Sentiment analysis with transformers
from transformers import pipeline

# Load pre-trained sentiment model.
# The checkpoint is named explicitly so results do not depend on whichever
# default the installed transformers version picks for this task.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Analyze sentiment: the pipeline returns one {'label', 'score'} dict per input
texts = ["I love this product!", "This is terrible."]
results = sentiment_pipeline(texts)

for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.3f}")
3. Named Entity Recognition (NER)
Extract entities like names, locations, and organizations from text.
# NER with spaCy
import spacy

# Load language model.
# The model is a separate package; install it once with
#   python -m spacy download en_core_web_sm
# otherwise spacy.load fails because the model cannot be found.
nlp = spacy.load("en_core_web_sm")

# Process text
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
doc = nlp(text)

# Extract entities: print each span with its label and a human-readable
# description of that label
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Description: {spacy.explain(ent.label_)}")
4. Building a Chatbot
Create conversational AI using modern language models.
# Simple chatbot with Hugging Face
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load model and tokenizer.
# Both are created from the same checkpoint name so they stay in sync;
# chat_with_bot() reads `tokenizer` and `model` as module-level names.
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Chat function
def chat_with_bot(user_input, chat_history_ids=None):
    """Generate the bot's reply to ``user_input`` and return the updated history.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        user_input: The user's message as plain text.
        chat_history_ids: Token-id tensor returned by a previous call, or
            ``None`` to start a new conversation.

    Returns:
        A ``(response, chat_history_ids)`` tuple: the decoded reply text and
        the token-id tensor produced by ``model.generate`` (prompt plus
        reply), to be passed back in on the next turn.
    """
    # Encode user input; the EOS token is appended to mark the end of the turn
    new_user_input_ids = tokenizer.encode(
        user_input + tokenizer.eos_token, return_tensors='pt'
    )

    # Append to chat history (first turn: the new input is the whole prompt)
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # Generate response
    chat_history_ids = model.generate(
        bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id
    )

    # Decode response: slice off the prompt so only newly generated tokens remain
    reply_ids = chat_history_ids[:, bot_input_ids.shape[-1]:][0]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return response, chat_history_ids