NLP & Language Models

1. Text Processing & Tokenization

Learn to clean and prepare text data for NLP models.

# Text preprocessing with NLTK
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK data this snippet relies on. A fresh NLTK install ships
# without corpora or tokenizer models, in which case word_tokenize() and
# stopwords.words() raise LookupError. Older NLTK releases read the "punkt"
# tokenizer data, newer ones read "punkt_tab", so both are requested.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource, quiet=True)

# Tokenization (lower-cased first so tokens can match the stopword list)
text = "Natural Language Processing is fascinating!"
tokens = word_tokenize(text.lower())

# Remove stopwords; the list is turned into a set so each membership test
# inside the comprehension is a hash lookup rather than a list scan
stop_words = set(stopwords.words('english'))
filtered_tokens = [w for w in tokens if w not in stop_words]

# Stemming: reduce every remaining token to its Porter stem
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in filtered_tokens]

2. Sentiment Analysis

Analyze the emotions and opinions expressed in text, here using a pre-trained model.

# Sentiment analysis with transformers
from transformers import pipeline

# Load pre-trained sentiment model. The checkpoint is named explicitly
# instead of relying on the task's implicit default, so the snippet keeps
# producing the same labels if the library's default ever changes.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Analyze sentiment for the whole list in a single pipeline call
texts = ["I love this product!", "This is terrible."]
results = sentiment_pipeline(texts)

# Each result is indexed by 'label' and 'score'; results are paired with
# their input sentences by position.
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.3f}")

3. Named Entity Recognition (NER)

Extract entities like names, locations, and organizations from text.

# NER with spaCy
import spacy

# Load the "en_core_web_sm" pipeline by name
nlp = spacy.load("en_core_web_sm")

# Run the pipeline over a sample sentence
text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
doc = nlp(text)

# Report every entity found in the document together with its label and
# the explanation spaCy provides for that label
for ent in doc.ents:
    label = ent.label_
    description = spacy.explain(label)
    print(f"Entity: {ent.text}, Label: {label}, Description: {description}")

4. Building a Chatbot

Create conversational AI using modern language models.

# Simple chatbot with Hugging Face
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
# The same checkpoint name is passed to both from_pretrained() calls so the
# tokenizer and the model come from one checkpoint. Both objects are
# module-level globals that chat_with_bot() below reads directly.
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Chat function
def chat_with_bot(
    user_input: str,
    chat_history_ids: "torch.Tensor | None" = None,
) -> "tuple[str, torch.Tensor]":
    """Generate the bot's reply to one user message.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        user_input: The user's message for this turn.
        chat_history_ids: Token ids of the conversation so far, as returned
            by a previous call, or ``None`` to start a new conversation.

    Returns:
        A ``(response, chat_history_ids)`` pair: the decoded reply text and
        the token-id tensor returned by ``model.generate`` (the prompt plus
        the generated reply), to be passed into the next call.
    """
    # Encode user input, terminated by the EOS token so turns are delimited
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    # Append to chat history along the last (token) dimension
    if chat_history_ids is None:
        bot_input_ids = new_user_input_ids
    else:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)

    # Generate response. The pad token id is set to the EOS id, so padding
    # cannot be told apart from real EOS separators by value alone; nothing
    # here is padding, hence an explicit all-ones attention mask.
    chat_history_ids = model.generate(
        bot_input_ids,
        attention_mask=torch.ones_like(bot_input_ids),
        max_length=1000,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the tokens generated after the prompt (first row of the batch)
    reply_ids = chat_history_ids[:, bot_input_ids.shape[-1]:][0]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return response, chat_history_ids