NLP Experiments

Experiment 1: Tokenization, Lemmatization, Stemming, Stop Words Removal, and Regular Expressions

import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords

# Sample text
text = "Natural Language Processing is an exciting field. It involves making computers understand human languages. We will explore tokenization, stemming, and lemmatization."

print("Original Text:\n", text)

# 1. Tokenization
print("\n--- Tokenization ---")
# Sentence Tokenization
sentences = sent_tokenize(text)
print("Sentences:", sentences)
# Word Tokenization
words = word_tokenize(text.lower()) # Convert to lowercase for consistency
print("Words:", words)

# 2. Stemming
print("\n--- Stemming ---")
porter = PorterStemmer()
stemmed_words = [porter.stem(word) for word in words]
print("Stemmed Words:", stemmed_words)

# 3. Lemmatization
print("\n--- Lemmatization ---")
lemmatizer = WordNetLemmatizer()
# Note: Lemmatization can be improved by providing Part-of-Speech tags
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized Words (without POS tags):", lemmatized_words)
# Example with POS tag (simplified for demonstration)
lemmatized_words_pos = [lemmatizer.lemmatize(word, pos='v') for word in words] # Assuming all are verbs
# print("Lemmatized Words (assuming verbs):", lemmatized_words_pos)


# 4. Stop Words Removal
print("\n--- Stop Words Removal ---")
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word.isalnum() and word not in stop_words] # also remove punctuation
print("Words after Stopword Removal:", filtered_words)

# 5. Regular Expressions
print("\n--- Regular Expressions ---")
# a. Search for a pattern (e.g., words starting with 'lang')
pattern_search = r"\blang\w*"
found = re.findall(pattern_search, text, re.IGNORECASE)
print(f"Words matching '{pattern_search}': {found}")

# b. Split text by a delimiter (e.g., sentences based on punctuation)
pattern_split = r"[.!?]"
sentences_re = re.split(pattern_split, text)
print("Sentences split by regex:", [s.strip() for s in sentences_re if s.strip()])

# c. Substitution (e.g., replace the full form with the abbreviation 'NLP')
pattern_sub = r"\bNatural Language Processing\b"
substituted_text = re.sub(pattern_sub, "NLP", text)
print("Text after substitution:", substituted_text)

# Example paragraph for exercise
paragraph = """Tokenization is the first step. Stemming reduces words to their root form, which may not be a valid word.
Lemmatization, on the other hand, reduces words to their dictionary form (lemma).
Stop words like 'is', 'a', 'the' are often removed."""

print("\n--- Processing Sample Paragraph ---")
words_p = word_tokenize(paragraph.lower())
print("Tokens:", words_p)

stemmed_p = [porter.stem(w) for w in words_p if w.isalnum()]
print("Stemmed:", stemmed_p)

lemmatized_p = [lemmatizer.lemmatize(w) for w in words_p if w.isalnum()]
print("Lemmatized:", lemmatized_p)

filtered_p = [w for w in words_p if w.isalnum() and w not in stop_words]
print("Stop words removed:", filtered_p)
        

Experiment 2: Bag of Words, TF-IDF

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents
doc1 = "The cat sat on the mat."
doc2 = "The dog played in the park."
doc3 = "The cat and the dog are friends."
corpus = [doc1, doc2, doc3]

print("Corpus:\n", corpus)

# 1. Bag of Words (BoW)
print("\n--- Bag of Words (BoW) ---")
vectorizer_bow = CountVectorizer()
bow_matrix = vectorizer_bow.fit_transform(corpus)
print("Feature Names (Vocabulary):", vectorizer_bow.get_feature_names_out())
print("BoW Matrix (dense):\n", bow_matrix.toarray())

# 2. TF-IDF (Term Frequency-Inverse Document Frequency)
print("\n--- TF-IDF ---")
vectorizer_tfidf = TfidfVectorizer()
tfidf_matrix = vectorizer_tfidf.fit_transform(corpus)
print("Feature Names (Vocabulary):", vectorizer_tfidf.get_feature_names_out())
print("TF-IDF Matrix (dense):\n", tfidf_matrix.toarray())
        

Experiment 3: Word2Vec, Word Embeddings

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Sample sentences for training Word2Vec
# In a real scenario, you'd use a much larger corpus
corpus_text = [
    "king is a strong man",
    "queen is a wise woman",
    "man is a human",
    "woman is a human",
    "king and queen rule the kingdom",
    "boy is a young man",
    "girl is a young woman"
]

print("Training Corpus:\n", corpus_text)

# Tokenize the corpus
tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in corpus_text]
print("\nTokenized Corpus for Word2Vec:\n", tokenized_corpus)

# Train Word2Vec model
# Parameters:
# - sentences: The tokenized corpus
# - vector_size: Dimensionality of the word vectors (e.g., 100)
# - window: Maximum distance between the current and predicted word within a sentence
# - min_count: Ignores all words with total frequency lower than this
# - workers: Number of CPU cores to use for training
# - sg: Training algorithm: 1 for skip-gram; otherwise CBOW (0).
model_w2v = Word2Vec(sentences=tokenized_corpus, vector_size=50, window=5, min_count=1, sg=0, workers=4)

print("\n--- Word2Vec ---")

# Get the vocabulary
vocabulary = list(model_w2v.wv.index_to_key)
print("\nModel Vocabulary:", vocabulary)

# Get the vector for a specific word
try:
    vector_king = model_w2v.wv['king']
    print("\nVector for 'king':\n", vector_king)
    print("Vector size:", len(vector_king))
except KeyError:
    print("\n'king' not in vocabulary.")


# Find similar words
try:
    similar_to_king = model_w2v.wv.most_similar('king', topn=3)
    print("\nWords most similar to 'king':", similar_to_king)
except KeyError:
    print("\n'king' not in vocabulary to find similar words.")

try:
    similar_to_woman = model_w2v.wv.most_similar('woman', topn=3)
    print("\nWords most similar to 'woman':", similar_to_woman)
except KeyError:
    print("\n'woman' not in vocabulary to find similar words.")

# Analogy: king - man + woman = queen
try:
    analogy_result = model_w2v.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)
    print("\nAnalogy 'king - man + woman':", analogy_result) # Expected: queen
except KeyError as e:
    print(f"\nCould not perform analogy, word not in vocab: {e}")


print("\n--- Word Embeddings ---")
print("Word embeddings are the vectors learned by models like Word2Vec.")
print("For example, the vector for 'queen' is its embedding:")
try:
    embedding_queen = model_w2v.wv['queen']
    print(embedding_queen)
except KeyError:
    print("'queen' not in vocabulary.")
        

Experiment 4: Minimum Edit Distance (Levenshtein Distance)

import nltk # NLTK provides a direct function for this

def levenshtein_distance_custom(s1, s2):
    """
    A simple implementation of Levenshtein distance.
    For assignments, nltk.edit_distance is usually sufficient and preferred.
    """
    if len(s1) < len(s2):
        return levenshtein_distance_custom(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

# Strings for comparison
string1 = "kitten"
string2 = "sitting"

string3 = "sunday"
string4 = "saturday"

string5 = "intention"
string6 = "execution"

print(f"--- Minimum Edit Distance (Levenshtein) ---")

# Using NLTK's built-in function
dist_nltk1 = nltk.edit_distance(string1, string2)
print(f"NLTK Edit distance between '{string1}' and '{string2}': {dist_nltk1}")

dist_nltk2 = nltk.edit_distance(string3, string4)
print(f"NLTK Edit distance between '{string3}' and '{string4}': {dist_nltk2}")

dist_nltk3 = nltk.edit_distance(string5, string6)
print(f"NLTK Edit distance between '{string5}' and '{string6}': {dist_nltk3}")

# Exercise example:
str_a = "apple"
str_b = "apply"
print(f"\nExercise: Edit distance between '{str_a}' and '{str_b}': {nltk.edit_distance(str_a, str_b)}")

str_c = "analyze"
str_d = "analyse"
print(f"Exercise: Edit distance between '{str_c}' and '{str_d}': {nltk.edit_distance(str_c, str_d)}")
        

Experiment 5: N-Gram Models

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

# Sample text
text = "Natural language processing is fascinating. We study language models like n-grams."
print("Original Text:\n", text)

# Tokenize the text
tokens = word_tokenize(text.lower())
print("\nTokens:", tokens)

# 1. Bigrams (N=2)
print("\n--- Bigrams ---")
bigrams = list(ngrams(tokens, 2))
print("Generated Bigrams:", bigrams)
bigram_freq = Counter(bigrams)
print("Bigram Frequencies:", bigram_freq.most_common(5)) # Display top 5

# 2. Trigrams (N=3)
print("\n--- Trigrams ---")
trigrams = list(ngrams(tokens, 3))
print("Generated Trigrams:", trigrams)
trigram_freq = Counter(trigrams)
print("Trigram Frequencies:", trigram_freq.most_common(5)) # Display top 5

# 3. Generic N-grams (e.g., N=4)
print("\n--- Quadgrams (N=4) ---")
n_value = 4
quadgrams = list(ngrams(tokens, n_value))
print(f"Generated {n_value}-grams:", quadgrams)
quadgram_freq = Counter(quadgrams)
print(f"{n_value}-gram Frequencies:", quadgram_freq.most_common(5)) # Display top 5


# Function to generate and analyze N-grams
def analyze_ngrams(text, n):
    tokens_ng = word_tokenize(text.lower())
    if len(tokens_ng) < n:
        print(f"Text too short for {n}-grams.")
        return None, None
    
    n_grams_list = list(ngrams(tokens_ng, n))
    n_gram_frequencies = Counter(n_grams_list)
    return n_grams_list, n_gram_frequencies

print("\n--- N-gram Analysis Example (Python code to analyze) ---")
sample_sentence = "This is a sample sentence for n-gram analysis."

# Analyze Bigrams
bigrams_list, bigram_counts = analyze_ngrams(sample_sentence, 2)
if bigrams_list:
    print("Bigrams found:", bigrams_list)
    print("Bigram counts:", bigram_counts)

# Analyze Trigrams
trigrams_list, trigram_counts = analyze_ngrams(sample_sentence, 3)
if trigrams_list:
    print("\nTrigrams found:", trigrams_list)
    print("Trigram counts:", trigram_counts)

# Analyze N-gram (e.g., 1-gram or unigram)
unigrams_list, unigram_counts = analyze_ngrams(sample_sentence, 1)
if unigrams_list:
    print("\nUnigrams found:", unigrams_list) # These are essentially the tokens
    print("Unigram counts:", unigram_counts)
        

Experiment 6: Cosine for Measuring Similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Sample documents
doc1 = "The quick brown fox jumps over the lazy dog."
doc2 = "A fast brown fox leaps above a sleepy dog."
doc3 = "The weather is sunny today."
doc_list = [doc1, doc2, doc3]

print("Documents:")
for i, doc in enumerate(doc_list):
    print(f"Doc {i+1}: {doc}")

# 1. Vectorize the documents using TF-IDF
# This converts text into numerical vectors that can be used for cosine similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(doc_list)

print("\nTF-IDF Matrix Shape:", tfidf_matrix.shape)

# 2. Calculate Cosine Similarity
# cosine_similarity calculates similarity between all pairs of documents

# Similarity between doc1 and doc2
# tfidf_matrix[0:1] is the vector for doc1
# tfidf_matrix[1:2] is the vector for doc2
cosine_sim_1_2 = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(f"\nCosine Similarity between Doc 1 and Doc 2: {cosine_sim_1_2[0][0]:.4f}")

# Similarity between doc1 and doc3
cosine_sim_1_3 = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[2:3])
print(f"Cosine Similarity between Doc 1 and Doc 3: {cosine_sim_1_3[0][0]:.4f}")

# Similarity between doc2 and doc3
cosine_sim_2_3 = cosine_similarity(tfidf_matrix[1:2], tfidf_matrix[2:3])
print(f"Cosine Similarity between Doc 2 and Doc 3: {cosine_sim_2_3[0][0]:.4f}")

# Calculate the full similarity matrix
# This compares every document with every other document (including itself)
cosine_sim_matrix = cosine_similarity(tfidf_matrix)
print("\nFull Cosine Similarity Matrix:\n", cosine_sim_matrix)
print("(Row/Col 0 is Doc1, 1 is Doc2, 2 is Doc3)")

# Exercise: Develop a python code to perform the task
text_data_set = [
    "Natural language processing is fun.",
    "NLP techniques are useful for text analysis.",
    "Machine learning and deep learning drive NLP advancements."
]

print("\n--- Exercise with new text data set ---")
print("Text Data Set:")
for i, text in enumerate(text_data_set):
    print(f"Text {i+1}: {text}")

tfidf_matrix_exercise = vectorizer.fit_transform(text_data_set)
cosine_sim_matrix_exercise = cosine_similarity(tfidf_matrix_exercise)

print("\nCosine Similarity Matrix for Exercise Data Set:\n", cosine_sim_matrix_exercise)
print(f"Similarity between Text 1 and Text 2: {cosine_sim_matrix_exercise[0,1]:.4f}")
print(f"Similarity between Text 1 and Text 3: {cosine_sim_matrix_exercise[0,2]:.4f}")
print(f"Similarity between Text 2 and Text 3: {cosine_sim_matrix_exercise[1,2]:.4f}")
        

Experiment 7: Part of Speech Tagging using Hidden Markov Model and Viterbi Algorithm

import nltk
from nltk.tokenize import word_tokenize

# Ensure 'averaged_perceptron_tagger' is downloaded
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')

# Sample text
text = "The quick brown fox jumps over the lazy dog. He is very energetic."
print("Original Text:\n", text)

# Tokenize the text
tokens = word_tokenize(text)
print("\nTokens:", tokens)

# Perform POS Tagging
# NLTK's `pos_tag` uses the averaged PerceptronTagger by default, a strong sequence tagger.
# It is not an HMM with Viterbi decoding coded line by line, but it is sufficient for an
# assignment demonstrating POS tagging; HMMs are a foundational concept behind such taggers.
pos_tags_nltk = nltk.pos_tag(tokens)

print("\n--- Part of Speech Tagging (using NLTK's default tagger) ---")
print("POS Tags:", pos_tags_nltk)

print("\n--- Exercise: POS tagging based on Hidden Markov Model and Viterbi algorithm ---")
# The aim is to implement POS tagging. NLTK's `pos_tag` is a practical way.
# If you need to show the *use* of HMM/Viterbi rather than *implement* them from scratch,
# referencing that `nltk.pos_tag` or similar library functions often rely on these
# underlying principles (or advanced versions like MEMM, CRF) is usually acceptable.

exercise_sentence = "The cat sat on the mat with a gentle thud."
exercise_tokens = word_tokenize(exercise_sentence)
exercise_pos_tags = nltk.pos_tag(exercise_tokens)

print("\nPOS tagging for exercise sentence: '", exercise_sentence, "'")
print(exercise_pos_tags)

# Example of what the tags mean:
# NN: Noun, singular or mass
# DT: Determiner
# VBZ: Verb, 3rd person singular present
# IN: Preposition or subordinating conjunction
# JJ: Adjective
# NNS: Noun, plural
# .: Punctuation
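
# Hedged sketch: an actual HMM tagger trained with NLTK, whose tag() method uses
# Viterbi decoding. Assumes the 'treebank' corpus is available
# (nltk.download('treebank')); unseen words may be tagged unreliably.
from nltk.corpus import treebank
from nltk.tag import hmm

train_sents = treebank.tagged_sents()[:3000]  # small training subset for speed
hmm_tagger = hmm.HiddenMarkovModelTrainer().train_supervised(train_sents)
print("\nHMM/Viterbi POS tags (illustrative):", hmm_tagger.tag(word_tokenize("The cat sat on the mat.")))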
        

Experiment 8: Probabilistic Context Free Grammar (PCFG)

import nltk
from nltk.grammar import PCFG
from nltk.parse import pchart # probabilistic chart parsers (e.g., InsideChartParser)

# Define a PCFG
# Rules are of the form: NonTerminal -> Expansion [Probability]
# Sum of probabilities for all productions from the same non-terminal must be 1.0
pcfg_grammar_str = """
    S -> NP VP [1.0]
    VP -> V NP [0.6] | V PP [0.4]
    PP -> P NP [1.0]
    V -> 'saw' [0.5] | 'ate' [0.3] | 'walked' [0.2]
    NP -> 'John' [0.2] | 'Mary' [0.2] | Det N [0.4] | Det N PP [0.2]
    Det -> 'a' [0.5] | 'the' [0.5]
    N -> 'man' [0.5] | 'dog' [0.3] | 'telescope' [0.2]
    P -> 'with' [0.6] | 'in' [0.4]
"""

print("--- Probabilistic Context Free Grammar (PCFG) ---")
try:
    pcfg_grammar = PCFG.fromstring(pcfg_grammar_str)
    print("\nPCFG Grammar:")
    for prod in pcfg_grammar.productions():
        print(prod)

    # Create a probabilistic parser (e.g., Probabilistic Chart Parser)
    # NLTK's ViterbiParser can also be used with PCFGs.
    parser = pchart.InsideChartParser(pcfg_grammar)
    # Alternative: from nltk.parse import ViterbiParser
    # parser = ViterbiParser(pcfg_grammar)


    print("\n--- Parsing a sentence with PCFG ---")
    sentence_str = "John saw a man with a telescope"
    tokens = sentence_str.split() # grammar terminals are case-sensitive ('John', 'Mary'), so do not lowercase
    print("Sentence to parse:", tokens)

    # Parse the sentence
    # The parser will find the most probable parse(s).
    # InsideChartParser enumerates parse trees ordered by decreasing probability;
    # ViterbiParser returns only the single most probable parse tree.
    
    # Using ViterbiParser for a single most probable tree
    from nltk.parse import ViterbiParser
    viterbi_parser = ViterbiParser(pcfg_grammar)
    parses = list(viterbi_parser.parse(tokens))

    if parses:
        print("\nMost probable parse tree(s):")
        for tree in parses:
            print(tree)
            tree.pretty_print()
            print(f"Probability of this parse: {tree.prob():.6e}")
    else:
        print("No parse found for the sentence with this grammar.")

    # Example of analyzing another sentence
    sentence2_str = "Mary ate the dog"
    tokens2 = sentence2_str.split()
    print("\nParsing another sentence:", tokens2)
    parses2 = list(viterbi_parser.parse(tokens2))
    if parses2:
        for tree in parses2:
            tree.pretty_print()
            print(f"Probability: {tree.prob():.6e}")
    else:
        print("No parse found for the second sentence.")

except ValueError as e:
    print(f"Error creating PCFG: {e}")
    print("Ensure probabilities for productions from the same non-terminal sum to 1.0.")
    print("And ensure all terminals are quoted.")

print("\n--- Exercise: To develop a python code to analyse PCFG. ---")
print("The code above demonstrates defining a PCFG, parsing sentences,")
print("and obtaining the most probable parse trees with their probabilities.")
        

Experiment 9: Chunking

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Sample sentence
sentence = "The little yellow dog barked at the big cat."
print("Original Sentence:\n", sentence)

# Tokenize and POS tag the sentence
tokens = word_tokenize(sentence)
tagged_tokens = pos_tag(tokens)
print("\nPOS Tagged Tokens:\n", tagged_tokens)

# Define a chunking grammar using regular expressions
# This grammar defines patterns for Noun Phrases (NP)
# <DT>?   : Optional Determiner
# <JJ>*   : Zero or more Adjectives
# <NN.*>+ : One or more Nouns (any type: NN, NNS, NNP, NNPS)
# We can define multiple chunk patterns.

# A simple NP chunker:
grammar_np = r"""
  NP: {<DT>?<JJ>*<NN.*>+}   # Chunk sequences of DT, JJ, NN
"""

# A more complex grammar for NPs, PPs, and VPs
grammar_vp_np = r"""
  NP: {<DT>?<JJ>*<NN.*>+}   # Chunk Noun Phrases
  PP: {<IN><NP>}            # Chunk Prepositional Phrases (prep followed by NP)
  VP: {<VB.*><NP|PP>*}      # Chunk Verb Phrases (verb followed by NP or PP)
"""

print("\n--- Chunking with a simple Noun Phrase (NP) grammar ---")
# Create a RegexpParser with the grammar
chunk_parser_np = nltk.RegexpParser(grammar_np)
# Parse the tagged tokens to get chunks
tree_np = chunk_parser_np.parse(tagged_tokens)
print("Chunk Tree (NP only):\n", tree_np)

print("\nExtracted Noun Phrases (NPs):")
for subtree in tree_np.subtrees():
    if subtree.label() == 'NP':
        print(subtree)
        print(" ".join(token for token, pos in subtree.leaves()))

print("\n--- Chunking with NP, PP, and VP grammar ---")
chunk_parser_vp_np = nltk.RegexpParser(grammar_vp_np)
tree_vp_np = chunk_parser_vp_np.parse(tagged_tokens)
print("Chunk Tree (NP, PP, VP):\n", tree_vp_np)

print("\nExtracted Phrases based on VP/NP/PP grammar:")
for subtree in tree_vp_np.subtrees():
    if subtree.label() in ['NP', 'VP', 'PP']:
        print(f"{subtree.label()}: {' '.join(token for token, pos in subtree.leaves())}")

# Exercise: To develop a Python code to extract meaningful phrases (chunks) from unstructured text data.
print("\n--- Exercise Example ---")
exercise_text = "Natural Language Processing enables computers to understand human speech and text. Machine learning algorithms are often used."
exercise_tokens = word_tokenize(exercise_text)
exercise_tagged_tokens = pos_tag(exercise_tokens)
print("Tagged Exercise Text:", exercise_tagged_tokens)

# Using the NP chunker
exercise_tree_np = chunk_parser_np.parse(exercise_tagged_tokens)
print("\nNP Chunks from exercise text:")
for subtree in exercise_tree_np.subtrees(filter=lambda t: t.label() == 'NP'):
    print(" ".join(token for token, pos in subtree.leaves()))

# Using the more complex chunker
exercise_tree_vp_np = chunk_parser_vp_np.parse(exercise_tagged_tokens)
print("\nNP, PP, VP Chunks from exercise text:")
for subtree in exercise_tree_vp_np.subtrees(filter=lambda t: t.label() in ['NP', 'VP', 'PP']):
    print(f"{subtree.label()}: {' '.join(token for token, pos in subtree.leaves())}")
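
# Hedged sketch: chunk trees can also be flattened to IOB (inside/outside/begin) tags
# with nltk.chunk.tree2conlltags, which is convenient for downstream processing.
from nltk.chunk import tree2conlltags

print("\nIOB tags for the exercise text (NP chunker):")
print(tree2conlltags(exercise_tree_np))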

Experiment 10: Named Entity Recognition (NER)

import spacy

# Load the pre-trained spaCy model
# Make sure you've downloaded it: python -m spacy download en_core_web_sm
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading 'en_core_web_sm' model...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


# Sample text
text1 = "Apple is looking at buying U.K. startup for $1 billion in London."
text2 = "Sundar Pichai, the CEO of Google, announced new AI features at the conference in New York City on May 10th, 2024."

print("--- Named Entity Recognition (NER) using spaCy ---")

def perform_ner(text_to_analyze):
    print(f"\nAnalyzing text: \"{text_to_analyze}\"")
    doc = nlp(text_to_analyze)
    
    if not doc.ents:
        print("No named entities found.")
        return

    print("Named Entities Found:")
    for ent in doc.ents:
        print(f"  Text: {ent.text}, Label: {ent.label_}, Explanation: {spacy.explain(ent.label_)}")

# Perform NER on the sample texts
perform_ner(text1)
perform_ner(text2)

# Exercise: To develop a python code to perform NER tasks
print("\n--- Exercise Example ---")
exercise_text = """
Microsoft Corporation, based in Redmond, Washington, was co-founded by Bill Gates and Paul Allen on April 4, 1975.
Last year, they reported revenues of over $200 billion.
The current CEO is Satya Nadella. They have offices in many countries, including India.
"""
perform_ner(exercise_text)
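
# Hedged sketch: summarizing entity labels with collections.Counter to see which
# entity types dominate the exercise text.
from collections import Counter

exercise_doc = nlp(exercise_text)
label_counts = Counter(ent.label_ for ent in exercise_doc.ents)
print("\nEntity label counts for the exercise text:", dict(label_counts))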