import re
from collections import Counter, defaultdict
import random
import sys

# ----------------------------------------------------
# Optional command-line argument for sentence length
# If no argument is provided, default to 20.
# ----------------------------------------------------
# Take the sentence length from the first command-line argument; a missing
# or non-integer argument leaves the default of 20.
try:
    sentence_length = int(sys.argv[1])
except (IndexError, ValueError):
    sentence_length = 20

# -------------------------
# Load and Clean Text
# -------------------------
with open('sample.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Normalise the corpus: lowercase it, delete every character that is not
# a-z or whitespace, then squeeze whitespace runs into single spaces and
# trim both ends.
text = re.sub(r'[^a-z\s]', '', text.lower())
text = re.sub(r'\s+', ' ', text)
text = text.strip()

# -------------------------
# LETTER FREQUENCY MODEL
# -------------------------
# Keep only alphabetic characters (the separating spaces are skipped) and
# tally how often each one occurs.
letters = list(filter(str.isalpha, text))
letter_counts = Counter(letters)
total_letters = len(letters)

def sample_letter(counts=None):
    """Draw one letter at random, weighted by its occurrence count.

    Args:
        counts: Optional mapping of letter -> occurrence count to sample
            from. Defaults to the module-level ``letter_counts``.

    Returns:
        One key of ``counts``. A key with count ``c`` is chosen with
        probability ``c / sum(counts.values())``.

    Raises:
        ValueError: If ``counts`` is empty, i.e. there is nothing to sample.
    """
    if counts is None:
        counts = letter_counts
    if not counts:
        raise ValueError("cannot sample a letter from an empty frequency table")
    # random.choices performs the weighted draw directly and returns a list.
    return random.choices(list(counts), weights=list(counts.values()))[0]

# Show 50 independent draws from the letter-frequency model, followed by a
# blank separator line.
print("Random letters (simple frequency):")
print(''.join([sample_letter() for _ in range(50)]), end='\n\n')

# -------------------------
# LETTER-BASED MARKOV CHAIN (BIGRAM)
# -------------------------
# Build letter -> next-letter counts
letter_follow_counts = defaultdict(Counter)
# zip pairs every entry of `letters` with the entry that comes right after it.
for current_letter, next_letter in zip(letters, letters[1:]):
    letter_follow_counts[current_letter][next_letter] += 1

def sample_next_letter(prev_letter, follow_counts=None, fallback_letters=None):
    """Pick the letter that follows ``prev_letter`` in the bigram model.

    Args:
        prev_letter: The letter the chain is currently on.
        follow_counts: Optional mapping of letter -> {next letter: count}.
            Defaults to the module-level ``letter_follow_counts``.
        fallback_letters: Optional iterable of letters to choose from
            uniformly when ``prev_letter`` has no recorded successor.
            Defaults to the keys of the module-level ``letter_counts``.

    Returns:
        A successor of ``prev_letter`` drawn in proportion to its count, or
        a uniformly chosen fallback letter when no successor is recorded.
    """
    if follow_counts is None:
        follow_counts = letter_follow_counts
    # .get() never inserts a key, so probing an unseen letter leaves a
    # defaultdict table untouched.
    successors = follow_counts.get(prev_letter)
    if not successors:
        if fallback_letters is None:
            fallback_letters = letter_counts
        return random.choice(list(fallback_letters))
    # random.choices performs the weighted draw directly and returns a list.
    return random.choices(list(successors), weights=list(successors.values()))[0]

# Start on a random corpus letter, then take 50 Markov steps, each sampled
# from the letter produced just before it (51 letters in total).
generated_letters = [random.choice(letters)]
while len(generated_letters) < 51:
    generated_letters.append(sample_next_letter(generated_letters[-1]))
current = generated_letters[-1]

print("Letter-based Markov chain text:")
print(''.join(generated_letters), end='\n\n')

# -------------------------
# WORD FREQUENCY MODEL
# -------------------------
# Tokenise on whitespace and tally every distinct word.
words = text.split()
total_words = len(words)
word_counts = Counter(words)

def sample_word(counts=None):
    """Draw one word at random, weighted by its occurrence count.

    Args:
        counts: Optional mapping of word -> occurrence count to sample
            from. Defaults to the module-level ``word_counts``.

    Returns:
        One key of ``counts``. A key with count ``c`` is chosen with
        probability ``c / sum(counts.values())``.

    Raises:
        ValueError: If ``counts`` is empty, i.e. there is nothing to sample.
    """
    if counts is None:
        counts = word_counts
    if not counts:
        raise ValueError("cannot sample a word from an empty frequency table")
    # random.choices performs the weighted draw directly and returns a list.
    return random.choices(list(counts), weights=list(counts.values()))[0]

# Show `sentence_length` independent draws from the word-frequency model,
# followed by a blank separator line.
print("Random words (simple frequency):")
print(' '.join([sample_word() for _ in range(sentence_length)]), end='\n\n')

# -------------------------
# WORD-BASED MARKOV CHAIN (BIGRAM)
# -------------------------
# For every word, count which words come directly after it in the corpus.
word_follow_counts = defaultdict(Counter)
for current_word, next_word in zip(words, words[1:]):
    word_follow_counts[current_word][next_word] += 1

def sample_next_word(prev_word, follow_counts=None, fallback_words=None):
    """Pick the word that follows ``prev_word`` in the bigram model.

    Args:
        prev_word: The word the chain is currently on.
        follow_counts: Optional mapping of word -> {next word: count}.
            Defaults to the module-level ``word_follow_counts``.
        fallback_words: Optional iterable of words to choose from uniformly
            when ``prev_word`` has no recorded successor. Defaults to the
            keys of the module-level ``word_counts``.

    Returns:
        A successor of ``prev_word`` drawn in proportion to its count, or a
        uniformly chosen fallback word when no successor is recorded.
    """
    if follow_counts is None:
        follow_counts = word_follow_counts
    # .get() never inserts a key, so probing an unseen word leaves a
    # defaultdict table untouched.
    successors = follow_counts.get(prev_word)
    if not successors:
        if fallback_words is None:
            fallback_words = word_counts
        return random.choice(list(fallback_words))
    # random.choices performs the weighted draw directly and returns a list.
    return random.choices(list(successors), weights=list(successors.values()))[0]

# Build a sentence of exactly `sentence_length` words: a random corpus word
# followed by successive bigram samples. A non-positive length produces an
# empty sentence, matching the word-frequency model, instead of always
# emitting the seed word.
generated_sentence = []
if sentence_length > 0:
    current = random.choice(words)
    generated_sentence.append(current)
    for _ in range(sentence_length - 1):
        current = sample_next_word(current)
        generated_sentence.append(current)

print("Word-based Markov chain sentence:")
print(' '.join(generated_sentence))
print()

# -------------------------
# TRIGRAM MODEL (WORD-BASED)
# -------------------------
# Map every pair of consecutive words to a tally of the words that come
# immediately after that pair.
trigram_counts = defaultdict(Counter)
for first_word, second_word, next_word in zip(words, words[1:], words[2:]):
    trigram_counts[(first_word, second_word)][next_word] += 1

def sample_trigram_word(prev_two_words, follow_counts=None, fallback_words=None):
    """Pick the word that follows the pair ``prev_two_words`` in the trigram model.

    Args:
        prev_two_words: Tuple of the two most recent words.
        follow_counts: Optional mapping of word pair -> {next word: count}.
            Defaults to the module-level ``trigram_counts``.
        fallback_words: Optional iterable of words to choose from uniformly
            when the pair has no recorded successor. Defaults to the keys
            of the module-level ``word_counts``.

    Returns:
        A successor of the pair drawn in proportion to its count, or a
        uniformly chosen fallback word when no successor is recorded.
    """
    if follow_counts is None:
        follow_counts = trigram_counts
    # .get() never inserts a key, so probing an unseen pair leaves a
    # defaultdict table untouched.
    successors = follow_counts.get(prev_two_words)
    if not successors:
        if fallback_words is None:
            fallback_words = word_counts
        return random.choice(list(fallback_words))
    # random.choices performs the weighted draw directly and returns a list.
    return random.choices(list(successors), weights=list(successors.values()))[0]

# Seed the sentence with two consecutive corpus words. Capping the start at
# len(words) - 3 keeps the seed among the pairs recorded in trigram_counts.
# With fewer than three words that range is empty and random.randint would
# raise ValueError, so such a corpus simply starts at index 0.
start_index = random.randint(0, len(words) - 3) if len(words) >= 3 else 0
current_pair = tuple(words[start_index:start_index + 2])
generated_trigram_sentence = list(current_pair)
for _ in range(sentence_length - 2):
    next_w = sample_trigram_word(current_pair)
    generated_trigram_sentence.append(next_w)
    # Slide the context window to the two most recent words.
    current_pair = tuple(generated_trigram_sentence[-2:])

# The two-word seed is longer than a requested length below 2; trim it so
# the output never exceeds `sentence_length` words.
del generated_trigram_sentence[max(sentence_length, 0):]

print("Trigram-based sentence:")
print(' '.join(generated_trigram_sentence))