"""Generate random text from ``sample.txt`` with simple statistical models.

Five samples are printed, in this order:

1. letters drawn independently by corpus frequency,
2. a letter-level Markov chain (each letter conditioned on the previous one),
3. words drawn independently by corpus frequency,
4. a word-level Markov chain (bigram model),
5. a word-level trigram chain (each word conditioned on the previous two).

Usage: ``python <script> [sentence_length]``.  The optional argument sets the
number of words in the word-based samples and defaults to 20.
"""
import random
import re
import sys
from collections import Counter, defaultdict

# Number of words per generated sentence when no valid argument is given.
DEFAULT_SENTENCE_LENGTH = 20
# Corpus file, read relative to the current working directory.
CORPUS_PATH = 'sample.txt'
# Number of letters drawn for the letter-level samples.
LETTER_SAMPLE_SIZE = 50


def parse_sentence_length(argv, default=DEFAULT_SENTENCE_LENGTH):
    """Return the sentence length requested on the command line.

    Args:
        argv: Argument list shaped like ``sys.argv`` (program name first).
        default: Value returned when no usable argument is present.

    Returns:
        ``int(argv[1])`` when it parses as an integer >= 1, else ``default``.
    """
    if len(argv) < 2:
        return default
    try:
        value = int(argv[1])
    except ValueError:
        return default
    # A zero or negative length cannot describe a sentence; treat it like any
    # other unusable argument instead of producing inconsistent output sizes.
    return value if value >= 1 else default


def clean_text(raw):
    """Lower-case ``raw``, keep only ``a``-``z`` and whitespace, collapse spaces.

    Returns:
        The cleaned text with every whitespace run replaced by one space and
        no leading or trailing whitespace.
    """
    letters_and_spaces = re.sub(r'[^a-z\s]', '', raw.lower())
    return re.sub(r'\s+', ' ', letters_and_spaces).strip()


def weighted_choice(counts, rng=random):
    """Pick one key of ``counts`` with probability proportional to its count.

    Args:
        counts: Mapping from item to a positive count.
        rng: Object providing ``choices`` (the ``random`` module or a
            ``random.Random`` instance).

    Raises:
        ValueError: If ``counts`` is empty.
    """
    if not counts:
        raise ValueError("cannot sample from an empty frequency table")
    return rng.choices(list(counts), weights=list(counts.values()))[0]


def build_follow_counts(tokens, order=1):
    """Count which token follows each run of ``order`` consecutive tokens.

    Args:
        tokens: Sequence of tokens (letters or words).
        order: Context size; 1 builds a bigram table, 2 a trigram table.

    Returns:
        A ``defaultdict(Counter)`` mapping context -> Counter of followers.
        For ``order == 1`` the context key is the token itself; for larger
        orders it is a tuple of ``order`` tokens.
    """
    follow_counts = defaultdict(Counter)
    # Zipping the sequence against shifted copies of itself yields every
    # window of ``order + 1`` consecutive tokens.
    windows = zip(*(tokens[offset:] for offset in range(order + 1)))
    for *context, follower in windows:
        key = context[0] if order == 1 else tuple(context)
        follow_counts[key][follower] += 1
    return follow_counts


def sample_next(follow_counts, context, fallback_counts, rng=random):
    """Sample the token that follows ``context``.

    When ``context`` has no recorded follower, a key of ``fallback_counts``
    is chosen uniformly at random so generation never gets stuck.
    """
    # ``.get`` avoids inserting empty entries into a defaultdict table.
    followers = follow_counts.get(context)
    if not followers:
        return rng.choice(list(fallback_counts))
    return weighted_choice(followers, rng)


def generate_chain(seed, length, follow_counts, fallback_counts,
                   order=1, rng=random):
    """Extend ``seed`` into a Markov chain of exactly ``length`` tokens.

    Args:
        seed: Non-empty list of starting tokens (at least ``order`` of them
            when ``length`` exceeds the seed length).
        length: Number of tokens to return; values <= 0 give an empty list.
        follow_counts: Table produced by ``build_follow_counts`` with the
            same ``order``.
        fallback_counts: Mapping whose keys are used when a context has no
            recorded follower.
        order: Number of trailing tokens used as context.
        rng: Random source, see ``weighted_choice``.

    Returns:
        A list of ``length`` tokens beginning with (a prefix of) ``seed``.
    """
    # Truncate so a seed longer than the requested length (e.g. a trigram
    # pair when one word is requested) cannot overshoot.
    chain = list(seed)[:max(length, 0)]
    while len(chain) < length:
        context = chain[-1] if order == 1 else tuple(chain[-order:])
        chain.append(sample_next(follow_counts, context, fallback_counts, rng))
    return chain


def pick_trigram_seed(words, rng=random):
    """Return two consecutive corpus words to start a trigram chain.

    For corpora of three or more words the pair is chosen so that it is
    followed by at least one word.  Shorter corpora are handled instead of
    raising: two words give that pair, one word is repeated.

    Args:
        words: Non-empty list of corpus words.
    """
    if len(words) < 2:
        return [words[0]] * 2
    start = rng.randrange(max(len(words) - 2, 1))
    return words[start:start + 2]


def main(argv=None):
    """Read the corpus, build every model and print one sample from each."""
    argv = sys.argv if argv is None else argv
    sentence_length = parse_sentence_length(argv)

    # -------------------------
    # Load and clean text
    # -------------------------
    try:
        with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
            text = clean_text(f.read())
    except OSError as exc:
        sys.exit(f"Could not read {CORPUS_PATH}: {exc}")

    letters = [ch for ch in text if ch.isalpha()]
    words = text.split()
    if not letters:
        # Every model below samples from the corpus; with nothing to sample
        # from, stop here with a clear message.
        sys.exit(f"{CORPUS_PATH} contains no letters a-z; nothing to generate.")

    # -------------------------
    # Letter frequency model
    # -------------------------
    letter_counts = Counter(letters)
    print("Random letters (simple frequency):")
    print(''.join(weighted_choice(letter_counts)
                  for _ in range(LETTER_SAMPLE_SIZE)))
    print()

    # -------------------------
    # Letter-based Markov chain (bigram)
    # -------------------------
    letter_follow_counts = build_follow_counts(letters)
    # One random starting letter plus LETTER_SAMPLE_SIZE sampled followers.
    generated_letters = generate_chain(
        [random.choice(letters)], LETTER_SAMPLE_SIZE + 1,
        letter_follow_counts, letter_counts)
    print("Letter-based Markov chain text:")
    print(''.join(generated_letters))
    print()

    # -------------------------
    # Word frequency model
    # -------------------------
    word_counts = Counter(words)
    print("Random words (simple frequency):")
    print(' '.join(weighted_choice(word_counts)
                   for _ in range(sentence_length)))
    print()

    # -------------------------
    # Word-based Markov chain (bigram)
    # -------------------------
    word_follow_counts = build_follow_counts(words)
    generated_sentence = generate_chain(
        [random.choice(words)], sentence_length,
        word_follow_counts, word_counts)
    print("Word-based Markov chain sentence:")
    print(' '.join(generated_sentence))
    print()

    # -------------------------
    # Trigram model (word-based)
    # -------------------------
    trigram_counts = build_follow_counts(words, order=2)
    generated_trigram_sentence = generate_chain(
        pick_trigram_seed(words), sentence_length,
        trigram_counts, word_counts, order=2)
    print("Trigram-based sentence:")
    print(' '.join(generated_trigram_sentence))


if __name__ == "__main__":
    main()