import sys
import json
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

#Set up a tokenizer that captures only lowercase letters and spaces
#This requires that input has been preprocessed to lowercase all letters
TOKENIZER = RegexpTokenizer("[a-z ]")
def count_ngrams(input_fp, frequencies, order, buffer_size=1024):
    '''Read the text content of a file and keep a running count of how often
    each N-gram (sequence of N characters) appears.

    Arguments:
        input_fp -- file pointer with input text
        frequencies -- mapping from each N-gram to its counted frequency
        order -- The N in each N-gram (i.e. number of items)
        buffer_size -- incremental quantity of text to be read at a time,
                       in bytes (1024 if not otherwise specified)

    Returns:
        nothing
    '''
    #Read the first chunk of text, and set all letters to lowercase
    text = input_fp.read(buffer_size).lower()
    #Loop over the file while there is text to read
    while text:
        #This step is needed to collapse runs of space characters into one
        text = ' '.join(text.split())
        spans = TOKENIZER.span_tokenize(text)
        tokens = (text[begin:end] for (begin, end) in spans)
        for ngram in ngrams(tokens, order):
            #Increment the count for the N-gram. FreqDist automatically
            #handles any N-gram not seen before. The join expression turns
            #N separate single-character strings into one N-character string
            if ' ' not in ''.join(ngram):
                frequencies[''.join(ngram)] += 1
        #Read the next chunk of text, and set all letters to lowercase
        text = input_fp.read(buffer_size).lower()
    return
if __name__ == '__main__':
    #Initialize the mapping
    frequencies = FreqDist()
    #The order of the N-grams is the first command-line argument
    ngram_order = int(sys.argv[1])
    #Pull the input data from the console
    count_ngrams(sys.stdin, frequencies, ngram_order)
    outputfp = open(sys.argv[2], 'w')
    json.dump(dict(frequencies), outputfp)
    print('Stored frequencies of {} encountered N-grams.'.format(len(frequencies)))
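To try the counter without piping in a whole corpus, you can feed it a small in-memory stream, since count_ngrams only ever calls read() on its input. This is a minimal sketch, assuming count_ngrams and its imports above are in scope; the sample text is invented:

import io
from nltk.probability import FreqDist

#Any file-like text stream works in place of sys.stdin
sample = io.StringIO('the theremin themes')
freqs = FreqDist()
count_ngrams(sample, freqs, 3)
#Expect the trigram 'the' to lead with a count of 3
print(freqs.most_common(3))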
import sys
import json
import string
import random
#Population is all the possible items that can be generated
population = ' ' + string.ascii_lowercase
def preprocess_frequencies(frequencies, order):
    '''Compile simple mapping from N-grams to frequencies into data structures
    to help compute the probability of state transitions to complete an N-gram

    Arguments:
        frequencies -- mapping from N-gram to frequency recorded in the
                       training text
        order -- The N in each N-gram (i.e. number of items)

    Returns:
        sequencer -- set of mappings from each N-1 sequence to the frequency
                     of possible items completing it
        item_freqs -- individual frequency of each item in the training text
    '''
    sequencer = {}
    item_freqs = {}
    for ngram in frequencies:
        #Separate the N-1 lead of each N-gram from its item completions
        freq = frequencies[ngram]
        lead = ngram[:-1]
        final = ngram[-1]
        sequencer.setdefault(lead, {})
        sequencer[lead][final] = freq
        #Accumulate the overall frequency of each item
        for c in ngram:
            item_freqs.setdefault(c, 0)
            item_freqs[c] += freq
    return sequencer, item_freqs
def generate_letters(sequencer, item_freqs, length, order):
    '''Generate text based on probabilities derived from statistics for
    initializing and continuing sequences of letters

    Arguments:
        sequencer -- mapping from each leading sequence to frequencies of
                     the next letter
        item_freqs -- mapping from each item to its overall frequency,
                      regardless of sequence
        length -- approximate number of characters to generate before ending
                  the program
        order -- The N in each N-gram (i.e. number of items)

    Returns:
        nothing
    '''
    #The lead is the initial part of the N-gram to be completed, of length N-1,
    #containing the last N-1 items produced
    lead = ''
    #Keep track of how many items have been generated
    generated_count = 0
    #Turn item frequencies into weights for selection probability
    item_weights = [ item_freqs.get(c, 0) for c in population ]
    while generated_count < length:
        #This condition will be true until the initial lead N-gram is
        #constructed. It will also be true if we get to a dead end where
        #there are no stats for the next item from the current lead
        if lead not in sequencer:
            #random.choices returns a list of length 1. Extract that one item.
            chosen = random.choices(population, weights=item_weights)[0]
            #end='' prevents a newline from being printed after each letter
            #flush=True forces output to be displayed right away, not buffered
            print(chosen, end='', flush=True)
            #If needed to accommodate the new item, clip the first item from
            #the lead, so that the lead never grows beyond N-1 items
            if len(lead) == order - 1:
                lead = lead[1:]
            #Tack on the new item
            lead += chosen
            generated_count += 1
        else:
            freq = sequencer[lead]
            weights = [ freq.get(c, 0) for c in population ]
            chosen = random.choices(population, weights=weights)[0]
            print(chosen, end='', flush=True)
            #Clip the first item from the lead and tack on the new item
            lead = lead[1:] + chosen
            generated_count += 1
    return
if __name__ == '__main__':
    #File with N-gram frequencies is the first argument
    raw_freq_fp = open(sys.argv[1])
    length = int(sys.argv[2])
    raw_freqs = json.load(raw_freq_fp)

    #Figure out the N-gram order. Just pull the first N-gram and check its length
    order = len(next(iter(raw_freqs)))
    sequencer, item_freqs = preprocess_frequencies(raw_freqs, order)
    generate_letters(sequencer, item_freqs, length, order)
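To build intuition for the data structures, here is a minimal sketch of preprocess_frequencies applied to a tiny, invented frequency mapping:

#Made-up 3-gram frequencies, as the counter script would have stored them
freqs = {'the': 5, 'tha': 2, 'her': 3}
sequencer, item_freqs = preprocess_frequencies(freqs, 3)
#Each N-1 lead maps to the frequencies of its possible completions
print(sequencer)   #{'th': {'e': 5, 'a': 2}, 'he': {'r': 3}}
#Each item's overall frequency, accumulated across all N-grams
print(item_freqs)  #{'t': 7, 'h': 10, 'e': 8, 'a': 2, 'r': 3}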
import sys
import json
import string
import random
POPULAR_NGRAM_COUNT = 10000
#Population is all the possible items that can be generated
population = ' ' + string.ascii_lowercase
def preprocess_frequencies(frequencies, order):
    '''Compile simple mapping from N-grams to frequencies into data structures
    to help compute the probability of state transitions to complete an N-gram

    Arguments:
        frequencies -- mapping from N-gram to frequency recorded in the
                       training text
        order -- The N in each N-gram (i.e. number of items)

    Returns:
        sequencer -- set of mappings from each N-1 sequence to the frequency
                     of possible items completing it
        popular_ngrams -- list of the most common N-grams
    '''
    sequencer = {}
    ngrams_sorted_by_freq = [
        k for k in sorted(frequencies, key=frequencies.get, reverse=True)
    ]
    popular_ngrams = ngrams_sorted_by_freq[:POPULAR_NGRAM_COUNT]
    for ngram in frequencies:
        #Separate the N-1 lead of each N-gram from its item completions
        freq = frequencies[ngram]
        lead = ngram[:-1]
        final = ngram[-1]
        sequencer.setdefault(lead, {})
        sequencer[lead][final] = freq
    return sequencer, popular_ngrams
def generate_letters(sequencer, popular_ngrams, length, order):
    '''Generate text based on probabilities derived from statistics for
    initializing and continuing sequences of letters

    Arguments:
        sequencer -- mapping from each leading sequence to frequencies of
                     the next letter
        popular_ngrams -- list of the highest frequency N-grams
        length -- approximate number of characters to generate before ending
                  the program
        order -- The N in each N-gram (i.e. number of items)

    Returns:
        nothing
    '''
    #The lead is the initial part of the N-gram to be completed, of length N-1,
    #containing the last N-1 items produced
    lead = ''
    #Keep track of how many items have been generated
    generated_count = 0
    while generated_count < length:
        #This condition will be true until the initial lead N-gram is
        #constructed. It will also be true if we get to a dead end where
        #there are no stats for the next item from the current lead
        if lead not in sequencer:
            #Pick an N-gram at random from the most popular
            reset = random.choice(popular_ngrams)
            #Drop the final item so that the lead is length N-1
            lead = reset[:-1]
            for item in lead:
                print(item, end='', flush=True)
            generated_count += len(lead)
        else:
            freq = sequencer[lead]
            weights = [ freq.get(c, 0) for c in population ]
            chosen = random.choices(population, weights=weights)[0]
            print(chosen, end='', flush=True)
            #Clip the first item from the lead and tack on the new item
            lead = lead[1:] + chosen
            generated_count += 1
    return
if __name__ == '__main__':
    #File with N-gram frequencies is the first argument
    raw_freq_fp = open(sys.argv[1])
    length = int(sys.argv[2])
    raw_freqs = json.load(raw_freq_fp)

    #Figure out the N-gram order. Just pull the first N-gram and check its length
    order = len(next(iter(raw_freqs)))
    sequencer, popular_ngrams = preprocess_frequencies(raw_freqs, order)
    generate_letters(sequencer, popular_ngrams, length, order)
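The continuation step at the heart of both generators is just a weighted random choice over the population. Here it is in isolation, a sketch with invented frequencies:

import random
import string

population = ' ' + string.ascii_lowercase
#Suppose the sequencer recorded these completions for some lead, say 'th'
freq = {'e': 5, 'a': 2}
weights = [ freq.get(c, 0) for c in population ]
#Items with zero weight are never chosen; expect 'e' about 5 times in 7
print(random.choices(population, weights=weights)[0])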
import sys
import json
from nltk.probability import FreqDist
from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer

#Set up a tokenizer that captures only words
#Requires that input has been preprocessed to lowercase all letters
TOKENIZER = RegexpTokenizer("[a-z]+")
def count_ngrams(input_fp, frequencies, order, buffer_size=1024):
    '''Read the text content of a file and keep a running count of how often
    each N-gram (sequence of N words) appears.

    Arguments:
        input_fp -- file pointer with input text
        frequencies -- mapping from each N-gram to its counted frequency
        order -- The N in each N-gram (i.e. number of items)
        buffer_size -- incremental quantity of text to be read at a time,
                       in bytes (1024 if not otherwise specified)

    Returns:
        nothing
    '''
    #Read the first chunk of text, and set all letters to lowercase
    text = input_fp.read(buffer_size).lower()
    #Loop over the file while there is text to read
    while text:
        #This step is needed to collapse runs of space characters into one
        text = ' '.join(text.split())
        spans = TOKENIZER.span_tokenize(text)
        tokens = (text[begin:end] for (begin, end) in spans)
        for ngram in ngrams(tokens, order):
            #Join the N-gram's words into a single space-separated string
            ngram_text = ' '.join(ngram)
            #Extra layer to make sure no runs of multiple spaces sneak through
            ngram_text = ' '.join(ngram_text.split())
            frequencies[ngram_text] += 1
        #Read the next chunk of text, and set all letters to lowercase
        text = input_fp.read(buffer_size).lower()
    return
if __name__ == '__main__':
    #Initialize the mapping
    frequencies = FreqDist()
    #The order of the N-grams is the first command-line argument
    ngram_order = int(sys.argv[1])
    #Pull the input data from the console
    count_ngrams(sys.stdin, frequencies, ngram_order)
    outputfp = open(sys.argv[2], 'w')
    json.dump(dict(frequencies), outputfp)
    print('Stored frequencies of {} encountered N-grams.'.format(len(frequencies)))
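As with the character version, a small in-memory stream is enough to check the word-level counter. A minimal sketch, assuming count_ngrams above is in scope; the sample text is invented:

import io
from nltk.probability import FreqDist

sample = io.StringIO('the cat sat on the cat mat')
freqs = FreqDist()
count_ngrams(sample, freqs, 2)
#The word bigram 'the cat' occurs twice in the sample
print(freqs['the cat'])  #2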
import sys
import json
import string
import random
POPULAR_NGRAM_COUNT = 10000
def preprocess_frequencies(frequencies, order):
    '''Compile simple mapping from N-grams to frequencies into data structures
    to help compute the probability of state transitions to complete an N-gram

    Arguments:
        frequencies -- mapping from N-gram to frequency recorded in the
                       training text
        order -- The N in each N-gram (i.e. number of items)

    Returns:
        sequencer -- set of mappings from each N-1 sequence to the frequency
                     of possible items completing it
        popular_ngrams -- list of the most common N-grams
        all_words -- list of all unique words that occur in the training text
    '''
    sequencer = {}
    ngrams_sorted_by_freq = [
        k for k in sorted(frequencies, key=frequencies.get, reverse=True)
    ]
    popular_ngrams = ngrams_sorted_by_freq[:POPULAR_NGRAM_COUNT]
    all_words = set()
    for ngram in frequencies:
        #Separate the N-1 lead of each N-gram from its item completions
        freq = frequencies[ngram]
        words = ngram.split()
        lead = words[:-1]
        final = words[-1]
        #Lists are mutable, and thus unhashable, so we must convert the lead
        #to a tuple before using it as a mapping key
        sequencer.setdefault(tuple(lead), {})
        sequencer[tuple(lead)][final] = freq
        for word in words:
            all_words.add(word)
    #We used the set data structure to keep the words unique,
    #but the way we need to use it, we must convert it to a list
    all_words = list(all_words)
    return sequencer, popular_ngrams, all_words
def generate_letters(sequencer, popular_ngrams, all_words, length, order):
    '''Generate text based on probabilities derived from statistics for
    initializing and continuing sequences of words

    Arguments:
        sequencer -- mapping from each leading sequence to frequencies of
                     the next word
        popular_ngrams -- list of the highest frequency N-grams
        all_words -- list of all unique words in the training text
        length -- approximate number of words to generate before ending
                  the program
        order -- The N in each N-gram (i.e. number of items)

    Returns:
        nothing
    '''
    #The lead is the initial part of the N-gram to be completed, of length N-1,
    #containing the last N-1 items produced
    lead = []
    #Keep track of how many items have been generated
    generated_count = 0
    while generated_count < length:
        #This condition will be true until the initial lead N-gram is
        #constructed. It will also be true if we get to a dead end where
        #there are no stats for the next item from the current lead
        if tuple(lead) not in sequencer:
            #Pick an N-gram at random from the most popular
            reset = random.choice(popular_ngrams)
            #Split it up into a list to serve as a lead
            reset = reset.split()
            #Drop the final item so that the lead is length N-1
            lead = reset[:-1]
            for item in lead:
                #Note we now print a space between items, which are words
                print(item, end=' ', flush=True)
            generated_count += len(lead)
        else:
            freq = sequencer[tuple(lead)]
            weights = [ freq.get(w, 0) for w in all_words ]
            chosen = random.choices(all_words, weights=weights)[0]
            print(chosen, end=' ', flush=True)
            #Clip the first item from the lead and tack on the new item
            lead = lead[1:] + [chosen]
            generated_count += 1
    return
if __name__ == '__main__':
    #File with N-gram frequencies is the first argument
    raw_freq_fp = open(sys.argv[1])
    length = int(sys.argv[2])
    raw_freqs = json.load(raw_freq_fp)

    #Figure out the N-gram order. Just pull the first N-gram and count its words
    order = len(next(iter(raw_freqs)).split())
    sequencer, popular_ngrams, all_words = preprocess_frequencies(raw_freqs, order)
    generate_letters(sequencer, popular_ngrams, all_words, length, order)
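Here is a minimal sketch of the word-level sequencer, with invented frequencies, showing the tuple keys in action; it assumes the preprocess_frequencies defined above:

#Made-up word 3-gram frequencies from the word-level counter
freqs = {'the cat sat': 4, 'the cat ran': 1}
sequencer, popular_ngrams, all_words = preprocess_frequencies(freqs, 3)
#The N-1 lead is a tuple of words; it maps to completion frequencies
print(sequencer[('the', 'cat')])  #{'sat': 4, 'ran': 1}
#popular_ngrams is sorted by descending frequency
print(popular_ngrams)  #['the cat sat', 'the cat ran']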