# Programmer: Sriram Pemmaraju
# Date: March 26th, 2013
# This program aims to do simple text analysis to figure out the principal
# characters of a novel. Since character names are proper nouns, starting with
# upper case letters, the idea is to look for words starting with upper case
# letters that do not appear at the beginning of sentences. So the program also
# attempts to partition the text into sentences, assuming that ".", "!", and "?"
# are all the possible sentence delimiters. Then we count the frequency of the
# proper nouns and report the most frequent of these. It is likely that the
# principal characters of the novel can be found among this list.

# Takes a string as a parameter and "splits" it into "sentences."
# We assume that ".", "!", and "?" are sentence delimiters.
def parseSentences(bigString):
    # First split using ".". This creates a list of sentences, which need to
    # be further split using "!" and "?".
    sentenceList = bigString.split(".")

    # For each delimiter that is either "?" or "!", split according to
    # that delimiter.
    for delimiter in ["?", "!"]:
        sentenceList = [x.split(delimiter) for x in sentenceList]
        # This creates a nested list that needs to be flattened before the
        # next delimiter is processed. We use a list comprehension to
        # flatten the list.
        sentenceList = [y for x in sentenceList for y in x]

    return sentenceList

# Takes a list of sentences and parses each sentence in this list into a list of words.
# So the result is a list of lists, e.g., [["This", "is", "ok"], ["This", "is", "not"]].
# We use the same definition of a word as before: it is a contiguous sequence of letters.
def parseWords(sentenceList):
    # Make a list of all non-letters. Note the use of the list comprehension here.
    nonLetters = [chr(x) for x in range(0, ord("A")) +
                  range(ord("Z")+1, ord("a")) +
                  range(ord("z")+1, 127)]

    # Replace each non-letter character in each sentence by a blank.
    for i in range(len(sentenceList)):
        for char in nonLetters:
            sentenceList[i] = sentenceList[i].replace(char, " ")

    # Once non-letters have been replaced by blanks, a simple split() using
    # blank as the delimiter will give us all the words. Note that this
    # constructs a nested list: one list of words per sentence.
    nestedWordList = [x.split() for x in sentenceList]
    return nestedWordList

# Searches for a word in wordList and returns the index of the first occurrence
# of word in wordList if found; otherwise returns -1.
def getIndex(wordList, word):
    index = 0
    while index < len(wordList):
        if wordList[index] == word:
            return index
        index = index + 1
    return -1

# Used to compute frequencies of words in a given word list, which may contain
# duplicates.
def computeFrequencies(wordList):
    masterList = []     # for maintaining the list of distinct words
    frequencies = []    # for maintaining the frequencies of these words
    for word in wordList:
        # If word is already in masterList, increase its frequency
        loc = getIndex(masterList, word)
        if loc >= 0:
            frequencies[loc] = frequencies[loc] + 1
        else:
            # This is the first occurrence of the word, so just append it
            # and set its frequency to 1
            masterList.append(word)
            frequencies.append(1)
    return [masterList, frequencies]

# main program
f = open("war.txt", "r")
bigString = f.read()

sentenceList = parseSentences(bigString)
nestedWordList = parseWords(sentenceList)
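# A minimal illustration of what the two calls above produce (the toy string
# below is hypothetical, not taken from war.txt): parseSentences() yields
# sentence fragments, including empty trailing pieces, and parseWords() turns
# each fragment into a list of words.
assert parseSentences("Anna met Pierre. Did Pierre smile? Yes!") == \
    ['Anna met Pierre', ' Did Pierre smile', ' Yes', '', '']
assert parseWords(['Anna met Pierre', ' Did Pierre smile']) == \
    [['Anna', 'met', 'Pierre'], ['Did', 'Pierre', 'smile']]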
# This block of code walks through the list of words, ignores the first word
# in each sentence and, of the remaining words, picks the ones that start with
# an upper case letter and have length at least 4.
characterNames = []
for sentenceWords in nestedWordList:
    restOfWords = sentenceWords[1:]
    for w in restOfWords:
        if w[0].isupper() and len(w) > 3:
            characterNames.append(w)

[masterList, frequencies] = computeFrequencies(characterNames)

# Zips the frequencies and words together and sorts the zipped list in descending
# order of frequencies.
combinedList = zip(frequencies, masterList)
combinedList.sort(reverse = True)

# Prints the 20 most frequent character names
print combinedList[:20]
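# A minimal sanity check, not part of the original program (the toy list below
# is hypothetical): computeFrequencies() returns the distinct words paired with
# a parallel list of their counts, which is exactly what gets zipped and sorted
# above.
[toyWords, toyCounts] = computeFrequencies(["Pierre", "Anna", "Pierre", "Pierre"])
assert toyWords == ["Pierre", "Anna"] and toyCounts == [3, 1]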