# Programmer: Sriram Pemmaraju
# Date: March 26th, 2013

# This program aims to do simple text analysis to figure out the principal 
# characters of a novel. Since character names are proper nouns, starting with
# upper case letters, the idea is to look for words starting with upper case
# letters that do not appear at the beginning of sentences. So the program also
# attempts to partition the text into sentences, assuming that ".", "!", and "?"
# are all the possible sentence delimiters. Then we count the frequency of the
# proper nouns and report the most frequent of these. It is likely that the
# principal characters of the novel can be found among this list.

# Takes a string as parameter and "splits" it into "sentences." 
# We assume that ".", "!", and "?" are sentence delimiters
def parseSentences(bigString):
    # First split using ".". This creates a list of sentences, which need to
    # be further split using "!" and "?"
    sentenceList = bigString.split(".")
    
    # For each delimiter that is either "?" or "!", split according to 
    # that delimiter
    for delimiter in ["?", "!"]:
        sentenceList = [x.split(delimiter) for x in sentenceList]
        
        # This creates a nested list, that needs to be flattened. We use a list
        # comprehension to flatten the list.
        sentenceList = [y for x in sentenceList for y in x]   

    return sentenceList


# Takes a list of sentences and parses each sentence in this list into a list of words.
# So the result is a list of lists, e.g., [["This", "is", "ok"], ["This", "is", "not"]].
# We use the same definition of a word as before. It is a contiguous sequence of letters.
def parseWords(sentenceList):
    # Make a list of all non-letters. Note the use of the list comprehension here
    nonLetters = [chr(x) for x in range(0, ord("A")) + range(ord("Z")+1, ord("a")) + range(ord("z")+1, 127)]
    
    # Replaces each non-letter character in each sentence by a blank
    for i in range(len(sentenceList)):
        for char in nonLetters:
            sentenceList[i] = sentenceList[i].replace(char, " ")

    # Once non-letters have beeb replaced by blanks then a simple split() using
    # blank as the delimiter will help us get all the words. Note that this
    # constructs a nested list of words for each sentence.
    nestedWordList = [x.split() for x in sentenceList]
    return nestedWordList

# Searches for a word in wordList and returns the index of the first occurrence
# of word in wordList if found; otherwise returns -1.
def getIndex(wordList, word):
    index = 0
    while index < len(wordList):
        if wordList[index] == word:
            return index
        index = index + 1
    return -1

# Used to compute frequencies of words in a given word list, that may contain
# duplicates.
def computeFrequencies(wordList):
    masterList = [] # for maintaining the list of words
    frequencies = [] # for maintaining frequencies of these words
    
    for word in wordList:
        # if word is already in masterList, increase its frequecy
        loc = getIndex(masterList, word)
        if loc >= 0:
            frequencies[loc] = frequencies[loc] + 1
        else: # this is the first occurrence of the word, so just append
              # and set frequency to be 1
            masterList.append(word)
            frequencies.append(1)
            
    return [masterList, frequencies]

# main program
f = open("war.txt", "r")
bigString = f.read()
sentenceList = parseSentences(bigString)
nestedWordList = parseWords(sentenceList)

# This block of code walks through the list of words, ignores
# the first word in each sentence and of the remaining words, picks
# ones that start with an upper case and have length at least 4.
characterNames = []
for sentenceWords in nestedWordList:
    restOfWords = sentenceWords[1:]
    for w in restOfWords:
        if w[0].isupper() and len(w) > 3:
            characterNames.append(w)
            
[masterList, frequencies] = computeFrequencies(characterNames)

# zips the frequencies and words together and sorts the zipped list in descending
# order of frequencies.
combinedList = zip(frequencies, masterList)
combinedList.sort(reverse = True)

# Prints the 20 most frequent character names
print combinedList[:20]