def conditional(lyrics):
    """Collect the per-singer counts used by the naive Bayes classifier.

    Each entry of ``lyrics`` is a whitespace-separated string whose first
    token is the singer (the class label) and whose remaining tokens are
    the words of one training line.  Blank entries are ignored.

    Returns a tuple ``(priors, probabilities, words)`` of dictionaries that
    all hold raw counts (``classify`` turns them into probabilities):

    * ``priors[singer]`` -- number of training lines for the singer,
      e.g. ``priors["gaga"] == 1``
    * ``probabilities[singer][word]`` -- how often the singer used the word,
      e.g. ``probabilities["gaga"]["baby"] == 2``
    * ``words[singer]`` -- total number of words seen for the singer,
      e.g. ``words["gaga"] == 5``
    """
    priors = {}
    probabilities = {}
    words = {}
    for line in lyrics:
        tokens = line.split()
        if not tokens:
            # A blank line carries no singer label; skip it instead of
            # failing on tokens[0].
            continue
        singer = tokens[0]
        lyric_words = tokens[1:]
        priors[singer] = priors.get(singer, 0) + 1
        words[singer] = words.get(singer, 0) + len(lyric_words)
        # Per-singer occurrence count of every word.
        counts = probabilities.setdefault(singer, {})
        for word in lyric_words:
            counts[word] = counts.get(word, 0) + 1
    return priors, probabilities, words


def classify(lyrics, priors, probabilities, words):
    """Score ``lyrics`` against every singer and print/return the scores.

    ``priors``, ``probabilities`` and ``words`` are the count dictionaries
    returned by ``conditional``.  For each singer the score is

        P(singer) * product over words of P(word | singer)

    where ``P(singer)`` is the singer's share of the training lines and
    ``P(word | singer)`` uses add-one (Laplace) smoothing:
    ``(count + 1) / (total words of singer + vocabulary size)``.

    One line ``"<singer> <score>"`` is printed per singer, and the scores
    are also returned as a ``{singer: score}`` dictionary.
    """
    tokens = lyrics.split()

    # float() keeps the divisions below true divisions on Python 2 as well.
    total_lines = float(sum(priors.values()))

    # The vocabulary is every distinct word seen for any singer.  Adding its
    # size to the denominator is what keeps the smoothed word probabilities
    # at or below 1; max(..., 1) keeps the denominator non-zero even when no
    # training words exist at all.
    vocabulary = set()
    for counts in probabilities.values():
        vocabulary.update(counts)
    vocabulary_size = max(len(vocabulary), 1)

    candidates = {}
    for candidate in priors:
        probability = priors[candidate] / total_lines
        counts = probabilities[candidate]
        denominator = float(words[candidate] + vocabulary_size)
        for word in tokens:
            probability *= (1. + counts.get(word, 0)) / denominator
        candidates[candidate] = probability
        # Single-argument form prints the same text on Python 2 and 3.
        print("%s %s" % (candidate, probability))
    return candidates


# Demo: train on one line per singer, then score an unseen lyric.
lyrics = [
    ("gaga i am on the right track baby i was born this way "
     "oh there aint no other way baby i was born this way"),
    ("katy i got the eye of the tiger the fire dancing through the fire "
     "cause i am a champion and you are gonna hear me roar "
     "louder louder than a lion"),
]
priors, probabilities, words = conditional(lyrics)
classify("baby i am on fire", priors, probabilities, words)