from __future__ import with_statement

import collections
import re
import sys

# Count assumed for a word that is absent from the reference frequency table.
_UNSEEN_WORD_COUNT = 1.0

# Runs of lowercase letters and apostrophes.
_TOKEN = re.compile("[a-z']+")

# One "<word> = <count>" entry of the reference frequency file.
_FREQ_ENTRY = re.compile(r"(?P<word>\S+) = (?P<freq>\d+) ?")


def words(text):
    """Return the lowercase tokens of *text*, in order of appearance.

    A token is a maximal run of the characters ``a-z`` and ``'`` in the
    lowercased text.  Tokens containing an apostrophe are dropped.
    """
    return [token for token in _TOKEN.findall(text.lower())
            if "'" not in token]


def train(features):
    """Count how often each item of *features* occurs.

    Returns a ``defaultdict`` mapping item -> count (as a float); looking up
    an item that was never seen yields ``0.0``.
    """
    model = collections.defaultdict(float)
    for feature in features:
        model[feature] += 1
    return model


def wordfrequency(path=r"c:\wordfreq.txt"):
    """Load the reference word-frequency table stored at *path*.

    Every ``word = count`` entry found on a non-blank line is recorded with
    the word lowercased and the count converted to float.  If a word occurs
    more than once, the last entry wins.

    Returns a ``defaultdict`` whose default value for an unknown word is 1.0.
    Raises whatever ``open`` raises when *path* cannot be read.
    """
    table = collections.defaultdict(lambda: _UNSEEN_WORD_COUNT)
    with open(path) as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            for entry in _FREQ_ENTRY.finditer(line):
                table[entry.group("word").lower()] = float(entry.group("freq"))
    return table


def _reference_count(wf, word):
    """Return the reference count of *word* without modifying *wf*.

    Plain indexing of a ``defaultdict`` inserts the missing key, which would
    make the table (and the sum of its values) grow with every file that is
    analysed.  The table's own default factory is used when it has one.
    """
    if word in wf:
        return wf[word]
    factory = getattr(wf, "default_factory", None)
    return factory() if factory is not None else _UNSEEN_WORD_COUNT


def _top_scoring_words(score, limit=10):
    """Return the highest-scoring words of *score*, best first.

    Whole groups of equally-scored words are taken until at least *limit*
    words have been collected, so the result can be longer than *limit*
    when the last group contains ties.
    """
    # sorted() is stable, also with reverse=True: tied words keep dict order.
    ranked = sorted(score.items(), key=lambda item: item[1], reverse=True)
    chosen = []
    last_value = None
    for word, value in ranked:
        if len(chosen) >= limit and value != last_value:
            break
        chosen.append(word)
        last_value = value
    return chosen


def find_candidate(wf, filename):
    """Print the words of *filename* that are most over-represented.

    Each distinct word of the file is scored as its relative frequency in the
    file divided by its relative frequency in the reference table *wf*
    (a mapping word -> count, as returned by ``wordfrequency``).  The top
    scoring words are printed as a list, followed by a Google search URL
    built from the first ten of them.

    *wf* is not modified.
    """
    expected_size = sum(wf.values())
    with open(filename) as handle:
        counts = train(words(handle.read()))
    text_size = sum(counts.values())

    score = {}
    for word, count in counts.items():
        actual_freq = count / text_size
        expected_freq = _reference_count(wf, word) / expected_size
        score[word] = actual_freq / expected_freq

    google_words = _top_scoring_words(score, 10)
    print(google_words)
    print("http://www.google.com/search?hl=en&q=" + "%2B".join(google_words[:10]))


def main(argv=None):
    """Analyse every file named in *argv* (default: the command-line arguments)."""
    if argv is None:
        argv = sys.argv[1:]
    wf = wordfrequency()
    for filename in argv:
        find_candidate(wf, filename)


if __name__ == "__main__":
    main()