2024-03-14 16:59:06 +01:00

87 lines
2.0 KiB
Python

#!/usr/bin/env python
__author__ = "Mia Stein"
'''
This program calculates the frequency of letters in a file
so we can use this for cryptanalysis later.
For example, the 10 most frequent letters in English:
e -> 0.104
t -> 0.072
a -> 0.065
o -> 0.059
n -> 0.056
i -> 0.055
s -> 0.051
r -> 0.049
h -> 0.049
d -> 0.034
'''
import chardet
from collections import defaultdict
# calculate the mean values from the table
def taste_like_english(dict_mean, word):
    """Score how "English-like" a word is using a letter-frequency table.

    The score is the sum of the table frequencies of the characters of
    ``word`` that appear in ``dict_mean``, divided by the total number of
    characters in ``word``. Characters missing from the table add nothing
    to the sum but still count towards the length.

    Args:
        dict_mean: mapping from a single character to its frequency.
        word: the string (or other iterable of characters) to score.

    Returns:
        The average per-character frequency as a float, or 0.0 when
        ``word`` is empty (an empty word would otherwise divide by zero).
    """
    total = 0.0
    length = 0
    for c in word:
        # .get() never inserts a key, so a defaultdict table is left intact.
        total += dict_mean.get(c, 0.0)
        length += 1
    if length == 0:
        return 0.0
    return total / length
# count number of letters
def count_letters(FILE):
    """Count how often each ASCII character occurs in a text file.

    The file is read line by line and split on whitespace. Every remaining
    ASCII character is lowercased and tallied, except the punctuation marks
    ``! , - . ;`` which are skipped. Other ASCII symbols (digits, ``?``,
    quotes, ...) are still counted.

    Args:
        FILE: path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each counted character to its number
        of occurrences.

    Raises:
        OSError: if the file cannot be opened.
    """
    skipped = frozenset('!,-.;')
    dict_letters = defaultdict(int)
    # errors='replace' turns undecodable bytes into U+FFFD, which is
    # non-ASCII and therefore dropped by the filter below, so a stray byte
    # never aborts the count.
    with open(FILE, encoding='utf-8', errors='replace') as handle:
        for line in handle:
            for word in line.split():
                for c in word:
                    # Test the code point directly: chardet.detect() only
                    # accepts bytes and raises TypeError for a str character.
                    # Filtering before lower() keeps non-ASCII characters
                    # from folding into ASCII ones.
                    if ord(c) < 128 and c not in skipped:
                        dict_letters[c.lower()] += 1
    return dict_letters
# calculate the frequency for the letters
def calculate_mean(dict_letters):
    """Convert raw character counts into relative frequencies.

    Args:
        dict_letters: mapping from a character to its occurrence count.

    Returns:
        A ``defaultdict(float)`` whose keys are inserted in sorted order and
        whose values are each count divided by the sum of all counts.
    """
    total = sum(dict_letters.values())
    frequencies = defaultdict(float)
    frequencies.update(
        (symbol, float(dict_letters[symbol]) / total)
        for symbol in sorted(dict_letters)
    )
    return frequencies
# first, test letters with official values
def test_values():
    """Print the score of the word 'english' against a reference table.

    The table holds the ten reference letter frequencies listed in the
    module header; the printed score is expected to be about 0.045.
    """
    reference = dict(
        e=0.104, t=0.072, a=0.065, o=0.059, n=0.056,
        i=0.055, s=0.051, r=0.049, h=0.049, d=0.034,
    )
    score = taste_like_english(reference, 'english')
    print('Test for "english": ', score)
# now, test with some file, creating dictionary of letters
def test_file():
    """Print the character frequencies of the file named by the global FILE.

    Counts the characters with count_letters(), converts the counts to
    frequencies with calculate_mean(), and prints one ``char --> frequency``
    line per character, most frequent first.
    """
    frequencies = calculate_mean(count_letters(FILE))
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    for letter, freq in ranked:
        print(' --> '.join((letter, str(freq))))
if __name__ == '__main__':
    # test_file() reads the module-level FILE, so bind it before the calls.
    FILE = 'Ariel_Sylvia_Plath.txt'
    # First check the scoring against the reference table, then analyse the file.
    test_values()
    test_file()