sec-pentesting-toolkit/Cryptography/Rotation-Ciphers/taste_like_english.py

#!/usr/bin/env python

__author__ = "Mia Stein"

'''
This program calculates the frequency of letters in a file
so we can use it for cryptanalysis later.

For example, the 10 most frequent letters in English:
e -> 0.104
t -> 0.072
a -> 0.065
o -> 0.059
n -> 0.056
i -> 0.055
s -> 0.051
r -> 0.049
h -> 0.049
d -> 0.034
'''

import chardet
from collections import defaultdict


# calculate the mean values from the table
def taste_like_english(dict_mean, word):
    """Return the average table frequency of the characters of ``word``.

    Only characters that are keys of ``dict_mean`` take part: their values
    are summed and divided by the number of such characters. Characters that
    are missing from the table are ignored entirely (they do not count
    towards the divisor).

    Args:
        dict_mean: mapping from a character to its frequency.
        word: iterable of characters to score.

    Returns:
        The mean of the matched frequencies, or 0.0 when no character of
        ``word`` is present in ``dict_mean`` (this includes an empty word).
    """
    matched = [dict_mean[c] for c in word if c in dict_mean]
    if not matched:
        # Nothing to average; dividing by zero here used to raise
        # ZeroDivisionError.
        return 0.0
    return sum(matched) / len(matched)


# count number of letters
def count_letters(FILE):
    """Count how often each character occurs in a text file.

    Every line is lower-cased and split on whitespace. Each remaining
    character is counted unless it is one of ``! , - . ;`` or falls outside
    the ASCII range. Note that digits and any other ASCII punctuation are
    therefore still counted.

    Args:
        FILE: path of the text file to read.

    Returns:
        A ``defaultdict(int)`` mapping each counted character to its number
        of occurrences.
    """
    skipped = frozenset('!,-.;')
    dict_letters = defaultdict(int)
    with open(FILE) as handle:
        for line in handle:
            for word in line.lower().split():
                for c in word:
                    # A plain code-point test replaces the former
                    # chardet.detect(c) call per character, which needs
                    # bytes input and is far heavier than necessary.
                    if c not in skipped and ord(c) < 128:
                        dict_letters[c] += 1
    return dict_letters


# calculate the frequency for the letters
def calculate_mean(dict_letters):
    """Turn absolute counts into relative frequencies.

    Args:
        dict_letters: mapping from a symbol to its number of occurrences.

    Returns:
        A ``defaultdict(float)`` holding, for every key of ``dict_letters``
        (inserted in sorted key order), its count divided by the sum of all
        counts.
    """
    total = sum(dict_letters.values())
    # float() keeps the division a true division even for integer counts.
    shares = ((symbol, float(dict_letters[symbol]) / total)
              for symbol in sorted(dict_letters))
    return defaultdict(float, shares)


# first, test letters with official values
def test_values():
    """Print the score of the word 'english' against a reference table."""
    reference = {
        'e': 0.104, 't': 0.072, 'a': 0.065, 'o': 0.059, 'n': 0.056,
        'i': 0.055, 's': 0.051, 'r': 0.049, 'h': 0.049, 'd': 0.034,
    }
    # Five letters of 'english' (e, n, i, s, h) are in the table and sum to
    # 0.315, so averaging over the matched letters gives about 0.063
    # (0.315 / 7 would be 0.045).
    score = taste_like_english(reference, 'english')
    print('Test for "english": ', score)


# now, test with some file, creating dictionary of letters
def test_file(path=None):
    """Print the relative frequency of every character counted in a file.

    The characters are printed one per line as ``<char> --> <frequency>``,
    most frequent first.

    Args:
        path: file to analyse. When omitted, the module-level ``FILE`` name
            is used, which keeps the original zero-argument call working.

    Raises:
        NameError: if ``path`` is omitted and no global ``FILE`` is defined.
    """
    if path is None:
        # Fall back to the global assigned by the script entry point.
        path = FILE
    dict_mean = calculate_mean(count_letters(path))
    for key in sorted(dict_mean, key=dict_mean.get, reverse=True):
        print(key + ' --> ' + str(dict_mean[key]))


if __name__ == "__main__":
    # Text file analysed by test_file(), which reads this module-level name.
    FILE = 'Ariel_Sylvia_Plath.txt'

    # First the fixed reference table, then the frequencies from the file.
    test_values()
    test_file()