textfiles-politics/pythonCode/main.py

import spacy
from collections import Counter
import os
# Uncomment this line if you need the language model.
# If you already have it, comment it out.
# Let's try the different spaCy language models for this. We can compare _lg with _md or _sm
# Resolve the collection folder relative to the directory the script is run from.
workingDir = os.getcwd()
CollPath = os.path.join(workingDir, '../regexConsp')
# List the folder's entries and print them as a quick check of what is there.
insideDir = os.listdir(CollPath)
print(insideDir)

# Load the spaCy pipeline once at module level; readTextFiles() calls it for each file.
nlp = spacy.load("en_core_web_lg")
def readTextFiles(filepath):
    """Run one file through the spaCy pipeline and print the entities found.

    The file at ``filepath`` is read in full as UTF-8 text and passed to the
    module-level ``nlp`` pipeline. The resulting document is handed to
    ``entitycollector`` and the list it returns is printed. Nothing is
    returned to the caller.
    """
    with open(filepath, 'r', encoding='utf8') as source:
        rawText = source.read()
    # Text-mode read() already yields a str, so it goes to the pipeline as is.
    doc = nlp(rawText)
    print(entitycollector(doc))


def entitycollector(tokens):
    """Print every named entity in ``tokens`` and return their texts.

    ``tokens`` is the object produced by calling ``nlp`` on a string; its
    ``.ents`` attribute is iterated in order. For each entity the text, its
    label, and spaCy's explanation of that label are printed on one line.

    Returns a list of the entity texts, in the order they were encountered
    (duplicates are kept).
    """
    collected = []
    for ent in tokens.ents:
        label = ent.label_
        print(ent.text, label, spacy.explain(label))
        collected.append(ent.text)
    return collected


# Walk the collection folder and process each XML file; anything else is skipped.
for file in os.listdir(CollPath):
    if not file.endswith(".xml"):
        continue
    filepath = f"{CollPath}/{file}"
    print(filepath)
    readTextFiles(filepath)


# print(listCardinals)
# cardinal_freq = Counter(listCardinals)
# topTen = cardinal_freq.most_common(10)
# print(topTen)

# grimm = open('grimm.txt', 'r', encoding='utf8')
# grimmDoc = grimm.read()
# grimmNLP = nlp(grimmDoc)
# grimmmSentences = grimmNLP.sents

# def sentenceLengths(sentences):
#     lengths = []
#     for s in sentences:
#         length = len(s.text)
#         lengths.append(length)
#     return sorted(lengths)


# grimmLengths = sentenceLengths(grimmmSentences)
# # print(grimmLengths)
# maxVal = max(grimmLengths)
# minVal = min(grimmLengths)
# print('The shortest sentence is ' + str(minVal) + ' characters long.')
# print('The longest sentence is ' + str(maxVal) + ' characters long.')

# for sentence in grimmNLP.sents:
# #    print(sentence.text)
#     length = len(sentence.text)
#     if length == minVal:
#         print("The shortest sentence is: " + sentence.text)
#     if len(sentence.text) == maxVal:
#         print('The longest sentence is: ' + sentence.text + ' :' + str(maxVal) + 'characters')
created main.py with code 2023-03-27 17:14:07 -04:00			`import spacy`
			`from collections import Counter`
			`import os`
			`# Uncomment this line if you need the language model.`
			`# If you already have it, comment it ou.`
			`# Let's try the different spaCy language models for this. We can compare _lg with _md or _sm`
			`workingDir = os.getcwd()`
			`CollPath = os.path.join(workingDir, '../regexConsp')`
			`insideDir = os.listdir(CollPath)`
			`print(insideDir)`

			`nlp = spacy.load("en_core_web_lg")`
			`def readTextFiles(filepath):`
			`with open(filepath, 'r', encoding='utf8') as f:`
			`readFile = f.read()`
			`# print(readFile)`
			`stringFile = str(readFile)`
			`# lengthFile = len(readFile)`
			`# print(lengthFile)`
			`tokens = nlp(stringFile)`
			`# print(tokens)`
			`listEntities = entitycollector(tokens)`
			`print(listEntities)`
			`# cardinal_freq = Counter(listCardinals)`
			`# topTen = cardinal_freq.most_common(10)`
			`# print(topTen)`



			`def entitycollector(tokens):`
			`entities = []`
			`for entity in tokens.ents:`
			`# if entity.label_ == "CARDINAL":`
			`print(entity.text, entity.label_, spacy.explain(entity.label_))`
			`entities.append(entity.text)`
			`return entities`


			`for file in os.listdir(CollPath):`
			`if file.endswith(".xml"):`
			`filepath = f"{CollPath}/{file}"`
			`print(filepath)`
			`readTextFiles(filepath)`





			`# print(listCardinals)`
			`# cardinal_freq = Counter(listCardinals)`
			`# topTen = cardinal_freq.most_common(10)`
			`# print(topTen)`

			`# grimm = open('grimm.txt', 'r', encoding='utf8')`
			`# grimmDoc = grimm.read()`
			`# grimmNLP = nlp(grimmDoc)`
			`# grimmmSentences = grimmNLP.sents`

			`# def sentenceLengths(sentences):`
			`# lengths = []`
			`# for s in sentences:`
			`# length = len(s.text)`
			`# lengths.append(length)`
			`# return sorted(lengths)`


			`# grimmLengths = sentenceLengths(grimmmSentences)`
			`# # print(grimmLengths)`
			`# maxVal = max(grimmLengths)`
			`# minVal = min(grimmLengths)`
			`# print('The shortest sentence is ' + str(minVal) + ' characters long.')`
			`# print('The longest sentence is ' + str(maxVal) + ' characters long.')`

			`# for sentence in grimmNLP.sents:`
			`# # print(sentence.text)`
			`# length = len(sentence.text)`
			`# if length == minVal:`
			`# print("The shortest sentence is: " + sentence.text)`
			`# if len(sentence.text) == maxVal:`
			`# print('The longest sentence is: ' + sentence.text + ' :' + str(maxVal) + 'characters')`