# textfiles-politics/pythonCode/test.py

###### Original name checker thing ######

import spacy
from collections import Counter
import re as regex
import os
# nlp = spacy.cli.download("en_core_web_lg")
# If you already have it, comment it out.
# Let's try the different spaCy language models for this. We can compare _lg with _md or _sm
# Locate the collection directory relative to the current working directory
# and show what it contains.
workingDir = os.getcwd()
CollPath = os.path.join(workingDir, '../regexConsp')
insideDir = os.listdir(CollPath)
print(insideDir)

# Empty outputNames.txt if a previous run left one behind; names are appended
# to it later, so stale content would otherwise accumulate.
if os.path.isfile("outputNames.txt"):
    with open("outputNames.txt", 'w'):
        pass

# Load the en_core_web_lg spaCy pipeline once for the whole script.
nlp = spacy.load("en_core_web_lg")
def readTextFiles(filepath):
    """Read one marked-up text file, strip its tags, and collect PERSON names.

    The file is read as UTF-8, element tags and newlines are removed, the
    remaining text is run through the module-level spaCy pipeline ``nlp``,
    and the parsed result is handed to ``entitycollector``.

    Args:
        filepath: Path of the file to process.

    Returns:
        Whatever ``entitycollector`` returns for the parsed text.
    """
    # Only hold the file open for the read; the NLP pass does not need it.
    with open(filepath, 'r', encoding='utf8') as f:
        rawText = f.read()
    # Using REGEX to delete all element tags. DOTALL lets a tag that is
    # wrapped over several lines match too; without it '.' stops at '\n'
    # and such tags would leak into the text.
    elementsRemoved = regex.sub(r'<.+?>', '', rawText, flags=regex.DOTALL)
    # Replace every newline with a space.
    cleanedFile = elementsRemoved.replace('\n', ' ')
    tokens = nlp(cleanedFile)
    # Hand the collected entities back to the caller instead of discarding them.
    return entitycollector(tokens)

def entitycollector(tokens):
    """Collect the text of every PERSON entity and append it to outputNames.txt.

    Args:
        tokens: Object exposing an ``ents`` iterable whose items have
            ``text`` and ``label_`` attributes (the result of calling the
            spaCy pipeline on a string).

    Returns:
        A list with the text of each entity labelled ``"PERSON"``, in the
        order encountered. The list is empty when there are none.
    """
    entities = [entity.text for entity in tokens.ents if entity.label_ == "PERSON"]
    # Open the output file once per call rather than once per name, and only
    # when there is something to write so no empty file is created.
    if entities:
        # UTF-8 matches the encoding used to read the input files, so
        # non-ASCII names cannot fail on platforms with a narrower default.
        with open("outputNames.txt", 'a', encoding='utf8') as f:
            for name in entities:
                f.write("\n" + name)
                print("Writing in outputNames.txt: " + name)
    # Return only after every entity has been examined.
    return entities

# Run the name extraction over every .xml file in the collection directory.
for file in os.listdir(CollPath):
    # Skip anything that is not an XML file.
    if not file.endswith(".xml"):
        continue
    filepath = f"{CollPath}/{file}"
    print(filepath)
    readTextFiles(filepath)