main.py outputs new files and wraps elements around tokens

2025-08-12 15:55:23 -04:00 · 2023-03-29 14:22:05 -04:00 · 2023-03-29 14:22:05 -04:00 · 6c04235b78
commit 6c04235b78
parent 21283b1ebb
335 changed files with 242619 additions and 29 deletions
--- a/pythonCode/main.py
+++ b/pythonCode/main.py
@ -2,44 +2,64 @@ import spacy
 from collections import Counter
 import re as regex
 import os
-# nlp = spacy.cli.download("en_core_web_lg")
-# If you already have it, comment it ou.
-# Let's try the different spaCy language models for this. We can compare _lg with _md or _sm
+
+#### Module setup: load the spaCy model once and resolve the input and output directories relative to the current working directory.
+nlp = spacy.load("en_core_web_lg")
 workingDir = os.getcwd()
 CollPath = os.path.join(workingDir, '../regexConsp')
+outputPath = os.path.join(workingDir, 'output/')
+# Names of every entry in the source directory (CollPath); printed so the run shows what is about to be processed.
 insideDir = os.listdir(CollPath)
 print(insideDir)

-if os.path.isfile("outputNames.txt"):
-    open("outputNames.txt", 'w').close();

-nlp = spacy.load("en_core_web_lg")
-def readTextFiles(filepath):
-    with open(filepath, 'r', encoding='utf8') as f:
-        readFile = f.read()
-        stringFile = str(readFile)
-        # Using REGEX to delete all element tags.
-        elementsRemoved = regex.sub('<.+?>', '', stringFile)
-        # Using REGEX to delete all \n.
-        cleanedFile = regex.sub('\n', ' ', elementsRemoved)
-        tokens = nlp(cleanedFile)
-
-        listEntities = entitycollector(tokens)
+# Copies one source file from CollPath into outputPath so later steps rewrite the copy, never the original.
+def copyTextFiles(file):
+    """Copy `file` from the source directory into the output directory.
+
+    Args:
+        file: Bare file name (no directory part) of an entry inside CollPath.
+
+    The file is read and written as UTF-8 text. The copy is unconditional:
+    an existing file with the same name in outputPath is overwritten.
+    """
+    sourceFile = os.path.join(CollPath, file)
+    targetFile = os.path.join(outputPath, file)
+    # open(..., 'w') fails when the folder is missing, so create it first.
+    os.makedirs(outputPath, exist_ok=True)
+    # The `with` block closes the handle itself; no explicit close() needed.
+    with open(sourceFile, 'r', encoding='utf8') as inFile:
+        content = inFile.read()
+    print("copying " + file)
+    with open(targetFile, 'w', encoding='utf8') as f:
+        f.write(content)

+# Collects the text of every entity labelled "PERSON" in tokens.ents into a list. Called by regexFile(). NOTE(review): as rendered here, `return entities` is indented inside the for loop, so it would return after the first entity and yield None when ents is empty - confirm against the real file.
 def entitycollector(tokens):
    entities = []
    for entity in tokens.ents:
        if entity.label_ == "PERSON":
-            with open("outputNames.txt", 'a') as f:
-                f.write("\n" + entity.text)
-                print("Writing in outputNames.txt: " + entity.text)
-            ## Below includes entity values and stuf
-            # print(entity.text, entity.label_, spacy.explain(entity.label_))
            entities.append(entity.text)
        return entities

-for file in os.listdir(CollPath):
-    if file.endswith(".xml"):
-        filepath = f"{CollPath}/{file}"
-        print(filepath)
-        readTextFiles(filepath)
+# Tags person names in the output copy of a file by wrapping them in <person> elements.
+def regexFile(file):
+    """Wrap every detected PERSON name in the output copy of `file`.
+
+    Args:
+        file: Bare file name of a file that already exists in outputPath
+            (normally just written there by copyTextFiles()).
+
+    The raw text is stripped of markup and non-letters, run through the
+    spaCy pipeline `nlp`, and the PERSON entities returned by
+    entitycollector() are wrapped as <person>...</person> in the raw text,
+    which is then written back over the same file. Nothing is written when
+    no names are found.
+
+    Names are detected in the cleaned text, so a name that contains
+    punctuation or a line break in the raw text will not match verbatim and
+    is left unwrapped.
+    """
+    targetFile = os.path.join(outputPath, file)
+    with open(targetFile, 'r', encoding='utf8') as inFile:
+        rawText = inFile.read()
+    # Drop element tags, then collapse every run of non-letters into a single space.
+    # [^A-Za-z] instead of [^A-z]: the A-z range also lets [ \ ] ^ _ and ` through.
+    cleanedText = regex.sub(r'[^A-Za-z]+', ' ', regex.sub(r'<.+?>', ' ', rawText))
+    tokens = nlp(cleanedText)
+    listEntities = entitycollector(tokens)
+    # Unique, non-blank names; an empty alternative would match at every position.
+    names = {name for name in (listEntities or []) if name.strip()}
+    if not names:
+        print("No names... Probably did not detect any?")
+        return
+    # Longest first, so "John Smith" wins over "John" at the same position.
+    ordered = sorted(names, key=lambda name: (-len(name), name))
+    # Escape each name (it is literal text, not a pattern) and substitute in ONE
+    # pass, so every name is wrapped and none is wrapped twice.
+    pattern = regex.compile('|'.join(regex.escape(name) for name in ordered))
+    wrappedText = pattern.sub(lambda match: '<person>' + match.group(0) + '</person>', rawText)
+    for name in ordered:
+        print("WRAPPING " + name)
+    # Single write after all substitutions; the `with` block closes the file.
+    with open(targetFile, 'w', encoding='utf8') as f:
+        f.write(wrappedText)
+
+# Processes every regular file in the source directory: copy it to the output folder, then tag names in the copy.
+for file in insideDir:
+    # os.listdir() also returns sub-directories, which open() cannot read; skip them.
+    if not os.path.isfile(os.path.join(CollPath, file)):
+        continue
+    copyTextFiles(file)
+    regexFile(file)