modified main.py to test regex lib

2025-08-11 07:20:03 -04:00 · 2023-03-27 18:22:30 -04:00 · 2023-03-27 18:22:30 -04:00 · b5373e4b4b
commit b5373e4b4b
parent d18a6f5b16
3 changed files with 59 additions and 56 deletions
--- a/pythonCode/main.py
+++ b/pythonCode/main.py
@ -1,5 +1,6 @@
 import spacy
 from collections import Counter
+import re as regex
 import os
 # Uncomment this line if you need the language model.
 # If you already have it, comment it out.
@ -9,72 +10,35 @@ CollPath = os.path.join(workingDir, '../regexConsp')
 insideDir = os.listdir(CollPath)
 print(insideDir)

+if os.path.isfile("outputNames.txt"):
+    open("outputNames.txt", 'w').close();
+
 nlp = spacy.load("en_core_web_lg")
 def readTextFiles(filepath):
    with open(filepath, 'r', encoding='utf8') as f:
        readFile = f.read()
-        # print(readFile)
        stringFile = str(readFile)
-        # lengthFile = len(readFile)
-        # print(lengthFile)
-        tokens = nlp(stringFile)
-        # print(tokens)
+        # Using REGEX to delete all element tags.
+        elementsRemoved = regex.sub('<.+?>', '', stringFile)
+        # Using REGEX to delete all \n.
+        cleanedFile = regex.sub('\n', ' ', elementsRemoved)
+        tokens = nlp(cleanedFile)
+
        listEntities = entitycollector(tokens)
-        print(listEntities)
-        # cardinal_freq = Counter(listCardinals)
-        # topTen = cardinal_freq.most_common(10)
-        # print(topTen)
-
-

 def entitycollector(tokens):
    entities = []
    for entity in tokens.ents:
-        # if entity.label_ == "CARDINAL":
-        print(entity.text, entity.label_, spacy.explain(entity.label_))
+        if entity.label_ == "PERSON":
+            with open("outputNames.txt", 'a') as f:
+                f.write("\n" + entity.text)
+                print("Writing in outputNames.txt: " + entity.text)
+            # print(entity.text, entity.label_, spacy.explain(entity.label_))
            entities.append(entity.text)
        return entities

-
 for file in os.listdir(CollPath):
    if file.endswith(".xml"):
        filepath = f"{CollPath}/{file}"
        print(filepath)
        readTextFiles(filepath)
-
-
-
-
-
-# print(listCardinals)
-# cardinal_freq = Counter(listCardinals)
-# topTen = cardinal_freq.most_common(10)
-# print(topTen)
-
-# grimm = open('grimm.txt', 'r', encoding='utf8')
-# grimmDoc = grimm.read()
-# grimmNLP = nlp(grimmDoc)
-# grimmmSentences = grimmNLP.sents
-
-# def sentenceLengths(sentences):
-#     lengths = []
-#     for s in sentences:
-#         length = len(s.text)
-#         lengths.append(length)
-#     return sorted(lengths)
-
-
-# grimmLengths = sentenceLengths(grimmmSentences)
-# # print(grimmLengths)
-# maxVal = max(grimmLengths)
-# minVal = min(grimmLengths)
-# print('The shortest sentence is ' + str(minVal) + ' characters long.')
-# print('The longest sentence is ' + str(maxVal) + ' characters long.')
-
-# for sentence in grimmNLP.sents:
-# #    print(sentence.text)
-#     length = len(sentence.text)
-#     if length == minVal:
-#         print("The shortest sentence is: " + sentence.text)
-#     if len(sentence.text) == maxVal:
-#         print('The longest sentence is: ' + sentence.text + ' :' + str(maxVal) + 'characters')
--- a/pythonCode/outputNames.txt
+++ b/pythonCode/outputNames.txt
--- a/pythonCode/test.py
+++ b/pythonCode/test.py
@ -1,6 +1,45 @@
+import spacy
+from collections import Counter
+import re as regex
 import os
-
+# Uncomment this line if you need the language model.
+# If you already have it, comment it out.
+# Let's try the different spaCy language models for this. We can compare _lg with _md or _sm
 workingDir = os.getcwd()
 CollPath = os.path.join(workingDir, '../regexConsp')
 insideDir = os.listdir(CollPath)
 print(insideDir)
+
+if os.path.isfile("outputNames.txt"):
+    open("outputNames.txt", 'w').close();
+
+nlp = spacy.load("en_core_web_lg")
+def readTextFiles(filepath):
+    with open(filepath, 'r', encoding='utf8') as f:
+        readFile = f.read()
+        stringFile = str(readFile)
+        # Using REGEX to delete all element tags.
+        elementsRemoved = regex.sub('<.+?>', '', stringFile)
+        # Using REGEX to delete all \n.
+        cleanedFile = regex.sub('\n', ' ', elementsRemoved)
+        tokens = nlp(cleanedFile)
+
+        listEntities = entitycollector(tokens)
+        print(listEntities)
+
+def entitycollector(tokens):
+    entities = []
+    for entity in tokens.ents:
+        if entity.label_ == "PERSON":
+            with open("outputNames.txt", 'a') as f:
+                f.write("\n" + entity.text)
+                print("Writing in outputNames.txt: " + entity.text)
+            # print(entity.text, entity.label_, spacy.explain(entity.label_))
+            entities.append(entity.text)
+        return entities
+
+for file in os.listdir(CollPath):
+    if file.endswith(".xml"):
+        filepath = f"{CollPath}/{file}"
+        print(filepath)
+        readTextFiles(filepath)