repairing and restoring XSLT, spot fixes to personTagger.py

This commit is contained in:
ebeshero 2023-04-28 01:26:51 -04:00
parent 204363595d
commit 946f25035e
3 changed files with 101 additions and 23 deletions

View File

@ -1266,11 +1266,11 @@ patterns = [
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "W\.\s+?Citrine"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "W\.\s+?Citrine"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "E\.\s+?Moore"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "E\.\s+?Moore"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Lyndon\s+?Johnson"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Lyndon\s+?Johnson"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+?Kennedy"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John(\s*F\.?)\s*Kennedy"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Zia\s+?ul-Haque"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Zia\s+?ul-Haque"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Ronald\s+?Payne"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Ronald\s*Payne"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "R\.\s+?Muldoon"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "R\.\s*Muldoon"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "F\.\s+?Orr"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "F\.\s*Orr"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Frank\s+?H\.\s+?Schwable"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Frank\s+?H\.\s+?Schwable"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "R\.\s+?F\.\s+?Doyle"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "R\.\s+?F\.\s+?Doyle"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "David\s+?Munson"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "David\s+?Munson"}}]},
@ -1382,7 +1382,6 @@ patterns = [
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Frank\s+?H\.\s+?Schwable"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Frank\s+?H\.\s+?Schwable"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Billy\s+?Goodman"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Billy\s+?Goodman"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Leonard\s+?Pullin"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Leonard\s+?Pullin"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "John\s+?F\.\s+?Kennedy"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "R\.\s+?J\.\s+?Biggar"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "R\.\s+?J\.\s+?Biggar"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "David\s+?R\.\s+?Hunter"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "David\s+?R\.\s+?Hunter"}}]},
{"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Hugh\s+?Everett,\s+?III"}}]}, {"label": "PERSON", "pattern": [{"TEXT": {"REGEX": "Hugh\s+?Everett,\s+?III"}}]},
@ -1523,7 +1522,7 @@ patterns = [
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
workingDir = os.getcwd() workingDir = os.getcwd()
CollPath = os.path.join(workingDir, '../regexConspTest') CollPath = os.path.join(workingDir, '../pre-src-xml')
outputPath = os.path.join(workingDir, 'personTestingOutput/') outputPath = os.path.join(workingDir, 'personTestingOutput/')
# Everything in original conspiracy directory. # Everything in original conspiracy directory.
insideDir = os.listdir(CollPath) insideDir = os.listdir(CollPath)
@ -1636,7 +1635,8 @@ def checkTags(file):
newLine = regex.sub(r"(<ent type=')<ent type='ORG'>(ORG)</ent>('>)", r"\1\2\3", newLine) newLine = regex.sub(r"(<ent type=')<ent type='ORG'>(ORG)</ent>('>)", r"\1\2\3", newLine)
newLine = regex.sub(r"(<ent type='[A-Z]+'>)<ent type='[A-Z]+'>(\w+)</ent><ent type='[A-Z]+'>(\w+)</ent>(</ent>)", "\1\2 \3\4", newLine) newLine = regex.sub(r"(<ent type='[A-Z]+'>)<ent type='[A-Z]+'>(\w+)</ent><ent type='[A-Z]+'>(\w+)</ent>(</ent>)", "\1\2 \3\4", newLine)
newLine = regex.sub(r"(<ent type='[A-Z]+?'>)(\w+)\s+?(<ent type='[A-Z+?]'>)(\w+)(</ent>)(\w+)(<ent type='[A-Z]+?'>)(\w+)(</ent>)(</ent>)", r"\1\2 \4 \6 \8\9", newLine) newLine = regex.sub(r"(<ent type='[A-Z]+?'>)(\w+)\s+?(<ent type='[A-Z+?]'>)(\w+)(</ent>)(\w+)(<ent type='[A-Z]+?'>)(\w+)(</ent>)(</ent>)", r"\1\2 \4 \6 \8\9", newLine)
newLine = regex.sub(r"<ent type='\w+'>(\w+)</ent>('\w)", r"\1\2", newLine) # ebb: Problem line below: eliminates <ent type="PERSON">John Kennedy</ent>'s
# newLine = regex.sub(r"<ent type='\w+'>(\w+)</ent>('\w)", r"\1\2", newLine)
# #
# <spe<ent type='ORG'>cia</ent>l> # <spe<ent type='ORG'>cia</ent>l>
# <<ent type='ORG'>di</ent>v> # <<ent type='ORG'>di</ent>v>

View File

@ -0,0 +1,93 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns:math="http://www.w3.org/2005/xpath-functions/math"
    xmlns="http://www.w3.org/1999/xhtml"
    exclude-result-prefixes="xs math"
    version="3.0">
    <!-- Default serialization for every result document this stylesheet writes:
         XHTML method, HTML5 flavour, no XML declaration, no generated
         content-type meta element. -->
    <xsl:output method="xhtml" html-version="5" omit-xml-declaration="yes" include-content-type="no" indent="yes"/>

    <!-- 2023-04-27 ebb: This is XSLT Stage 2: Outputting HTML for the individual collection files -->

    <!-- Every document found in ../src-xml (resolved relative to this stylesheet).
         NOTE(review): how a bare directory URI is expanded by collection() is
         processor-defined; confirm that only XML documents are returned,
         otherwise the document-node()+ type check will fail. -->
    <xsl:variable name="conspiracy" as="document-node()+" select="collection('../src-xml')"/>

    <!-- Entry point. The document the transformation is started on is not read;
         it only triggers this template, which writes one HTML page per
         document in $conspiracy. -->
    <xsl:template match="/">
        <xsl:for-each select="$conspiracy">
            <!-- Last path segment of the document's base URI with a trailing
                 ".xml" removed. replace() with an anchored pattern is used so
                 that a name without that extension is kept whole instead of
                 collapsing to the empty string (which would send every such
                 input to the same ".html" output file). -->
            <xsl:variable name="filename" as="xs:string"
                select="current() ! base-uri() ! tokenize(., '/')[last()] ! replace(., '\.xml$', '')"/>
            <!-- method="xhtml" keeps the result document consistent with the
                 xsl:output declaration above: the pages are saved as .html, so
                 empty elements must not be serialized as self-closing XML tags
                 and indentation must not add whitespace inside inline markup. -->
            <xsl:result-document method="xhtml" indent="yes" href="../docs/collection/{$filename}.html">
                <html>
                    <head>
                        <title><xsl:value-of select="$filename"/></title>
                        <link rel="stylesheet" href="../CSSstyle.css"/>
                        <!-- The xsl:comment below copies this reminder into each generated page. -->
                        <xsl:comment>Fill in your link line for CSS and JS in the XSLT here! </xsl:comment>
                    </head>
                    <body>
                        <h1 id="title-index"><xsl:value-of select="$filename"/></h1>
                        <!-- Site navigation; links are relative to docs/collection/. -->
                        <nav id="menu">
                            <a href="../index.html">
                                <div class="button">Home</div>
                            </a>
                            <a href="../fulltext.html">
                                <div class="button">Fulltext</div>
                            </a>
                            <a href="../analysis.html">
                                <div class="button">Analysis</div>
                            </a>
                            <a href="../gallery.html">
                                <div class="button">Gallery</div>
                            </a>
                            <a href="../methods.html">
                                <div class="button">Methods</div>
                            </a>
                            <a href="../about.html">
                                <div class="button">About</div>
                            </a>
                            <a href="../GitHub.html">
                                <div class="button">GitHub <img alt="github icon"
                                    src="https://logos-download.com/wp-content/uploads/2016/09/GitHub_logo.png"
                                    width="15"/>
                                </div>
                            </a>
                        </nav>
                        <!-- Process the children of the current collection document. -->
                        <xsl:apply-templates/>
                    </body>
                </html>
            </xsl:result-document>
        </xsl:for-each>
    </xsl:template>

    <!-- Source paragraphs become XHTML paragraphs. -->
    <xsl:template match="p">
        <p>
            <xsl:apply-templates/>
        </p>
    </xsl:template>

    <!-- ebb: adding mouseover tooltip via title attribute-->
    <!-- The @type value is used both as the CSS class and as the tooltip text. -->
    <xsl:template match="ent">
        <span class="{@type}" title="{@type}">
            <xsl:apply-templates/>
        </span>
    </xsl:template>

    <!--ebb: What about the special and info XML tags? -->
    <!-- <special> is wrapped in a span with a fixed class and no tooltip. -->
    <xsl:template match="special">
        <span class="special">
            <xsl:apply-templates/>
        </span>
    </xsl:template>

    <!-- <info> is wrapped in a span with a fixed class; its @type becomes the tooltip. -->
    <xsl:template match="info">
        <span class="info" title="{@type}">
            <xsl:apply-templates/>
        </span>
    </xsl:template>
</xsl:stylesheet>

View File

@ -21,7 +21,7 @@
<xsl:for-each select="$conspiracy"> <xsl:for-each select="$conspiracy">
<xsl:variable name="filename" as="xs:string" select="current() ! base-uri() ! tokenize(., '/')[last()]"/> <xsl:variable name="filename" as="xs:string" select="current() ! base-uri() ! tokenize(., '/')[last()]"/>
<xsl:result-document method="xml" indent="yes" href="../src-xml/{$filename}"> <xsl:result-document method="xml" indent="yes" href="../pre-src-xml/{$filename}">
<!-- ebb: NEED TO LOOK UP HOW TO SET UP INDIVIDUAL RESULT DOCUMENTS output to folder --> <!-- ebb: NEED TO LOOK UP HOW TO SET UP INDIVIDUAL RESULT DOCUMENTS output to folder -->
<xsl:choose> <xsl:choose>
<xsl:when test="count(descendant::p) gt 1"> <xsl:when test="count(descendant::p) gt 1">
@ -57,19 +57,4 @@
</xsl:analyze-string> </xsl:analyze-string>
</div> </div>
</xsl:template> </xsl:template>
<xsl:template match="info">
<info type="{@type}">
<xsl:apply-templates/>
</info>
</xsl:template>
<xsl:template match="special">
<info type="{@type}">
<xsl:apply-templates/>
</info>
</xsl:template>
</xsl:stylesheet> </xsl:stylesheet>