erkenner.py

vollform.py

Download (Stand: 30. Januar 2001)
 
#!/usr/bin/python

import popen2
import re
import string
import sys

# vollform.py
#
# Input:   ISO 8859/1 text with punctuation etc. already separated by
#          whitespace, e.g. text that has already been tokenised
#
# Output:  per word:  word wswort info ( wsinfo info )* wsende
#          NOTE(review): the placeholders in this header were lost in the
#          original comment; they are reconstructed here from tagausgabe()
#          and ausgabeNachWort() - please confirm.
#
#          formatInfo (layout of one info item):
#                Tags:   tag
#                IMS:    tag " " lemma
#                All:    "(" lemma ", " suffixkandidat ", " lexkat ", '" tag "')"
#                Python: Python code (repr of the tuple), otherwise like All
#
#          formatExtra (layout around the info items):
#                Rebol    = [] around the info items
#                Python   = [] around the info items and "," between them
#                Alex     = alex import code, tab-separated
#                IMS      = tab after the word, space between info items
#                Printing = many blank lines and tabs
#                the latter two are mutually exclusive
#
#          mitnr 1 = the tag contains the rule number in the form "R" + number
#
#          newline
#                1 = line breaks of the input are carried over to the output
#
#          putTagsOnANewLine
#                1 = a new line is started already for the first tag;
#                    no effect if "IMS" is in formatExtra
#                    NOTE(review): ausgabeNachWort() prepends the line break
#                    regardless of formatExtra - confirm which is intended.
#
#          substWSinInfo
#                1 = info contains "_" instead of " "
#
#          substWSinTag
#                1 = tag contains "_" instead of " "
#
#          auchUngetaggt
#                1 = words that receive no tag are output as well
#                0 = such words do not appear in the output

mitnr              = 0
newline            = 0
putTagsOnANewLine  = 0
substWSinInfo      = 0
substWSinTag       = 1
auchUngetaggt      = 0
formatInfo         = ["IMS"]  # any of ["All", "Tags", "IMS", "Python"]
formatExtra        = ["IMS"]  # any of ["Printing", "Rebol", "IMS", "Python", "Alex"]

leereTags  = 0   # emit empty tag brackets "[]" for manual post-editing
nomenRaten = 0   # tag capitalised words that received no tag as nouns

# Directory prefix used to build the alex command line (binary and vollform.db).
alexdir = "../alex/"

# 1 = print a notice before the main loop starts reading input.
debug = 0

#
# normiereString()
#

def normiereString(s, nr): #{
  """Normalise character encodings and placeholder syntax in `s`.

  Two things happen, in this order:

  1. Alternative spellings of four ISO 8859-1 letters are mapped to the
     ISO 8859-1 code: the digraphs "e:", "E:", "c:", "C:" and the bytes
     0x89, 0x87, 0x80 (MS-DOS code page 437 / 850) and 0xD3 (code page
     850 only).
  2. Every ":vok:" / ":kons:" placeholder that is not already followed by
     "]" is wrapped in brackets; a directly preceding "^" is pulled inside,
     giving "[^:vok:]".

  s  -- the string to normalise
  nr -- line number, used only in the warning message

  Returns the normalised string.  If it has leading or trailing
  whitespace, a warning is written to stderr (the string is returned
  unstripped).

  Uses str methods, repr() and open-ended slices instead of the `string`
  module functions, backticks and sys.maxint, which exist only in
  Python 2.
  """
  ersetzungen = [
    ("e:",   "\xeb"),   # digraph notation (labelled "Britta" in the original)
    ("E:",   "\xcb"),   # further ones follow the same scheme
    ("c:",   "\xe7"),
    ("C:",   "\xc7"),
    ("\x89", "\xeb"),   # MS-DOS code page 437 / 850 (labelled "Thomas")
    ("\x87", "\xe7"),
    ("\x80", "\xc7"),
    ("\xd3", "\xcb"),   # code page 850 only
  ]
  # The replacements are applied strictly in this order.
  for alt, neu in ersetzungen:
    s = s.replace(alt, neu)

  for sonder in (":vok:", ":kons:"): #{
    pos = s.find(sonder)
    while pos >= 0: #{
      ende = pos + len(sonder)
      # Wrap unless the placeholder is already closed by "]".  A match at
      # position 0 can have no opening bracket and is always wrapped.
      if pos == 0 or ende >= len(s) or s[ende] != "]":
        if pos > 0 and s[pos-1] == "^":
          s = s[:pos-1] + "[^" + sonder + "]" + s[ende:]
        else:
          s = s[:pos] + "[" + sonder + "]" + s[ende:]
        pos = pos + 1     # the placeholder moved one character to the right
      #}
      pos = s.find(sonder, pos + 1)
    #}
  #}
  if s != s.strip():
    sys.stderr.write("Zusaetzliche Leerzeichen in Zeile " + repr(nr) + ".\n")
  return s
#}

#
# suffixtabelle()
# bereitet die Suffixtabelle auf
#

#
# kategorien(wort)
# liefert eine Liste der Suffixergebnisse, die im Vollformlexikon zum wort
# verzeichnet sind
#

# Start the alex lexicon process once, at module load.  popen2.popen2()
# returns the pair (child_stdout, child_stdin): lex[0] is read from and
# lex[1] is written to by kategorien().
lex = popen2.popen2(alexdir+"alex -f "+alexdir+"vollform.db -b - -i -v")
if not lex or not lex[0] or not lex[1]:
  # Report on stderr: stdout carries the tagged output, and a diagnostic
  # mixed into it would be read as data by whatever consumes that output.
  sys.stderr.write("Unable to start alex.\n")
  sys.exit(2)

def kategorien(wort): #{
  """Look up `wort` via the alex process and return its lexicon entries.

  The word is written as one line to lex[1]; one answer line is read from
  lex[0].  The answer must start with the word itself; the rest of the
  line (after one separator character) is evaluated as a Python
  expression and returned.  The main program unpacks each element as
  (tag, lexKey, suffixkandidat, lexkat, nr).

  Returns [] without asking alex when the word is empty, contains a tab
  or newline, or is 39 or more characters long, and also when alex
  answers with nothing after the word.

  Terminates the program via sys.exit(3) if alex has closed its output,
  and via sys.exit(4) if the answer does not start with the word.  Both
  diagnostics are written to stderr so that they do not end up inside
  the tagged output on stdout.
  """
  # NOTE(review): 39 is presumably a key-length limit of alex - confirm.
  if "\n" in wort or "\t" in wort or wort == "" or len(wort) >= 39:
    return []                                  # cannot be looked up
  lex[1].write(wort+"\n")
  lex[1].flush()                               # alex must see the query now
  zeile = lex[0].readline()
  if zeile == "":                              # EOF: alex has gone away
    sys.stderr.write("Premature termination of alex.\n")
    sys.exit(3)
  zeile = zeile.rstrip()
  splitpos = len(wort)
  if wort != zeile[:splitpos]:
    sys.stderr.write("Error while looking up '" + wort +
                     "': Alex is not properly synchronized; got " +
                     zeile + ".\n")
    sys.exit(4)
  zeile = zeile[splitpos+1:].strip()           # skip word and separator
  if zeile == "":
    return []
  # NOTE(review): eval() executes whatever the alex process prints.  That
  # is only acceptable while alex and vollform.db are trusted; consider a
  # literal-only parser if the database can come from elsewhere.
  return eval(zeile)
#}


def klein(wort): #{
  """Return `wort` in lower case, including ISO 8859-1 accented capitals.

  After the ordinary lower() call, every character still in the range
  0xC0-0xDE is shifted up by 0x20 to its lower-case counterpart.  Two
  codes in the neighbourhood are deliberately left alone: 0xD7 is the
  multiplication sign (0xF7 would be the division sign) and 0xDF is the
  sharp s, which is already a lower-case letter.  The previous bit-mask
  test converted both of them.
  """
  zeichen = []
  for buchstabe in wort.lower(): #{
    code = ord(buchstabe)
    if 0xc0 <= code <= 0xde and code != 0xd7:
      buchstabe = chr(code + 0x20)
    zeichen.append(buchstabe)
  #}
  return "".join(zeichen)
#}
  
#
# tagausgabe(...): Format siehe oben
#

def ausgabeWort(wort): #{
  """Add `wort` to the end of the module-level list `ausgabeliste`."""
  ausgabeliste.extend([wort])
#}

def tagausgabe(tag, lemma, suffixkandidat, lexkat, nr):
  """Format one analysis of a word and append it to `ausgabeliste`.

  tag, lemma, suffixkandidat, lexkat -- strings describing the analysis
  nr                                 -- rule number (the main program
                                        passes -1 for guessed/empty tags)

  The module-level settings control the result:
    mitnr         -- insert " R" + repr(nr) before the last character of
                     the tag (normally its closing bracket)
    substWSinTag  -- replace space, tab and newline in the tag by "_"
    formatInfo    -- which layouts to emit; the pieces for "Tags", "IMS",
                     "Python" and "All" are concatenated in that order
    substWSinInfo -- replace space, tab and newline in the finished info
                     string by "_"

  Uses repr() and str methods instead of backticks and the `string`
  module functions, which exist only in Python 2.  An empty tag with
  mitnr set no longer raises IndexError.
  """
  leerraum = (" ", "\t", "\n")
  if mitnr:
    # Slices instead of an index so that an empty tag does not fail.
    tag = tag[:-1] + " R" + repr(nr) + tag[-1:]
  if substWSinTag:
    for ws in leerraum:
      tag = tag.replace(ws, "_")

  teile = []
  if "Tags" in formatInfo:
    teile.append(tag)
  if "IMS" in formatInfo:
    teile.append(tag + " " + lemma)
  if "Python" in formatInfo:
    teile.append(repr((tag, lemma, suffixkandidat, lexkat, nr)))
  if "All" in formatInfo:
    teile.append("(" + lemma  + ", "  + suffixkandidat + ", "
                     + lexkat + ", '" + tag  + "')")
  info = "".join(teile)

  if substWSinInfo:
    for ws in leerraum:
      info = info.replace(ws, "_")
  ausgabeliste.append(info)
#}

def ausgabeNachWort(ausgabeliste): #{
  """Write one word together with its info items to stdout.

  ausgabeliste -- list of strings: element 0 is the word, the remaining
                  elements are its info items

  The line is built as  word + wswort + info (wsinfo info)* + wsende.
  The three separators start out as a single space and are changed by the
  module-level settings:
    "IMS" in formatExtra      -- tab after the word, newline at the end
    "Printing" in formatExtra -- tab after the word, newline + tab between
                                 items, blank line at the end (overrides
                                 "IMS" if both are given)
    putTagsOnANewLine         -- newline in front of wswort
    "Rebol" in formatExtra    -- "[ " ... " ]" around the items
    "Python" in formatExtra   -- repr() of the word, ": [ " ... " ]"
                                 around the items and "," between them
    "Alex" in formatExtra     -- only if "Python" is absent: tab + "[ "
                                 ... " ]" around the items, "," between

  Nothing is written for a word without info items unless auchUngetaggt
  is set, and nothing is written for an empty list (which used to raise
  IndexError).

  The three formerly duplicated assembly branches are merged, and the
  Python-2-only sys.maxint slices, backticks and string.join are gone.
  """
  if not ausgabeliste:
    return
  wswort = wsinfo = wsende = " "
  if "IMS" in formatExtra:
    wswort, wsinfo, wsende = "\t", " ", "\n"
  if "Printing" in formatExtra:
    wswort, wsinfo, wsende = "\t", "\n\t", "\n\n"
  if putTagsOnANewLine:
    wswort = "\n" + wswort
  if "Rebol" in formatExtra: #{
    wswort = wswort + "[ " + wsinfo
    wsende = wsinfo + " ]" + wsende
  #}

  kopf = ausgabeliste[0]
  if "Python" in formatExtra: #{
    kopf = repr(kopf)
    wswort = wswort + ": [ " + wsinfo
    wsende = wsinfo + " ]" + wsende
    wsinfo = "," + wsinfo          # must follow the two lines above
  #}
  elif "Alex" in formatExtra: #{
    wswort = wswort + "\t[ " + wsinfo
    wsende = wsinfo + " ]" + wsende
    wsinfo = "," + wsinfo          # must follow the two lines above
  #}

  ausgabe = kopf + wswort + wsinfo.join(ausgabeliste[1:]) + wsende
  if auchUngetaggt or len(ausgabeliste) > 1:
    sys.stdout.write(ausgabe)
#}

def ausgabeNachZeile(): #{
  """Write a line break to stdout if the module-level `newline` is set."""
  if not newline:
    return
  sys.stdout.write("\n")
#}

#
# Hauptprogramm
#


if debug:
  sys.stdout.write("erwarte Eingabe" + "\n")

# Process standard input line by line; readline() returns "" only at the
# end of the input.
while 1: #{
  eingabe = sys.stdin.readline()
  if eingabe == "":
    break
  ausgabeliste = []
  for wort in eingabe.split(): #{              # one word at a time
    ausgabeWort(wort)
    treffer = kategorien(wort)
    for eintrag in treffer: #{
      (tag, lexKey, suffixkandidat, lexkat, nr) = eintrag
      tagausgabe(tag, lexKey, suffixkandidat, lexkat, nr)
    #}
    if not treffer: #{
      # No lexicon entry: optionally guess a noun for capitalised words,
      # otherwise optionally emit an empty tag.
      ersatztag = None
      if nomenRaten and len(wort) > 0 and wort[0] != klein(wort[0]):
        ersatztag = "[n nom sg -def fem]"
      elif leereTags:
        ersatztag = "[]"
      if ersatztag is not None:
        tagausgabe(ersatztag, "", "", "", -1)
    #}
    ausgabeNachWort(ausgabeliste)
    ausgabeliste = []                          # start afresh for next word
  #}
  ausgabeNachZeile()
#}