erkenner.py

Download (Stand: 30. Januar 2001)
 
#!/usr/bin/python

import popen2
import re
import string
import sys

# erkenner.py 
#
# Eingabe: ISO 8859/1 Text, Satzzeichen etc. abgerueckt, z.B. bereits
#          tokenisierter Text
#
# Ausgabe: je Wort:  <wort> <wswort> [ <info> ( <wsinfo> <info> )* ] <wsende>
#
#          formatInfo: 
#                Tags:   <tag>
#                IMS:    <tag> " " <lemma>
#                All:    "(" <lemma> ", " <suffixkandidat> ", " <lexkat> ", '" <tag> "')"
#                Python: Python-Code, sonst wie All
#
#          formatExtra: 
#                Rebol    = [] um info herum
#                Python   = [] um info herum und "," dazwischen
#                Alex     = alex-import Code Tab-separiert
#                IMS      = Tab nach <wort>, Leerzeichen zwischen <info>
#                Printing = viele Leerzeilen und Tabs
#                letztere beide schliessen sich gegenseitig aus
#
#          mitnr 1 = Tag enthaelt Regelnummer in der Form "R<nr>"
#          
#          newline 
#                1 = Zeilenumbrueche der Eingabe werden uebernommen 
#      
#          putTagsOnANewLine
#                1 = bereits fuer das erste Tag wird eine neue Zeile genommen
#                    keine Wirkung, wenn "IMS" in formatExtra
# 
#          substWSinInfo
#                1 = info enthaelt "_" anstatt " "
#
#          substWSinTag
#                1 = tag enthaelt "_" anstatt " "
#
#          auchUngetaggt
#                1 = Woerter, die kein Tag erhalten, werden auch ausgegeben
#                0 = solche Woerter erscheinen nicht in der Ausgabe   
#

# Output switches read by tagausgabe(), ausgabeNachWort() and
# ausgabeNachZeile(); 0 = off, 1 = on.
mitnr = 0              # insert " R<rule number>" before the tag's last character
newline = 0            # write a newline after every processed input line
putTagsOnANewLine = 0  # put a newline in front of the word/tag separator
substWSinInfo = 0      # replace blank, tab and newline in the info by "_"
substWSinTag = 0       # replace blank, tab and newline in the tag by "_"
auchUngetaggt = 1      # also print words that received no tag
# Info representations to emit; recognised: "Tags", "IMS", "Python", "All".
formatInfo = ["Tags"]
# Layout variants; recognised: "IMS", "Printing", "Rebol", "Python", "Alex".
formatExtra = ["Printing"]

# Fallbacks for words that received no tag at all.
leereTags = 0    # emit an empty tag "[]" for manual post-editing
nomenRaten = 0   # tag capitalised words as "[n nom sg -def fem]"

# Prefix for both the alex executable and its albanian.db database.
alexdir = "../alex/"

# Non-zero enables the diagnostic output on stdout.
debug = 0

# Lexicon categories that are not printed when a word's tags are taken
# straight from its lexicon entry (i.e. the matching rule has no output tag).
ignorieren = [
  "[a]",
  "[n fem]", "[fem]", "[f]",
  "[n masc]", "[masc]", "[m]",
  "[ptl conj]",
  "[v]",
]

#"[adv]",
#"[conj]",
#"[excl]",
#"[indp]",         
#"[n]",             
#"[num]",
#"[onomat]",
#"[prep]",
#]

#
# normiereString()
#

def normiereString(s, nr):
  """Normalise one field of the suffix table.

  Two things happen, in this order:

  1. Alternative spellings of e-diaeresis and c-cedilla are mapped to their
     ISO 8859-1 codes: the ASCII digraphs "e:", "E:", "c:", "C:" and the
     MS-DOS code page 437/850 codes 0x89, 0x87, 0x80 (and 0xd3, code page
     850 only).
  2. Every ":vok:" / ":kons:" placeholder that is not directly followed by
     "]" is wrapped in square brackets; a "^" immediately in front of the
     placeholder is pulled inside the brackets ("^:kons:" -> "[^:kons:]").

  s  -- the field text
  nr -- line number of the field, used only in the warning message

  Returns the normalised string.  If it has leading or trailing whitespace,
  a warning naming line nr is written to stderr (the string is returned
  unstripped).
  """
  ersetzungen = [
    ("e:", "\xeb"), ("E:", "\xcb"),      # ASCII digraph notation
    ("c:", "\xe7"), ("C:", "\xc7"),
    ("\x89", "\xeb"), ("\x87", "\xe7"),  # MS-DOS code page 437 / 850
    ("\x80", "\xc7"),
    ("\xd3", "\xcb"),                    # code page 850 only
  ]
  for alt, neu in ersetzungen:
    s = s.replace(alt, neu)

  for sonder in (":vok:", ":kons:"):
    pos = s.find(sonder)
    while pos >= 0:
      ende = pos + len(sonder)
      # A placeholder followed by "]" is taken to be inside a character
      # class already and is left alone.
      if pos == 0 or ende >= len(s) or s[ende] != "]":
        if pos > 0 and s[pos-1] == "^":
          s = s[:pos-1] + "[^" + sonder + "]" + s[ende:]
        else:
          s = s[:pos] + "[" + sonder + "]" + s[ende:]
        pos = pos + 1    # the placeholder moved one position to the right
      pos = s.find(sonder, pos + 1)

  if s != s.strip():
    sys.stderr.write("Zusaetzliche Leerzeichen in Zeile " + repr(nr) + ".\n")
  return s

#
# suffixtabelle()
# bereitet die Suffixtabelle auf
#

def _ohneFuehrendeNull(feld):
  """Return feld without one leading "0", if it has one."""
  if feld[:1] == "0":
    return feld[1:]
  return feld


def suffixtabelle(dateiname):
  """Read the suffix table from dateiname and index it by ending.

  Blank lines and lines whose first non-blank character is "#" are skipped.
  Every other line must consist of exactly five tab-separated fields

      search pattern, ending, replacement, lexical category, output tag

  otherwise it is reported on stderr and ignored.  A single leading "0" is
  cut off each field (presumably the table's notation for an empty field --
  confirm against endungen.txt).  The first four fields are passed through
  normiereString().  In the search pattern the placeholders ":vok:" and
  ":kons:" are expanded to the vowel set and its negation, and the result
  is compiled as the regular expression "^.*<pattern>$".

  Returns a dictionary that maps each ending to a list of tuples
  (compiled pattern, replacement, lexical category, output tag, line
  number), in file order.  If the file cannot be opened, a message is
  written to stderr and an empty dictionary is returned.
  """
  tabelle = {}
  try:
    f = open(dateiname, "r")
  except IOError:
    # open() raises rather than returning a false value, so the failure
    # has to be caught here to reach the intended message.
    sys.stderr.write("Oeffnen der Datei " + str(dateiname)
                     + " fehlgeschlagen\n")
    return tabelle

  try:
    nr = 0
    while 1:
      zeile = f.readline()
      if zeile == "":
        break
      nr = nr + 1
      szeile = zeile.strip()
      if szeile == "" or szeile[0] == "#":
        continue
      eintrag = zeile.split("\t")
      if len(eintrag) != 5:
        sys.stderr.write("Fehler in Zeile " + repr(nr) + " ignoriert: "
                         + zeile)
        continue

      suche = _ohneFuehrendeNull(normiereString(eintrag[0], nr))
      # Order matters: "^:kons:" (not a consonant) becomes the vowel set,
      # a remaining ":kons:" becomes the negated vowel set, and only then
      # is ":vok:" spelled out.
      suche = suche.replace("^:kons:", ":vok:")
      suche = suche.replace(":kons:", "^:vok:")
      # \xeb is e-diaeresis in ISO 8859-1; written as an escape so the
      # pattern does not depend on the encoding of this source file.
      suche = suche.replace(":vok:", "ae\xebioyu")
      suche = re.compile("^.*" + suche + "$")

      endung = _ohneFuehrendeNull(normiereString(eintrag[1], nr))
      ersatz = _ohneFuehrendeNull(normiereString(eintrag[2], nr))
      lexkat = _ohneFuehrendeNull(normiereString(eintrag[3], nr))
      # The last field still carries the line end; drop trailing whitespace.
      ausgabe = _ohneFuehrendeNull(eintrag[4].rstrip())
      if ausgabe != ausgabe.strip():
        sys.stderr.write("Zusaetzliche Leerzeichen in Zeile " + repr(nr)
                         + ".\n")

      element = (suche, ersatz, lexkat, ausgabe, nr)
      tabelle.setdefault(endung, []).append(element)
  finally:
    f.close()
  return tabelle
  
#
# kategorien(lemma)
# liefert eine Liste der Kategorien, die im Lexikon zum lemma
# verzeichnet sind
#

# Start alex once at load time.  kategorien() writes lemmas to lex[1] and
# reads the answers from lex[0].
lex = popen2.popen2("%salex -f %salbanian.db -b - -i -v" % (alexdir, alexdir))
if not (lex and lex[0] and lex[1]):
  sys.stdout.write("Unable to start alex.\n")
  sys.exit(2)

def kategorien(lemma):
  """Return the list of categories the lexicon holds for lemma.

  The lemma is sent as one line to the alex process (lex[1]) and one
  answer line is read back (lex[0]).  The answer must have the form
  "<lemma>:<tag>,<tag>,...".

  A lemma that is empty, contains ":", newline or tab, or is 39 or more
  characters long is treated as not look-up-able: [] is returned without
  contacting alex.

  Some lexicon tags are treated as errors in the lexicon: they are kept,
  the corrected tag is appended right after them, and a note is written to
  stderr.

  Returns the tags, stripped of surrounding whitespace, in lexicon order.
  Terminates the program via sys.exit(3) if alex has closed its output and
  via sys.exit(4) if the answer does not start with "<lemma>:".
  """
  if lemma == "" or len(lemma) >= 39:          # not look-up-able
    return []
  for zeichen in (":", "\n", "\t"):            # would break the line protocol
    if zeichen in lemma:
      return []

  lex[1].write(lemma + "\n")
  lex[1].flush()
  zeile = lex[0].readline()
  if zeile == "":
    sys.stdout.write("Premature termination of alex.\n")
    sys.exit(3)
  zeile = zeile.rstrip()
  splitpos = zeile.find(":")
  # An answer without ":" cannot belong to this lemma either; checking it
  # explicitly avoids slicing with splitpos == -1.
  if splitpos < 0 or lemma != zeile[:splitpos]:
    sys.stdout.write("Error while looking up '" + lemma
                     + "': Alex is not properly synchronized; got "
                     + zeile + ".\n")
    sys.exit(4)
  rest = zeile[splitpos+1:].strip()
  if rest == "":
    tags = []
  else:
    tags = rest.split(",")

  if debug:
    sys.stdout.write("Rohtags zu " + lemma + ": " + repr(tags) + "\n")

  # Lexicon tag -> tag that is additionally reported for it.
  korrektur = {
    "trans": "[v]", "intrans": "[v]", "v": "[v]",
    "[trans]": "[v]", "[intrans]": "[v]",
    "[f]": "[n fem]", "[fem]": "[n fem]",
    "[m]": "[n masc]", "[masc]": "[n masc]",
    "[ptl conj]": "[ptl subj]",
  }
  ntags = []
  for atag in tags:
    tag = atag.strip()
    ntags.append(tag)
    zusatz = korrektur.get(tag)
    if zusatz:
      sys.stderr.write("Fehler im Lexikon zu " + lemma + "\n")
      ntags.append(zusatz)
  if debug:
    sys.stdout.write("Tags zu " + lemma + ": " + repr(ntags) + "\n")
  return ntags


def klein(wort):
  """Return wort in lower case, including ISO 8859-1 accented letters.

  The string's own lower() is applied first.  Any character that is then
  still an ISO 8859-1 capital letter (codes 0xC0-0xDE) is moved to its
  lower-case counterpart 0x20 positions higher.  Code 0xD7 (multiplication
  sign) lies in that range but is not a letter, and 0xDF (sharp s) is
  already lower case, so both are left unchanged.
  """
  zeichen = []
  for buchstabe in wort.lower():
    code = ord(buchstabe)
    if 0xc0 <= code <= 0xde and code != 0xd7:
      buchstabe = chr(code + 0x20)
    zeichen.append(buchstabe)
  return "".join(zeichen)
  
#
# tagausgabe(...): Format siehe oben
#

def ausgabeWort(wort): #{
  """Append wort to the module-level list ausgabeliste.

  The main loop calls this once per word before any tagausgabe() call, so
  the word is element 0 of the list, which is the element
  ausgabeNachWort() prints as the word.
  """
  ausgabeliste.append(wort)
#}

def _unterstrichStattWS(text):
  """Return text with every blank, tab and newline replaced by "_"."""
  for ws in (" ", "\t", "\n"):
    text = text.replace(ws, "_")
  return text


def tagausgabe(tag, lemma, suffixkandidat, lexkat, nr):
  """Format one analysis and append it to the module-level ausgabeliste.

  tag            -- the tag to report
  lemma          -- lexicon key the analysis is based on
  suffixkandidat -- the ending that was split off the word
  lexkat         -- lexical category required by the rule
  nr             -- number of the rule (line of the suffix table)

  If mitnr is set, " R<nr>" is inserted before the last character of the
  tag (before the closing bracket of a "[...]" tag); for an empty tag it
  is simply appended.  If substWSinTag is set, whitespace in the tag is
  replaced by "_".

  The info string is the concatenation of the representations selected in
  formatInfo:

      "Tags"    the tag
      "IMS"     the tag, a blank, the lemma
      "Python"  repr() of the tuple (tag, lemma, suffixkandidat, lexkat, nr)
      "All"     "(<lemma>, <suffixkandidat>, <lexkat>, '<tag>')"

  If substWSinInfo is set, whitespace in the whole info string is replaced
  by "_".
  """
  if mitnr:
    # Slices instead of indexing, so an empty tag does not raise.
    tag = tag[:-1] + " R" + repr(nr) + tag[-1:]
  if substWSinTag:
    tag = _unterstrichStattWS(tag)

  teile = []
  if "Tags" in formatInfo:
    teile.append(tag)
  if "IMS" in formatInfo:
    teile.append(tag + " " + lemma)
  if "Python" in formatInfo:
    teile.append(repr((tag, lemma, suffixkandidat, lexkat, nr)))
  if "All" in formatInfo:
    teile.append("(" + lemma + ", " + suffixkandidat + ", "
                 + lexkat + ", '" + tag + "')")
  info = "".join(teile)

  if substWSinInfo:
    info = _unterstrichStattWS(info)
  ausgabeliste.append(info)

def ausgabeNachWort(ausgabeliste):
  """Write one word and its collected infos to stdout.

  ausgabeliste -- list whose element 0 is the word and whose remaining
                  elements are the info strings gathered for it

  The text written is

      <word> <wswort> <info> ( <wsinfo> <info> )* <wsende>

  where the three separators start out as single blanks and are modified
  by the entries of formatExtra:

      "IMS"       tab after the word, blank between infos, newline at end
      "Printing"  tab after the word, newline+tab between infos, blank
                  line at the end (overrides "IMS" if both are given)
      "Rebol"     "[ ... ]" around the infos
      "Python"    repr() of the word, ": [ ... ]" around the infos and ","
                  between them
      "Alex"      tab and "[ ... ]" around the infos, "," between them
                  (ignored if "Python" is given)

  putTagsOnANewLine puts a newline in front of the separator after the
  word; as documented in the file header it has no effect when "IMS" is in
  formatExtra.

  Nothing is written for a word without infos unless auchUngetaggt is set.
  """
  wswort = " "
  wsinfo = " "
  wsende = " "
  if "IMS" in formatExtra:
    wswort, wsinfo, wsende = "\t", " ", "\n"
  if "Printing" in formatExtra:
    wswort, wsinfo, wsende = "\t", "\n\t", "\n\n"
  if putTagsOnANewLine and "IMS" not in formatExtra:
    wswort = "\n" + wswort
  if "Rebol" in formatExtra:
    wswort = wswort + "[ " + wsinfo
    wsende = wsinfo + " ]" + wsende

  wort = ausgabeliste[0]
  if "Python" in formatExtra:
    wswort = wswort + ": [ " + wsinfo
    wsende = wsinfo + " ]" + wsende
    wsinfo = "," + wsinfo
    wort = repr(wort)
  elif "Alex" in formatExtra:
    wswort = wswort + "\t[ " + wsinfo
    wsende = wsinfo + " ]" + wsende
    wsinfo = "," + wsinfo

  ausgabe = wort + wswort + wsinfo.join(ausgabeliste[1:]) + wsende
  if auchUngetaggt or len(ausgabeliste) > 1:
    sys.stdout.write(ausgabe)

def ausgabeNachZeile():
  """Write a newline to stdout if the newline setting is on.

  Called after all words of one input line have been written.
  """
  if not newline:
    return
  sys.stdout.write("\n")

#
# Hauptprogramm
#

# Reads stdin line by line, splits every line at whitespace and, for each
# word, tries every split into beginning + ending (from "whole word is the
# ending" to "empty ending").  For each ending found in the suffix table,
# every rule whose pattern matches the beginning is checked against the
# lexicon, and all resulting tags are collected and printed with the word.

endungen = suffixtabelle("endungen.txt")


#if debug: print endungen
#if debug: print kategorien("LDK")
if debug:
  sys.stdout.write("erwarte Eingabe\n")

eingabe = sys.stdin.readline()                 # process line by line
while eingabe != "":                           # "" means end of input
  ausgabeliste = []
  for wort in eingabe.split():                 # process word by word
    tagausgegeben = 0
    ausgabeWort(wort)
    for suffixabpos in range(len(wort) + 1):   # every possible split point
      suffixkandidat = wort[suffixabpos:]
      if debug:
        sys.stdout.write("Suffixkandidat: " + repr(suffixkandidat) + "\n")
      if suffixkandidat not in endungen:       # no rule for this ending
        continue
      wortanfang = wort[:suffixabpos]
      if debug:
        sys.stdout.write("gefunden. Wortanfang: " + repr(wortanfang) + "\n")
      for (suche, ersatz, lexkat, ausgabe, nr) in endungen[suffixkandidat]:
        if not (suche.match(klein(wortanfang)) or suche.match(wortanfang)):
          continue
        lexKey = wortanfang + ersatz
        lexEintrag = kategorien(klein(lexKey))
        if not lexEintrag:
          lexEintrag = kategorien(lexKey)      # maybe capitalised after all?
        if debug:
          sys.stdout.write(lexKey + ": " + repr(lexEintrag)
                           + ", noetig: " + repr(lexkat) + "\n")
        # A rule applies if it needs no category but names a tag, if the
        # lexicon lists the required category, or if it names no tag and
        # the lexicon has any entry at all.
        if (lexkat == "" and ausgabe != "") \
        or (lexkat in lexEintrag) \
        or (ausgabe == "" and lexEintrag != []):
          if debug:
            sys.stdout.write("Treffer\n")
          if ausgabe == "" and lexEintrag != []:
            # No tag in the rule: report the lexicon's own categories.
            tags = [i for i in lexEintrag if i not in ignorieren]
          else:
            tags = [ausgabe]
          for tag in tags:
            tagausgegeben = 1
            tagausgabe(tag, lexKey, suffixkandidat, lexkat, nr)
        else:
          if debug:
            sys.stdout.write("Kategorie nicht gefunden\n")
    if not tagausgegeben:
      if nomenRaten and len(wort) > 0 and wort[0] != klein(wort[0]):
        tag = "[n nom sg -def fem]"
        tagausgabe(tag, "", "", "", -1)
      elif leereTags:
        tag = "[]"
        tagausgabe(tag, "", "", "", -1)
    ausgabeNachWort(ausgabeliste)
    ausgabeliste = []
  ausgabeNachZeile()
  eingabe = sys.stdin.readline()               # next line