tokenize.py

Download (Stand: 29. Januar 2001)
 
#!/usr/bin/python

# tokenize.py
# Input:  formatted ISO 8859-1 text
# Output: see anforderungen.txt

import string
import sys

moegliche_absaetze_angeben = 0      # \n einschalten
moegliche_absaetze_raten = 1        # Absaetze raten mit
absatz_schwelle = 15                # Positionsschwelle

bindestrichSonderbehandlung = 1
apostrophSonderbehandlung   = 1

maxlaenge = 72         # normalerweise sind Zeilen in etwa so lang

letters = string.lowercase + string.uppercase + "\xeb\xcb\xe7\xc7"

def is_upper(zeichen):
  """Return 1 if *zeichen* is an upper-case letter, otherwise 0.

  Upper-case means ASCII "A".."Z" or one of the two ISO 8859-1 codes
  "\\xc7" / "\\xcb".  *zeichen* must be a single character (ord() raises
  TypeError otherwise).  The result is the integer 1 or 0, not a bool,
  because the caller compares it with ``== 1``.
  """
  if ord("A") <= ord(zeichen) <= ord("Z"):
    return 1
  # Plain membership test instead of the deprecated string.find() helper;
  # equivalent here because zeichen is exactly one character long.
  if zeichen in "\xc7\xcb":
    return 1
  return 0

# Main loop: read stdin line by line, split every line into tokens separated
# by single spaces and write the tokens to stdout.  Consecutive input lines
# are joined onto one output line; a "\n" is written for a run of blank input
# lines and, optionally, where a paragraph break is guessed.
#
# NOTE(review): the "\x.." literals below denote ISO 8859-1 byte values; on a
# Python 3 interpreter they only match if stdin/stdout are opened as
# ISO 8859-1 -- confirm before running the script there.

# Replacements applied to every raw input line, in this order.
zeichenersetzungen = [
  # quoted-printable escapes left over from mail transport
  ("=EB", "\xeb"),
  ("=E7", "\xe7"),
  ("=CB", "\xcb"),
  ("=C7", "\xc7"),
  ("=20", " "),
  ("=A0", " "),
  ("=2E", "."),     # escape for "." in mails
  ("=09", "\t"),
  ("=46", "F"),     # escape for "From" in mails
  # "=3D" (escape for "=") has to be decoded after every other "=XX" escape:
  # otherwise the "=" it produces could be read as the start of a new escape
  # (e.g. "=3D46" would end up as "F" instead of "=46").
  ("=3D", "="),
  # MS-DOS code page 437 / 850 codes -> ISO 8859-1
  ("\x89", "\xeb"),
  ("\x87", "\xe7"),
  ("\x80", "\xc7"),
  ("\xd3", "\xcb"),  # code page 850 only
]
# Not enabled: decoding of the "e:" / "E:" / "c:" / "C:" digraph notation
# into "\xeb" / "\xcb" / "\xe7" / "\xc7" (the original note attributes this
# notation to "Britta"; further digraphs would follow the same scheme).

# Characters that always form a token of their own.
einzeichentoken = [".", ",", ":", ";", "!", "?", "(", ")",
                   '"', "&", "/", "@", "%",
                   "<", ">", "[", "]", "{", "}",
                   "=", "#"]
if not bindestrichSonderbehandlung:
  einzeichentoken.append("-")
if not apostrophSonderbehandlung:
  einzeichentoken.append("'")

absatz_schon_markiert = 1   # 1 while no token follows the last "\n" written for a blank line
in_zeile = 0                # 1 once something was written to the current output line
moeglicher_absatz = 0       # percentage by which the previous text line fell short of maxlaenge

eingabe = sys.stdin.readline()
while eingabe != "":
  for alt, neu in zeichenersetzungen:
    eingabe = eingabe.replace(alt, neu)
  # Length of the decoded line (line terminator included), measured before
  # any token separators are inserted.
  laenge = len(eingabe)

  if eingabe.strip() == "":
    # Whitespace-only line: end the paragraph, but only once per run of
    # blank lines.
    if absatz_schon_markiert == 0:
      sys.stdout.write("\n")
      absatz_schon_markiert = 1
      in_zeile = 0
  else:
    #
    # Set punctuation etc. apart from the neighbouring text
    #
    for satzzeichen in einzeichentoken:
      eingabe = eingabe.replace(satzzeichen, " " + satzzeichen + " ")
    #
    # Optional special handling of hyphens: a hyphen directly after an
    # upper-case letter (abbreviation) or after "ish" / "Ish" stays
    # attached; every other hyphen becomes a token of its own.
    #
    if bindestrichSonderbehandlung:
      pos = eingabe.find("-")
      while pos >= 0:
        nach_grossbuchstabe = (pos > 0) and (is_upper(eingabe[pos-1]) == 1)
        nach_ish = (pos > 2) and (eingabe[pos-3:pos] in ["Ish", "ish"])
        if not (nach_grossbuchstabe or nach_ish):
          eingabe = eingabe[:pos] + " - " + eingabe[pos+1:]
          pos = pos + 1   # the hyphen moved one place to the right
        pos = eingabe.find("-", pos + 1)
    #
    # Optional special handling of apostrophes: after a stand-alone
    # s, m or t (either case, not preceded by another letter) the
    # apostrophe stays attached to that letter; every other apostrophe
    # becomes a token of its own.
    #
    if apostrophSonderbehandlung:
      pos = eingabe.find("'")
      while pos >= 0:
        if ((pos > 0)
            and (eingabe[pos-1] in "sSmMtT")
            and not ((pos > 1) and (eingabe[pos-2] in letters))):
          eingabe = eingabe[:pos] + "' " + eingabe[pos+1:]
        else:
          eingabe = eingabe[:pos] + " ' " + eingabe[pos+1:]
          pos = pos + 1   # the apostrophe moved one place to the right
        pos = eingabe.find("'", pos + 1)
    #
    # Tidy up whitespace: tabs/newlines become blanks, runs of blanks
    # collapse to a single blank, both ends are stripped.
    #
    for whzeichen in ["\t", "\n"]:
      eingabe = eingabe.replace(whzeichen, " ")
    while "  " in eingabe:
      eingabe = eingabe.replace("  ", " ")
    eingabe = eingabe.strip()
    #
    # Optionally detect a possible paragraph break from the length of the
    # previous line (moeglicher_absatz still holds that line's score).
    #
    if absatz_schon_markiert == 0 and moeglicher_absatz > 0:
      if moegliche_absaetze_angeben == 1:
        # Emit a literal backslash-n marker followed by the score.
        eingabe = "\\n" + repr(moeglicher_absatz) + " " + eingabe
      elif moegliche_absaetze_raten == 1:
        if moeglicher_absatz > absatz_schwelle:
          sys.stdout.write("\n")
          in_zeile = 0
    if laenge > maxlaenge:
      maxlaenge = laenge
    # Integer percentage by which this line is shorter than the longest
    # line so far; "//" keeps the division integral on every interpreter.
    moeglicher_absatz = 100 - 100 * laenge // maxlaenge
    #
    # Write the tokens
    #
    if in_zeile == 1:
      eingabe = " " + eingabe
    sys.stdout.write(eingabe)
    in_zeile = 1
    absatz_schon_markiert = 0

  eingabe = sys.stdin.readline()