X-Git-Url: https://git.p6c8.net/selfforum.git/blobdiff_plain/ba659b53059e637777865e646f0f2a6fb7f2988e..f91793544f0d5c7f978e5cf6214d9882d7c4db7c:/selfforum-cgi/shared/Encode/Plain.pm diff --git a/selfforum-cgi/shared/Encode/Plain.pm b/selfforum-cgi/shared/Encode/Plain.pm index cf7ef06..2379e38 100644 --- a/selfforum-cgi/shared/Encode/Plain.pm +++ b/selfforum-cgi/shared/Encode/Plain.pm @@ -1,402 +1,536 @@ -# Encode/Plain.pm - -# ==================================================== -# Autor: n.d.p. / 2001-01-07 -# lm : n.d.p. / 2001-02-06 -# ==================================================== -# Funktion: -# Codierung von non-ASCII-Zeichen fuer -# HTML -# ==================================================== - -use strict; - package Encode::Plain; -require 5.6.0; +################################################################################ +# # +# File: shared/Encode/Plain.pm # +# # +# Authors: André Malo # +# # +# Description: Encode text for HTML Output (entities, spaces) # +# # +################################################################################ -use vars qw(@ISA @EXPORT %sonder %unimap $utf8); +use strict; +use vars qw( + @EXPORT + %sonder + %unimap + $utf8 + $v56 +); + +################################################################################ +# +# Version check +# +# last modified: +# $Date$ (GMT) +# by $Author$ +# +sub VERSION {(q$Revision$ =~ /([\d.]+)\s*$/)[0] or '0.0'} -# ==================================================== -# Funktionsexport -# ==================================================== +$v56 = eval {local $SIG{__DIE__}; require 5.6.0;}; -require Exporter; -@ISA = qw(Exporter); +################################################################################ +# +# Export +# +use base qw(Exporter); @EXPORT = qw(plain multiline toUTF8); -################################ -# sub plain +### sub myunpack ############################################################### +# +# if perl version < 5.6 use myunpack instead of unpack 'U' ;( # -# einfache Sonderzeichen -> -# Entity-Codierung -################################ +# Params: $string - UTF8-encoded string to unpack +# +# Return: Number - unpacked UTF8 +# +sub myunpack ($) { + return unless defined $_[0]; + + my @c = map {ord} split // => shift; + + return ($c[0] & 31) << 6 | $c[1] & 63 + if ( + @c == 2 + and ($c[0] & 224) == 192 + and ($c[1] & 192) == 128 + ); + + return ($c[0] & 15) << 12 | ($c[1] & 63) << 6 | $c[2] && 63 + if ( + @c == 3 + and ($c[0] & 240) == 224 + and ($c[1] & 192) == 128 + and ($c[2] & 192) == 128 + ); + + return; +} +### sub plain ################################################################## +# +# encode characters of plain text into entities for HTML output +# (includes < > " &) +# (excludes space problem) +# +# Params: $old - String (or scalar reference) to encode +# $ref - (optional) (hash reference) Options +# (-amp -except -utf8) +# +# Return: encoded string (or scalar reference) +# sub plain ($;$) { - my ($old,$ref)=@_; + my ($old, $ref) = @_; my $exreg; - return \'' unless (defined $old); + return unless (defined $old); + + my $new = ref ($old) ? $$old : $old; + $ref = $ref || {}; + $new ='' unless (defined $new); - my $new=(ref ($old))?$$old:$old;; - $ref=($ref or {}); + my $unicode = defined ($ref -> {-utf8}) + ? $ref -> {-utf8} + : $utf8; - # Ausnahmen - my $except=exists($ref->{-except}); + # Exceptions + # + my $except = exists($ref->{-except}); if ($except) { - # Referenz, also Liste uebergeben -> umwandeln in Regex if (ref ($ref -> {-except})) { - $exreg = join ('|',map {quotemeta $_} @{$ref -> {-except}});} - - # keine Referenz, also Regex angegeben + # turn list into a regex + # + $exreg = join '|' => map {quotemeta $_} @{$ref -> {-except}}; + } else { + # quote regex delimiters + # $exreg = $ref -> {-except}; - $exreg =~ s/\//\\\//g;}} # LTS :-) + $exreg =~ s|/|\\/|g; + } + } + # encode the &-character + # if (lc($ref->{-amp}) eq 'soft') { if ($except) { - $new=~s/($exreg)|(?:\&(?!(?:#[Xx][\da-fA-F]+|#\d+|[a-zA-Z]+);))/(length($1))?$1:'&'/eg;} - + $new=~s/($exreg)|(?:\&(?!(?:#[Xx][\da-fA-F]+|#\d+|[a-zA-Z]+);))/defined($1)?$1:'&'/eg; + } else { - $new=~s/\&(?!(?:#[Xx][\da-fA-F]+|#\d+|[a-zA-Z]+);)/&/g;}} - + $new=~s/\&(?!(?:#[Xx][\da-fA-F]+|#\d+|[a-zA-Z]+);)/&/g; + } + } elsif (lc($ref->{-amp}) ne 'no') { if ($except) { - $new=~s/($exreg)|\&/(length($1))?$1:'&'/eg;} - + $new=~s/($exreg)|\&/defined($1)?$1:'&'/eg; + } else { - $new=~s/\&/&/g;}} + $new=~s/\&/&/g; + } + } - # Weitere Zeichen + # further characters + # if ($except) { - $new =~ s/($exreg)|/(length($1))?$1:'>'/eg; - $new =~ s/($exreg)|\|/(length($1))?$1:'|'/eg; # nich wahr - $new =~ s/($exreg)|"/(length($1))?$1:'"'/eg; # Diese Zeile wird den Bannerklickern - # zu schaffen machen, sowas aber auch... - - # Der grosse Hash - if ($utf8 or $ref -> {-utf8}) { - my $x; - $new =~ s/($exreg)|([\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277])/ - length($1)?$1:(exists($unimap{$x = unpack('U',$2)})?$unimap{$x}:"&#$x;")/eg;} - - $new =~ s/($exreg)|([\177-\377])/(length($1))?$1:$sonder{$2}/eg;} + $new =~ s/($exreg)|/defined($1)?$1:'>'/eg; + $new =~ s/($exreg)|"/defined($1)?$1:'"'/eg; + # the big hash + # + if ($unicode) { + my $x; + if ($v56) { + $new =~ s/($exreg)|([\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277])/ + defined($1) + ? $1 + : ( exists($unimap{$x = unpack('U',$2)}) + ? $unimap{$x} + : "&#$x;" + ) + /eg; + } + else { + $new =~ s/($exreg)|([\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277])/ + defined($1) + ? $1 + : ( exists($unimap{$x = myunpack($2)}) + ? $unimap{$x} + : "&#$x;" + ) + /eg; + } + } + $new =~ s/($exreg)|([\177-\377])/defined($1)?$1:$sonder{$2}/eg; + } else { + # no exceptions + # $new =~ s//>/g; - $new =~ s/\|/|/g; $new =~ s/"/"/g; - # Der grosse Hash - if ($utf8 or $ref -> {-utf8}) { + # the big hash + # + if ($unicode) { my $x; - $new =~ s/([\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277])/ - exists($unimap{$x = unpack('U',$1)})?$unimap{$x}:"&#$x;"/eg;} - - $new =~ s/([\177-\377])/$sonder{$1}/g;} - - # Zeichen <= 31 - $new=~s/([\001-\010\013\014\016-\037])/'&#'.ord($1).';'/eg; + if ($v56) { + $new =~ s/([\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277])/ + exists($unimap{$x = unpack('U',$1)}) + ? $unimap{$x} + : "&#$x;" + /eg; + } + else { + $new =~ s/([\300-\337][\200-\277]|[\340-\357][\200-\277][\200-\277])/ + exists($unimap{$x = myunpack($1)}) + ? $unimap{$x} + : "&#$x;" + /eg; + } + } + $new =~ s/([\177-\377])/$sonder{$1}/g; + } + + # characters < 32, but whitespaces + # + $new=~s/([^\041-\377\000\s])/ + '&#' . ord($1) . ';' + /eg; $new=~s/\000/ /g; - # Rueckgabe - ref($old)?\$new:$new; + # return + # + ref $old + ? \$new + : $new; } -################################ -# sub multiline +### sub multiline ############################################################## +# +# solve the space problem +# +# Params: $old - String (or scalar reference): text to encode # -# Whitespacecodierung -# fuer Leerzeilen -################################ +# Return: scalar reference: encoded string +# +sub multiline ($) { + my $old = shift; + my $string=(ref ($old)) + ? $$old + : $old; -sub multiline { - my $old=shift; - my $string=(ref ($old))?$$old:$old; + $string='' unless (defined $string); - # Zeilenumbrueche normalisieren + # normalize newlines + # $string=~s/\015\012|\015|\012/\n/g; - # Zeilenumbrueche in
umwandeln - $string=~s/\n/
/g; + # turn \n into
+ # + $string=~s!\n!
!g; - # mehr als ein aufeinanderfolgendes - # Leerzeichen in feste Leerzeichen umwandeln + # more than 1 space =>   + # $string=~s/(\s\s+)/(' ' x (length($1)-1)) . ' '/eg; - # Leerzeichen nach einem
in feste - # Spaces umwandeln - $string=~s/(?:^|(
))\s/$1 /g; + # Single Spaces after
=>   + # (save ascii arts ;) + # + $string=~s!(?:^|())\s!($1?$1:'').' '!eg; - # Rueckgabe + # return + # \$string; } +### sub toUTF8 ################################################################# +# +# map ISO-8859-1 to UTF8 +# +# Params: String or scalar reference: string to map +# +# Return: String or scalar reference: mapped string +# sub toUTF8 ($) { my $ref = shift; - my $string = ref($ref)?$$ref:$ref; - no warnings 'utf8'; - - $string =~ tr/\x80-\xff//CU; - - ref($ref)?\$string:$string; + my $string = ref($ref) + ? $$ref + : $ref; + + $string =~ s + {([\x80-\xff])} + { chr((ord ($1) >> 6) | 192) + .chr((ord ($1) & 191)) + }eg; + + ref($ref) + ? \$string + : $string; } -# ==================================================== -# Modulinitialisierung -# ==================================================== - +################################################################################ +# +# package init +# BEGIN { - # Latin 1 + geraten - %sonder=("\177" => '', # Delete-Zeichen - "\200" => '€', # Euro-Zeichen - "\201" => 'ü', # ue - DOS-Zeichensatz - "\202" => '‚', # einfaches Anfuehrungszeichen unten - "\203" => 'ƒ', # forte - "\204" => '„', # doppelte Anfuehrungszeichen unten - "\205" => '…', # drei punkte - "\206" => '†', # dagger - "\207" => '‡', # Dagger - "\210" => 'ˆ', # circ - "\211" => '‰', # Promille - "\212" => 'Š', # so ein S mit Haken drueber :-) - "\213" => '‹', # lsaquo - "\214" => 'Œ', # OE (so verhakelt - daenisch?) wer weiss das schon - "\215" => 'ì', # Codepage 850; - "\216" => 'Ž', # Z mit Haken drueber (Latin Extended B) - "\217" => 'Å', # Codepage 850 (Win) - "\220" => 'ü', # ue - Mac-Zeichensatz - "\221" => "'", # einfache Anfuehrungszeichen oben - "\222" => "'", # dito - "\223" => '“', # doppelte Anfuehrungszeichen oben - "\224" => '“', # dito - "\225" => '•', # Bullet - "\226" => '-', # Bindestrich - "\227" => '-', # dito - "\230" => '˜', # tilde...? - "\231" => '™', # Trade-Mark - "\232" => 'š', # kleines s mit Haken drueber - "\233" => '›', # rsaquo; - "\234" => 'œ', # oe verhakelt - "\235" => 'Ø', # Codepage 850 (Win) - "\236" => '×', # Codepage 850 (Win) - "\237" => 'Ÿ', # Y mit Punkten drueber - "\240" => ' ', # nbsp; - "\241" => '¡', # umgedrehtes ! - "\242" => '¢', # cent-Zeichen - "\243" => '£', # (engl.)Pfund-Zeichen - "\244" => '¤', # Waehrungszeichen - "\245" => '¥', # Yen halt :-) - "\246" => '¦', # eigentlich soll es wohl ein | sein .-) - "\247" => '§', # Paragraph - "\250" => '¨', # zwei Punkte oben - "\251" => '©', # (C) - "\252" => 'ª', # hochgestelltes unterstrichenes a - "\253" => '«', # left-pointing double angle quotation mark (besser koennte ichs auch nicht beschreiben...) - "\254" => '¬', # Negationszeichen - "\255" => '-', # Bindestrich - "\256" => '®', # (R) - "\257" => 'ß', # sz, was auch immer fuern Zeichensatz (DOS?) - "\260" => '°', # Grad-Zeichen - "\261" => '±', # Plusminus - "\262" => '²', # hoch 2 - "\263" => '³', # hoch 3 - "\264" => '‚', # einf. anfuehrungszeichen unten - "\265" => 'µ', # my-Zeichen (griech) - "\266" => '¶', # Absatzzeichen - "\267" => '·', # Mal-Zeichen - "\270" => '¸', - "\271" => '¹', # hoch 1 - "\272" => 'º', # masculine ordinal indicator (spanish) - "\273" => '»', # right-pointing double angle quotation mark - "\274" => '¼', # 1/4 - "\275" => '½', # 1/2 - "\276" => '¾', # 3/4 - "\277" => '¿', # umgedrehtes ? - "\300" => 'À', - "\301" => 'Á', - "\302" => 'Â', - "\303" => 'Ã', - "\304" => 'Ä', - "\305" => 'Å', - "\306" => 'Æ', - "\307" => 'Ç', - "\310" => 'È', - "\311" => 'É', - "\312" => 'Ê', - "\313" => 'Ë', - "\314" => 'Ì', - "\315" => 'Í', - "\316" => 'Î', - "\317" => 'Ï', - "\320" => 'Ð', # keine Ahnung, was das wohl sein soll, auf jeden Fall was islaendisches... - "\321" => 'Ñ', - "\322" => 'Ò', - "\323" => 'Ó', - "\324" => 'Ô', - "\325" => 'Õ', - "\326" => 'Ö', - "\327" => '×', # eigentlich × funzt afaik aber nicht aufm Mac (ob das hier funktioniert, weiss ich nicht) - "\330" => 'Ø', - "\331" => 'Ù', - "\332" => 'Ú', - "\333" => 'Û', - "\334" => 'Ü', - "\335" => 'Ý', - "\336" => 'Þ', - "\337" => 'ß', - "\340" => 'à', - "\341" => 'á', - "\342" => 'â', - "\343" => 'ã', - "\344" => 'ä', - "\345" => 'å', - "\346" => 'æ', - "\347" => 'ç', - "\350" => 'è', - "\351" => 'é', - "\352" => 'ê', - "\353" => 'ë', - "\354" => 'ì', - "\355" => 'í', - "\356" => 'î', - "\357" => 'ï', - "\360" => 'ð', - "\361" => 'ñ', - "\362" => 'ò', - "\363" => 'ó', - "\364" => 'ô', - "\365" => 'õ', - "\366" => 'ö', - "\367" => '÷', - "\370" => 'ø', - "\371" => 'ù', - "\372" => 'ú', - "\373" => 'û', - "\374" => 'ü', - "\375" => 'ý', - "\376" => 'þ', - "\377" => 'ÿ'); + $utf8 = 0; + + # Latin 1 + guessed + # + %sonder=( + "\177" => '', + "\200" => '€', + "\201" => 'ü', + "\202" => '‚', + "\203" => 'ƒ', + "\204" => '„', + "\205" => '…', + "\206" => '†', + "\207" => '‡', + "\210" => 'ˆ', + "\211" => '‰', + "\212" => 'Š', + "\213" => '‹', + "\214" => 'Œ', + "\215" => 'ì', + "\216" => 'Ž', + "\217" => 'Å', + "\220" => 'ü', + "\221" => "'", + "\222" => "'", + "\223" => '“', + "\224" => '“', + "\225" => '•', + "\226" => '-', + "\227" => '-', + "\230" => '˜', + "\231" => '™', + "\232" => 'š', + "\233" => '›', + "\234" => 'œ', + "\235" => 'Ø', + "\236" => '×', + "\237" => 'Ÿ', + "\240" => ' ', + "\241" => '¡', + "\242" => '¢', + "\243" => '£', + "\244" => '¤', + "\245" => '¥', + "\246" => '¦', + "\247" => '§', + "\250" => '¨', + "\251" => '©', + "\252" => 'ª', + "\253" => '«', + "\254" => '¬', + "\255" => '-', + "\256" => '®', + "\257" => 'ß', + "\260" => '°', + "\261" => '±', + "\262" => '²', + "\263" => '³', + "\264" => '´', + "\265" => 'µ', + "\266" => '¶', + "\267" => '·', + "\270" => '¸', + "\271" => '¹', + "\272" => 'º', + "\273" => '»', + "\274" => '¼', + "\275" => '½', + "\276" => '¾', + "\277" => '¿', + "\300" => 'À', + "\301" => 'Á', + "\302" => 'Â', + "\303" => 'Ã', + "\304" => 'Ä', + "\305" => 'Å', + "\306" => 'Æ', + "\307" => 'Ç', + "\310" => 'È', + "\311" => 'É', + "\312" => 'Ê', + "\313" => 'Ë', + "\314" => 'Ì', + "\315" => 'Í', + "\316" => 'Î', + "\317" => 'Ï', + "\320" => 'Ð', + "\321" => 'Ñ', + "\322" => 'Ò', + "\323" => 'Ó', + "\324" => 'Ô', + "\325" => 'Õ', + "\326" => 'Ö', + "\327" => '×', + "\330" => 'Ø', + "\331" => 'Ù', + "\332" => 'Ú', + "\333" => 'Û', + "\334" => 'Ü', + "\335" => 'Ý', + "\336" => 'Þ', + "\337" => 'ß', + "\340" => 'à', + "\341" => 'á', + "\342" => 'â', + "\343" => 'ã', + "\344" => 'ä', + "\345" => 'å', + "\346" => 'æ', + "\347" => 'ç', + "\350" => 'è', + "\351" => 'é', + "\352" => 'ê', + "\353" => 'ë', + "\354" => 'ì', + "\355" => 'í', + "\356" => 'î', + "\357" => 'ï', + "\360" => 'ð', + "\361" => 'ñ', + "\362" => 'ò', + "\363" => 'ó', + "\364" => 'ô', + "\365" => 'õ', + "\366" => 'ö', + "\367" => '÷', + "\370" => 'ø', + "\371" => 'ù', + "\372" => 'ú', + "\373" => 'û', + "\374" => 'ü', + "\375" => 'ý', + "\376" => 'þ', + "\377" => 'ÿ' + ); # Unicode-Mapping - %unimap=(128 => '€', - 129 => 'ü', - 130 => '‚', - 131 => 'ƒ', - 132 => '„', - 133 => '…', - 134 => '†', - 135 => '‡', - 136 => 'ˆ', - 137 => '‰', - 138 => 'Š', - 139 => '‹', - 140 => 'Œ', - 141 => 'ì', - 142 => 'Ž', - 143 => 'Å', - 144 => 'ü', - 145 => "'", - 146 => "'", - 147 => '“', - 148 => '“', - 149 => '•', - 150 => '-', - 151 => '-', - 152 => '˜', - 153 => '™', - 154 => 'š', - 155 => '›', - 156 => 'œ', - 157 => 'Ø', - 158 => '×', - 159 => 'Ÿ', - 160 => ' ', - 163 => '£', - 165 => '¥', - 167 => '§', - 169 => '©', - 171 => '«', - 173 => '-', - 174 => '®', - 175 => 'ß', - 180 => '‚', - 184 => '¸', - 185 => '¹', - 187 => '»', - 192 => 'À', - 193 => 'Á', - 194 => 'Â', - 195 => 'Ã', - 196 => 'Ä', - 197 => 'Å', - 198 => 'Æ', - 199 => 'Ç', - 200 => 'È', - 201 => 'É', - 202 => 'Ê', - 203 => 'Ë', - 204 => 'Ì', - 205 => 'Í', - 206 => 'Î', - 207 => 'Ï', - 208 => 'Ð', - 209 => 'Ñ', - 210 => 'Ò', - 211 => 'Ó', - 212 => 'Ô', - 213 => 'Õ', - 214 => 'Ö', - 216 => 'Ø', - 217 => 'Ù', - 218 => 'Ú', - 219 => 'Û', - 220 => 'Ü', - 221 => 'Ý', - 222 => 'Þ', - 223 => 'ß', - 224 => 'à', - 225 => 'á', - 226 => 'â', - 227 => 'ã', - 228 => 'ä', - 229 => 'å', - 230 => 'æ', - 231 => 'ç', - 232 => 'è', - 233 => 'é', - 234 => 'ê', - 235 => 'ë', - 236 => 'ì', - 237 => 'í', - 238 => 'î', - 239 => 'ï', - 240 => 'ð', - 241 => 'ñ', - 242 => 'ò', - 243 => 'ó', - 244 => 'ô', - 245 => 'õ', - 246 => 'ö', - 247 => '÷', - 248 => 'ø', - 249 => 'ù', - 250 => 'ú', - 251 => 'û', - 252 => 'ü', - 253 => 'ý', - 254 => 'þ', - 255 => 'ÿ'); + %unimap=( + 128 => '€', + 129 => 'ü', + 130 => '‚', + 131 => 'ƒ', + 132 => '„', + 133 => '…', + 134 => '†', + 135 => '‡', + 136 => 'ˆ', + 137 => '‰', + 138 => 'Š', + 139 => '‹', + 140 => 'Œ', + 141 => 'ì', + 142 => 'Ž', + 143 => 'Å', + 144 => 'ü', + 145 => "'", + 146 => "'", + 147 => '“', + 148 => '“', + 149 => '•', + 150 => '-', + 151 => '-', + 152 => '˜', + 153 => '™', + 154 => 'š', + 155 => '›', + 156 => 'œ', + 157 => 'Ø', + 158 => '×', + 159 => 'Ÿ', + 160 => ' ', + 163 => '£', + 165 => '¥', + 167 => '§', + 169 => '©', + 171 => '«', + 173 => '-', + 174 => '®', + 175 => 'ß', + 180 => '´', + 184 => '¸', + 185 => '¹', + 187 => '»', + 192 => 'À', + 193 => 'Á', + 194 => 'Â', + 195 => 'Ã', + 196 => 'Ä', + 197 => 'Å', + 198 => 'Æ', + 199 => 'Ç', + 200 => 'È', + 201 => 'É', + 202 => 'Ê', + 203 => 'Ë', + 204 => 'Ì', + 205 => 'Í', + 206 => 'Î', + 207 => 'Ï', + 208 => 'Ð', + 209 => 'Ñ', + 210 => 'Ò', + 211 => 'Ó', + 212 => 'Ô', + 213 => 'Õ', + 214 => 'Ö', + 216 => 'Ø', + 217 => 'Ù', + 218 => 'Ú', + 219 => 'Û', + 220 => 'Ü', + 221 => 'Ý', + 222 => 'Þ', + 223 => 'ß', + 224 => 'à', + 225 => 'á', + 226 => 'â', + 227 => 'ã', + 228 => 'ä', + 229 => 'å', + 230 => 'æ', + 231 => 'ç', + 232 => 'è', + 233 => 'é', + 234 => 'ê', + 235 => 'ë', + 236 => 'ì', + 237 => 'í', + 238 => 'î', + 239 => 'ï', + 240 => 'ð', + 241 => 'ñ', + 242 => 'ò', + 243 => 'ó', + 244 => 'ô', + 245 => 'õ', + 246 => 'ö', + 247 => '÷', + 248 => 'ø', + 249 => 'ù', + 250 => 'ú', + 251 => 'û', + 252 => 'ü', + 253 => 'ý', + 254 => 'þ', + 255 => 'ÿ' + ); } -# making require happy +# keep 'require' happy 1; -# ==================================================== -# end of Encode::Plain -# ==================================================== \ No newline at end of file +# +# +### end of Encode::Plain ####################################################### \ No newline at end of file