]> git.p6c8.net - selfforum.git/blob - selfforum-cgi/shared/CheckRFC.pm
further and further it goes... (not yet ready)
[selfforum.git] / selfforum-cgi / shared / CheckRFC.pm
1 package CheckRFC;
2
3 ################################################################################
4 # #
5 # File: shared/CheckRFC.pm #
6 # #
7 # Authors: Andre Malo <nd@o3media.de>, 2001-03-30 #
8 # #
9 # Description: implement several string checks on RFC correctness #
10 # #
11 ################################################################################
12
13 use strict;
14
15 use vars qw(%url $email @EXPORT);
16
17 use autouse 'Carp' => qw(croak);
18
19 ################################################################################
20 #
21 # Export
22 #
23 use base qw(Exporter);
24 @EXPORT = qw(is_URL is_email);
25
26 ### is_URL ($@) ################################################################
27 #
28 # check URL
29 #
30 # Params: $string string to check
31 # @schemes possible URL schemes in $string
32 # qw(http ftp news nntp telnet gopher wais mailto file prospero)
33 # if there's no scheme given, 'http' is default
34 # use ':ALL' (without quotes) for all schemes
35 #
36 # Return: Status code (Bool)
37 #
38 sub is_URL ($@) {
39 my ($string, @schemes) = @_;
40 @schemes = qw(http) unless (@schemes);
41 @schemes = keys %url if (@schemes == 1 and $schemes[0] eq ':ALL');
42
43 for (@schemes) {
44 croak "unknown url scheme '$_'" unless exists $url{$_};
45 return 1 if $string =~ /$url{$_}/;
46 }
47
48 # no match => return false
49 return;
50 }
51
52 ### is_email ($) ###############################################################
53 #
54 # check email (comments can be nested)
55 #
56 # Params: $string string to check
57 #
58 # Return: Status code (Bool)
59 #
60 sub is_email ($) {
61 my $string = shift;
62
63 # false if any non-ascii chars
64 return if $string =~ /[\200-\377]/;
65
66 # remove nested comments
67 while ($string =~ s/\([^()]*\)//g) {};
68
69 return ($string =~ /^$email$/);
70 }
71
72 ### BEGIN # (1) ################################################################
73 #
74 # define regex for nearly RFC 822 email address
75 #
76 BEGIN {
77 # Thanx to J. Friedl:
78
79 my $esc = '\\\\';
80 my $Period = '\.';
81 my $space = '\040';
82 my $tab = '\t';
83 my $OpenBR = '\[';
84 my $CloseBR = '\]';
85 my $OpenParen = '\(';
86 my $CloseParen = '\)';
87 my $NonASCII = '\x80-\xff';
88 my $ctrl = '\000-\037';
89 my $CRlist = '\n\015';
90 my $qtext = qq/[^$esc$NonASCII$CRlist\"]/;
91 my $dtext = qq/[^$esc$NonASCII$CRlist$OpenBR$CloseBR]/;
92 my $quoted_pair = qq< $esc [^$NonASCII] >;
93 my $ctext = qq< [^$esc$NonASCII$CRlist()] >;
94 my $Cnested = qq< $OpenParen $ctext* (?: $quoted_pair $ctext* )* $CloseParen >;
95 my $comment = qq< $OpenParen $ctext* (?: (?: $quoted_pair | $Cnested ) $ctext* )* $CloseParen >;
96 my $X = qq< [$space$tab]* (?: $comment [$space$tab]* )* >;
97 my $atom_char = qq/[^($space)<>\@,;:\".$esc$OpenBR$CloseBR$ctrl$NonASCII]/;
98 my $atom = qq< $atom_char+ (?!$atom_char) >;
99 my $quoted_str = qq< \" $qtext * (?: $quoted_pair $qtext * )* \" >;
100 my $word = qq< (?: $atom | $quoted_str ) >;
101 my $domain_ref = $atom;
102 my $domain_lit = qq< $OpenBR (?: $dtext | $quoted_pair )* $CloseBR >;
103 my $sub_domain = qq< (?: $domain_ref | $domain_lit ) $X >;
104 my $domain = qq< $sub_domain (?: $Period $X $sub_domain )* >;
105 my $route = qq< \@ $X $domain (?: , $X \@ $X $domain )* : $X >;
106 my $local_part = qq< $word $X (?: $Period $X $word $X )* >;
107 my $addr_spec = qq< $local_part \@ $X $domain >;
108 my $route_addr = qq[ < $X (?: $route )? $addr_spec > ];
109 my $phrase_ctrl = '\000-\010\012-\037';
110 my $phrase_char = qq/[^()<>\@,;:\".$esc$OpenBR$CloseBR$NonASCII$phrase_ctrl]/;
111 my $phrase = qq< $word $phrase_char * (?: (?: $comment | $quoted_str ) $phrase_char * )* >;
112 $email = qq< $X (?: $addr_spec | $phrase $route_addr ) >;
113 $email = qr /$email/x;
114 }
115
116 ### BEGIN # (2) ################################################################
117 #
118 # define regexes for URLs
119 #
120 BEGIN {
121 # credits to an unknown(?) programmer ;)
122 # modified by n.d.p.
123
124 my $lowalpha = '(?:[a-z])';
125 my $hialpha = '(?:[A-Z])';
126 my $alpha = "(?:$lowalpha|$hialpha)";
127 my $digit = '(?:\d)';
128 my $safe = '(?:[$_.+-])';
129 my $extra = '(?:[!*\'(),])';
130 my $national = '(?:[{}|\\\\^~\[\]`])';
131 my $punctuation = '(?:[<>#%"])';
132 my $reserved = '(?:[;/?:@&=])';
133 my $hex = '(?:[\dA-Fa-f])';
134 my $escape = "(?:%$hex$hex)";
135 my $unreserved = "(?:$alpha|$digit|$safe|$extra)";
136 my $uchar = "(?:$unreserved|$escape)";
137 my $xchar = "(?:$unreserved|$escape|$reserved)";
138 my $digits = '(?:\d+)';
139 my $alphadigit = "(?:$alpha|\\d)";
140
141 # URL schemeparts for ip based protocols:
142 my $urlpath = "(?:$xchar*)";
143 my $user = "(?:(?:$uchar|[;?&=])*)";
144 my $password = "(?:(?:$uchar|[;?&=])*)";
145 my $port = '(?:[0-5]?\d\d?\d?\d?|6[0-4]\d\d\d|65[0-4]\d\d|655[0-2]\d|6553[0-5])';
146 my $ip4part = '(?:[01]?\d\d?|2[0-4]\d|25[0-5])';
147 my $hostnumber = '(?:(?!0+\.0+\.0+\.0+)'."$ip4part\\.$ip4part\\.$ip4part\\.$ip4part)";
148 my $toplabel = "(?:(?:$alpha(?:$alphadigit|-)*$alphadigit)|$alpha)";
149 my $domainlabel = "(?:(?:$alphadigit(?:$alphadigit|-)*$alphadigit)|$alphadigit)";
150 my $hostname = "(?:(?:$domainlabel\\.)*$toplabel)";
151 my $host = "(?:(?:$hostname)|(?:$hostnumber))";
152 my $hostport = "(?:(?:$host)(?::$port)?)";
153 my $login = "(?:(?:$user(?::$password)?\@)?$hostport)";
154 my $ip_schemepart = "(?://$login(?:/$urlpath)?)";
155
156 my $schemepart = "(?:$xchar*|$ip_schemepart)";
157 my $scheme = "(?:(?:$lowalpha|$digit|[+.-])+)";
158
159 # The predefined schemes:
160
161 # FTP (see also RFC959)
162 my $fsegment = "(?:(?:$uchar|[?:\@&=])*)";
163 my $ftptype = "(?:[AIDaid])";
164 my $fpath = "(?:$fsegment(?:/$fsegment)*)";
165 my $ftpurl = "(?:ftp://$login(?:/$fpath(?:;type=$ftptype)?)?)";
166
167 # FILE
168 my $fileurl = "(?:file://(?:(?:$host)|localhost)?/$fpath)";
169
170 # HTTP
171 my $httpuchar = "(?:(?:$alpha|$digit|$safe|(?:[!*\',]))|$escape)";
172 my $hsegment = "(?:(?:$httpuchar|[;:\@&=~])*)";
173 my $search = "(?:(?:$httpuchar|[;:\@&=~])*)";
174 my $hpath = "(?:$hsegment(?:/$hsegment)*)";
175 my $httpurl = "(?:http://$hostport(?:/$hpath(?:\\?$search)?)?(?:#$xchar*)?)";
176
177 # GOPHER (see also RFC1436)
178 my $gopher_plus = "(?:$xchar*)";
179 my $selector = "(?:$xchar*)";
180 my $gtype = "(?:$xchar)";
181 my $gopherurl = "(?:gopher://$hostport(?:/$gtype(?:$selector(?:%09$search(?:%09$gopher_plus)?)?)?)?)";
182
183 # MAILTO (see also RFC822)
184 my $encoded822addr = "(?:$email)";
185 my $mailtourl = "(?:mailto:$encoded822addr)";
186
187 # NEWS (see also RFC1036)
188 my $article = "(?:(?:$uchar|[;/?:&=])+\@$host)";
189 my $group = "(?:$alpha(?:$alpha|$digit|[.+_-])*)";
190 my $grouppart = "(?:$article|$group|\\*)";
191 my $newsurl = "(?:news:$grouppart)";
192
193 # NNTP (see also RFC977)
194 my $nntpurl = "(?:nntp://$hostport/$group(?:/$digits)?)";
195
196 # TELNET
197 my $telneturl = "(?:telnet://$login(?:/)?)";
198
199 # WAIS (see also RFC1625)
200 my $wpath = "(?:$uchar*)";
201 my $wtype = "(?:$uchar*)";
202 my $database = "(?:$uchar*)";
203 my $waisdoc = "(?:wais://$hostport/$database/$wtype/$wpath)";
204 my $waisindex = "(?:wais://$hostport/$database\\?$search)";
205 my $waisdatabase = "(?:wais://$hostport/$database)";
206 my $waisurl = "(?:$waisdatabase|$waisindex|$waisdoc)";
207
208 # PROSPERO
209 my $fieldvalue = "(?:(?:$uchar|[?:\@&]))";
210 my $fieldname = "(?:(?:$uchar|[?:\@&]))";
211 my $fieldspec = "(?:;$fieldname=$fieldvalue)";
212 my $psegment = "(?:(?:$uchar|[?:\@&=]))";
213 my $ppath = "(?:$psegment(?:/$psegment)*)";
214 my $prosperourl = "(?:prospero://$hostport/$ppath(?:$fieldspec)*)";
215
216 %url = (
217 http => qr/^$httpurl$/,
218 ftp => qr/^$ftpurl$/,
219 news => qr/^$newsurl$/,
220 nntp => qr/^$nntpurl$/,
221 telnet => qr/^$telneturl$/,
222 gopher => qr/^$gopherurl$/,
223 wais => qr/^$waisurl$/,
224 mailto => qr/^$mailtourl$/,
225 file => qr/^$fileurl$/,
226 prospero => qr/^$prosperourl$/
227 );
228 }
229
230 # keeping require happy
231 1;
232
233 #
234 #
235 ### end of CheckRFC ############################################################

patrick-canterino.de