copy current state of medienstaatsvertrag.org, to be verified
This commit is contained in:
485
lib/calcms/markup.pm
Normal file
485
lib/calcms/markup.pm
Normal file
@@ -0,0 +1,485 @@
|
||||
use warnings "all";
|
||||
use strict;
|
||||
use Data::Dumper;
|
||||
use Text::WikiCreole;
|
||||
|
||||
use HTML::Parse;
|
||||
use HTML::FormatText;
|
||||
|
||||
use log;
|
||||
|
||||
package markup;
|
||||
|
||||
use Encode;
|
||||
|
||||
require Exporter;
|
||||
our @ISA = qw(Exporter);
|
||||
#our @EXPORT = qw(all);
|
||||
our @EXPORT_OK = qw(fix_line_ends html_to_creole creole_to_html creole_to_plain plain_to_ical ical_to_plain ical_to_xml html_to_plain fix_utf8 uri_encode compress base26);
|
||||
our %EXPORT_TAGS = ( 'all' => [ @EXPORT_OK ] );
|
||||
|
||||
sub fix_line_ends{
|
||||
my $s=shift;
|
||||
$s=~s/\r?\n|\r/\n/g;
|
||||
return $s;
|
||||
}
|
||||
|
||||
# convert 1..26 to a..z, 27 to aa, inspired by ConvertAA
|
||||
sub base26{
|
||||
my $num = shift ;
|
||||
return '' if $num <= 0;
|
||||
|
||||
my $s = "";
|
||||
while($num){
|
||||
$s= chr (--$num % 26 + ord "a").$s;
|
||||
$num = int $num/26;
|
||||
}
|
||||
|
||||
return $s;
|
||||
}
|
||||
|
||||
sub html_to_creole{
|
||||
my $s=shift;
|
||||
|
||||
#remove elements
|
||||
# $s=~s/[\r\f\n]+/\n/gi;
|
||||
# $s=~s/<\/p.*?>//gi;
|
||||
# $s=~s/<\/br.*?>//gi;
|
||||
$s=~s/\<\!\-\-[\s\S]*?\-\-\>//gi;
|
||||
$s=~s/<script.*?>.*?<\/script.*?>//gi;
|
||||
# $s=~s/<\/?span.*?>//gi;
|
||||
# $s=~s/<\/?font.*?>//gi;
|
||||
# $s=~s/<\/?meta.*?>//gi;
|
||||
# $s=~s/<\/?title.*?>//gi;
|
||||
# $s=~s/<\/?style.*?>//gi;
|
||||
# $s=~s/<\/?col.*?>//gi;
|
||||
# $s=~s/<\/?thead.*?>//gi;
|
||||
# $s=~s/<\/?tbody.*?>//gi;
|
||||
$s=~s/<\/?form.*?>//gi;
|
||||
$s=~s/<\/?select.*?>//gi;
|
||||
$s=~s/<\/?option.*?//gi;
|
||||
$s=~s/<\/?input.*?>//gi;
|
||||
# $s=~s/<\/?button.*?>//gi;
|
||||
# $s=~s/<\/?textarea.*?>//gi;
|
||||
$s=~s/<\/?script.*?>//gi;
|
||||
|
||||
#table elements
|
||||
# $s=~s/\s*<\/?td.*?>//gi;
|
||||
# $s=~s/\s*<\/?th.*?>//gi;
|
||||
|
||||
#remove line breaks
|
||||
$s=~s/[\r\n]+/ /gi;
|
||||
|
||||
#formats
|
||||
$s=~s/<img.*?src="(.*?)".*?>/{{$1\|}}/gi;
|
||||
$s=~s/<img.*?title="(.*?)".*?>/{{$2\|$1}}/gi;
|
||||
$s=~s/<img.*?src="(.*?)"[^>]*?title="(.*?)".*?>/{{$1\|$2}}/gi;
|
||||
$s=~s/<img.*?title="(.*?)"[^>]*?src="(.*?)".*?>/{{$2\|$1}}/gi;
|
||||
$s=~s/<\/?img.*?>//gi;
|
||||
#replace line breaks from images
|
||||
$s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
||||
$s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
||||
$s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
||||
|
||||
$s=~s/<i.*?>(.*?)<\/i>/\/\/$1\/\//gi;
|
||||
$s=~s/<\/?i.*?>//gi;
|
||||
$s=~s/<b.*?>(.*?)<\/b>/\*\*$1\*\*/gi;
|
||||
# $s=~s/<\/?b.*?>//gi;
|
||||
|
||||
$s=~s/<strong.*?>(.*?)<\/strong>/\*\*$1\*\*/gi;
|
||||
$s=~s/<em.*?>(.*?)<\/em>/\/\/$1\/\//gi;
|
||||
$s=~s/<blockquote.*?>((\W+|\w+)*?)<\/blockquote>/{{{$1}}}/gi;
|
||||
# $s=~s/<a\s+.*?href="(.*?)".*?>((\W+|\w+)*?)<\/a>/\[\[$1\|$2\]\]$3/gi;
|
||||
$s=~s/<a\s+.*?href="(.*?)".*?>(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi;
|
||||
$s=~s/<a.*?>//gi;
|
||||
#replace line breaks from links
|
||||
$s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
||||
$s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
||||
$s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
||||
|
||||
# print STDERR Dumper($s) if ($s=~/</);
|
||||
|
||||
$s=~s/[\s]+/ /gi;
|
||||
# $s=~s/\n[ \t\r\n]+\n/\n\n/gi;
|
||||
# $s=~s/\n[ ]+/\n /gi;
|
||||
# $s=~s/\n+/\n/gi;
|
||||
# $s=~s/\n+/\\\n/gi;
|
||||
|
||||
#line elements, increase head line level to avoid breaking single = chars
|
||||
$s=~s/\s*<h1.*?>/== /gi;
|
||||
$s=~s/\s*<h2.*?>/=== /gi;
|
||||
$s=~s/\s*<h3.*?>/==== /gi;
|
||||
$s=~s/\s*<h\d.*?>/===== /gi;
|
||||
# $s=~s/\s*<\/h\d.*?>/\n/gi;
|
||||
|
||||
# $s=~s/<br.*?>/\\\\<br>/gi;
|
||||
# $s=~s/\s*<div.*?>//gi;
|
||||
# $s=~s/\s*<\/div>/\n/gi;
|
||||
|
||||
# $s=~s/<table.*?>/\n/gi;
|
||||
# $s=~s/<\/table>/\n/gi;
|
||||
# $s=~s/\s*<tr.*?>//gi;
|
||||
# $s=~s/\s*<\/tr>//gi;
|
||||
|
||||
# $s=~s/\s*<ol.*?>/\n/gi;
|
||||
# $s=~s/\s*<\/ol>/\n/gi;
|
||||
# $s=~s/\s*<ul.*?>/\n/gi;
|
||||
# $s=~s/\s*<\/ul>/\n/gi;
|
||||
# $s=~s/\s*<li.*?>/\n\* /gi;
|
||||
# $s=~s/\s*<\/li>//gi;
|
||||
|
||||
# $s=~s/\s*<p.*?>\s*/\n\n/gi;
|
||||
# $s=~s/\s*<br.*?>\s*/\n /gi;
|
||||
|
||||
my $tree=HTML::Parse::parse_html('<body>'.$s.'</body>');
|
||||
my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 2000);
|
||||
$s= $formatter->format($tree);
|
||||
#use Data::Dumper; print "asd:<textarea cols=100 rows=5>".Dumper($s);print "</textarea>";
|
||||
$s=~s/\</\</g;
|
||||
|
||||
#fix line endings
|
||||
$s=~s/\n[ \t]+/\n/gi;
|
||||
#$s=~s/\n[\t\r ]+\n/\n\n/g;
|
||||
$s=~s/\n{3,99}/\n\n/g;
|
||||
$s=~s/\n*\*[\s]+/\n\* /g;
|
||||
#$s=~s/(\n\*.*?\n)([^\*])/$1\n\n$2/g;
|
||||
|
||||
#enter line break before headlines
|
||||
$s=~s/(={2,99})/\n$1/g;
|
||||
#reduce head line level
|
||||
$s=~s/=(=+)/$1/g;
|
||||
|
||||
$s=~s/^\s+//gi;
|
||||
$s=~s/\s+$//gi;
|
||||
$s=~s/\n{3,99}/\n\n/g;
|
||||
# $s=~s/\n\n+/ \\\\\n/g;
|
||||
$s=~s/\n/\\\\\n/g;
|
||||
$s=~s/\\\\\n\=/\n\=/g;
|
||||
|
||||
#$s=~s/\n\n/ \\\\\n/g;
|
||||
# $s=~s/(\\\\\n){3,99}/\\\\\n\\\\\n/g;
|
||||
#$s=~s/\\\\[ \t]+/\\\\\n/g;
|
||||
|
||||
# $s=~s/<\/a>//gi;
|
||||
|
||||
return $s;
|
||||
}
|
||||
|
||||
sub creole_to_html{
|
||||
my $s=$_[0]||'';
|
||||
|
||||
#$s=~s/\n\#\n/\n/g;
|
||||
#fix_line_ends($s);
|
||||
$s=~s/<a\s+.*?href="(.*?)".*?>(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi;
|
||||
$s=~s/<a.*?>//gi;
|
||||
|
||||
$s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
||||
$s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
||||
$s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g;
|
||||
$s=~s/^\s+//g;
|
||||
$s=~s/\s+$//g;
|
||||
|
||||
$s=Text::WikiCreole::creole_parse($s)||'';
|
||||
# $s=~s/<p>/\n/gi;
|
||||
# $s=~s/\{\{\{((\W+|\w+)+?)\}\}\}/<blockquote>$1<\/blockquote>/g;
|
||||
# $s=~s/\{\{(.+?)\|(.*?)\}\}/<img src="$1" title="$2" \/>/g;
|
||||
# $s=~s/\[\[(.+?)\|(.*?)\]\]/<a href="$1">$2<\/a>/g;
|
||||
# $s=~s/([^\:])\/\/(.*?[^\:])\/\//$1<em>$2<\/em> /g;
|
||||
# $s=~s/\n=== (.*?)\n/<h3>$1<\/h3>\n/g;
|
||||
# $s=~s/\n== (.*?)\n/<h2>$1<\/h2>\n/g;
|
||||
#replace line breaks from images
|
||||
$s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
||||
$s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
||||
$s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g;
|
||||
#replace line breaks from links
|
||||
# $s=~s/\n= (.*?)\n/<h1>$1<\/h1>\n/g;
|
||||
# $s=~s/\*\*(.*?)\*\*/<strong>$1<\/strong> /g;
|
||||
# $s=~s/^== (.*?)\n/<h2>$1<\/h2>\n/g;
|
||||
# $s=~s/\n\* (.*?)([\r\n]+)/<li>$1<\/li>\n/g;
|
||||
# $s=~s/\n\- (.*?)\n/<lo>$1<\/lo>\n/g;
|
||||
# $s=~s/\n\n/<p>/gi;
|
||||
# $s=~s/\n+/<br \/>/gi;
|
||||
# $s=~s/\</\</g;
|
||||
|
||||
#remove whitespaces and break lines at start or end of elements
|
||||
for my $elem ('p','li'){
|
||||
$s=~s|<$elem>\s*<br/><br/>|<$elem>|g;
|
||||
$s=~s|<br/><br/>\s*</$elem>|</$elem>|g;
|
||||
}
|
||||
|
||||
return $s;
|
||||
}
|
||||
|
||||
sub creole_to_plain{
|
||||
my $s=shift;
|
||||
|
||||
$s=~s/\<p\>/\n/gi;
|
||||
$s=~s/\{\{\{((\W+|\w+)+?)\}\}\}/<blockquote>$1<\/blockquote>/g;
|
||||
$s=~s/\{\{(.+?)\|(.*?)\}\}//g;
|
||||
$s=~s/\[\[(.+?)\|(.*?)\]\]/$2/g;
|
||||
$s=~s/\/\/([^\/\/]*?)\/\//<em>$1<\/em> /g;
|
||||
$s=~s/\n=== (.*?)\n/\n<h3>$1<\/h3>\n/g;
|
||||
$s=~s/\n== (.*?)\n/\n<h2>$1<\/h2>\n/g;
|
||||
$s=~s/\*\*(.*?)\*\*/<strong>$1<\/strong> /g;
|
||||
$s=~s/^== (.*?)\n/<h2>$1<\/h2>\n/g;
|
||||
$s=~s/\n\* (.*?)\n/\n<li>$1<\/li>\n/g;
|
||||
$s=~s/\n\* (.*?)\n/\n<li>$1<\/li>\n/g;
|
||||
$s=~s/\n\- (.*?)\n/\n<lo>$1<\/lo>\n/g;
|
||||
$s=~s/\n\- (.*?)\n/\n<lo>$1<\/lo>\n/g;
|
||||
$s=~s/\n\n/\n<p>/gi;
|
||||
$s=~s/\n/\n<br\/>/gi;
|
||||
return $s;
|
||||
}
|
||||
|
||||
sub html_to_plain{
|
||||
my $s=shift;
|
||||
return '' unless (defined $s);
|
||||
my $tree=HTML::Parse::parse_html('<body>'.$s.'</body>');
|
||||
my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 2000);
|
||||
$s= $formatter->format($tree);
|
||||
return $s;
|
||||
}
|
||||
|
||||
sub ical_to_plain{
|
||||
return '' unless defined ($_[0]);
|
||||
$_[0]=~s/\\n/\n/gi;
|
||||
$_[0]=~s/ /\t/gi;
|
||||
$_[0]=~s/\\\./\./gi;
|
||||
$_[0]=~s/\\\,/\,/gi;
|
||||
$_[0]=~s/\\\\/\\/gi;
|
||||
return $_[0];
|
||||
}
|
||||
|
||||
sub plain_to_ical{
|
||||
return '' unless defined ($_[0]);
|
||||
#remove images + links
|
||||
$_[0]=~s/\[\[.+?\|(.+?)\]\]/$1/g;
|
||||
$_[0]=~s/\{\{.+?\}\}//g;
|
||||
$_[0]=~s/^\s+//g;
|
||||
$_[0]=~s/\\/\\\\/gi;
|
||||
$_[0]=~s/\,/\\\,/gi;
|
||||
# $_[0]=~s/\./\\\./gi;
|
||||
$_[0]=~s/[\r\n]/\\n/gi;
|
||||
$_[0]=~s/\t/ /gi;
|
||||
return $_[0];
|
||||
}
|
||||
|
||||
sub plain_to_xml{
|
||||
return '' unless defined ($_[0]);
|
||||
$_[0]=~s/\n\={1,6} (.*?)\s+/\n\[\[$1\]\]\n/gi;
|
||||
#remove images + links
|
||||
$_[0]=~s/\[\[.+?\|(.+?)\]\]/$1/g;
|
||||
$_[0]=~s/\{\{.+?\}\}//g;
|
||||
return encode_xml_element($_[0]);
|
||||
|
||||
# $_[0]=~s/\ä/ä/gi;
|
||||
# $_[0]=~s/\ö/ö/gi;
|
||||
# $_[0]=~s/\ü/ü/gi;
|
||||
# $_[0]=~s/\Ä/Ä/gi;
|
||||
# $_[0]=~s/\Ö/Ö/gi;
|
||||
# $_[0]=~s/\Ü/Ü/gi;
|
||||
# $_[0]=~s/\ß/ß/gi;
|
||||
# $_[0]=~s/\&/\&/gi;
|
||||
# $_[0]=~s/\</\</gi;
|
||||
# $_[0]=~s/\>/\>/gi;
|
||||
# $_[0]=~s/\"/\"/gi;
|
||||
|
||||
## $_[0]=~s/\n/<br\/>/gi;
|
||||
## $_[0]=~s/\&amp;/\&/gi;
|
||||
## $_[0]=~s/\&amp;/+/gi;
|
||||
## $_[0]=~s/\&/+/gi;
|
||||
## $_[0]=~s/\&/+/gi;
|
||||
# return $_[0];
|
||||
}
|
||||
|
||||
sub fix_utf8{
|
||||
$_[0] = decode( 'cp1252', $_[0] );
|
||||
return $_[0];
|
||||
}
|
||||
|
||||
sub uri_encode{
|
||||
$_[0] =~s/([^a-zA-Z0-9_\.\-])/sprintf("%%%02lx",ord($1))/esg;
|
||||
return $_[0];
|
||||
}
|
||||
|
||||
sub compress{
|
||||
my $header='';
|
||||
|
||||
if($_[0]=~/(Content\-type\:[^\n]+[\n]+)/){
|
||||
$header=$1;
|
||||
}else{
|
||||
#return;
|
||||
}
|
||||
|
||||
my $start=index($_[0],$header);
|
||||
return if ($start<0);
|
||||
|
||||
my $header_length=length($header);
|
||||
$header =substr($_[0],0,$start+$header_length);
|
||||
# print $header."\n";
|
||||
|
||||
my $content=substr($_[0],$start+$header_length);
|
||||
|
||||
# #remove multiple line breaks
|
||||
$content=~s/[\r\n]+[\s]*[\r\n]+/\n/g;
|
||||
|
||||
#remove leading whitespaces
|
||||
$content=~s/[\r\n]+[\s]+/\n/g;
|
||||
|
||||
#remove tailing whitespaces
|
||||
$content=~s/[\t ]*[\r\n]+/\n/g;
|
||||
|
||||
#remove whitespaces inside tags
|
||||
$content=~s/([\n]\<[^\n]+)[\r\n]+/$1 /g;
|
||||
$content=~s/\"\s+\>/\"\>/g;
|
||||
|
||||
#get closing tags closer
|
||||
$content=~s/[\r\n]+(\<[\/\!])/$1/g;
|
||||
$content=~s/(\>)[\r\n]+([^\<])/$1$2/g;
|
||||
|
||||
#remove leading whitespaces
|
||||
#$content=~s/[\r\n]+([\d\S])/$1/g;
|
||||
|
||||
#remove empty lines
|
||||
$content=~s/[\n\r]+/\n/g;
|
||||
|
||||
#remove whitespaces between tags
|
||||
$content=~s/\>[\t ]+\<(^\/T)/\>\<$1/g;
|
||||
|
||||
#multiple whitespaces
|
||||
$content=~s/[\t ]+/ /g;
|
||||
|
||||
#restore content-type line break
|
||||
$_[0]=$header.$content;
|
||||
#$_[0]=~s/HTTP_CONTENT_TYPE/\n\n/;
|
||||
# return $_[0];
|
||||
}
|
||||
|
||||
#from XML::RSS.pm
|
||||
my %entity = (
|
||||
nbsp => " ",
|
||||
iexcl => "¡",
|
||||
cent => "¢",
|
||||
pound => "£",
|
||||
curren => "¤",
|
||||
yen => "¥",
|
||||
brvbar => "¦",
|
||||
sect => "§",
|
||||
uml => "¨",
|
||||
copy => "©",
|
||||
ordf => "ª",
|
||||
laquo => "«",
|
||||
not => "¬",
|
||||
shy => "­",
|
||||
reg => "®",
|
||||
macr => "¯",
|
||||
deg => "°",
|
||||
plusmn => "±",
|
||||
sup2 => "²",
|
||||
sup3 => "³",
|
||||
acute => "´",
|
||||
micro => "µ",
|
||||
para => "¶",
|
||||
middot => "·",
|
||||
cedil => "¸",
|
||||
sup1 => "¹",
|
||||
ordm => "º",
|
||||
raquo => "»",
|
||||
frac14 => "¼",
|
||||
frac12 => "½",
|
||||
frac34 => "¾",
|
||||
iquest => "¿",
|
||||
Agrave => "À",
|
||||
Aacute => "Á",
|
||||
Acirc => "Â",
|
||||
Atilde => "Ã",
|
||||
Auml => "Ä",
|
||||
Aring => "Å",
|
||||
AElig => "Æ",
|
||||
Ccedil => "Ç",
|
||||
Egrave => "È",
|
||||
Eacute => "É",
|
||||
Ecirc => "Ê",
|
||||
Euml => "Ë",
|
||||
Igrave => "Ì",
|
||||
Iacute => "Í",
|
||||
Icirc => "Î",
|
||||
Iuml => "Ï",
|
||||
ETH => "Ð",
|
||||
Ntilde => "Ñ",
|
||||
Ograve => "Ò",
|
||||
Oacute => "Ó",
|
||||
Ocirc => "Ô",
|
||||
Otilde => "Õ",
|
||||
Ouml => "Ö",
|
||||
times => "×",
|
||||
Oslash => "Ø",
|
||||
Ugrave => "Ù",
|
||||
Uacute => "Ú",
|
||||
Ucirc => "Û",
|
||||
Uuml => "Ü",
|
||||
Yacute => "Ý",
|
||||
THORN => "Þ",
|
||||
szlig => "ß",
|
||||
agrave => "à",
|
||||
aacute => "á",
|
||||
acirc => "â",
|
||||
atilde => "ã",
|
||||
auml => "ä",
|
||||
aring => "å",
|
||||
aelig => "æ",
|
||||
ccedil => "ç",
|
||||
egrave => "è",
|
||||
eacute => "é",
|
||||
ecirc => "ê",
|
||||
euml => "ë",
|
||||
igrave => "ì",
|
||||
iacute => "í",
|
||||
icirc => "î",
|
||||
iuml => "ï",
|
||||
eth => "ð",
|
||||
ntilde => "ñ",
|
||||
ograve => "ò",
|
||||
oacute => "ó",
|
||||
ocirc => "ô",
|
||||
otilde => "õ",
|
||||
ouml => "ö",
|
||||
divide => "÷",
|
||||
oslash => "ø",
|
||||
ugrave => "ù",
|
||||
uacute => "ú",
|
||||
ucirc => "û",
|
||||
uuml => "ü",
|
||||
yacute => "ý",
|
||||
thorn => "þ",
|
||||
yuml => "ÿ",
|
||||
);
|
||||
|
||||
my $entities = join('|', keys %entity);
|
||||
|
||||
sub encode_xml_element {
|
||||
my $text = shift;
|
||||
|
||||
my $encoded_text = '';
|
||||
|
||||
while ( $text =~ s/(.*?)(\<\!\[CDATA\[.*?\]\]\>)//s ) {
|
||||
$encoded_text .= encode_xml_element_text($1) . $2;
|
||||
}
|
||||
$encoded_text .= encode_xml_element_text($text);
|
||||
|
||||
return $encoded_text;
|
||||
}
|
||||
|
||||
sub encode_xml_element_text {
|
||||
my $text = shift;
|
||||
|
||||
$text =~ s/&(?!(#[0-9]+|#x[0-9a-fA-F]+|\w+);)/&/g;
|
||||
$text =~ s/&($entities);/$entity{$1}/g;
|
||||
$text =~ s/\</\<\;/g;
|
||||
$text =~ s/\>/\>\;/g;
|
||||
|
||||
return $text;
|
||||
}
|
||||
|
||||
#do not delete last line!
|
||||
1;
|
||||
Reference in New Issue
Block a user