use warnings "all"; use strict; use Data::Dumper; use Text::WikiCreole; use HTML::Parse; use HTML::FormatText; use log; package markup; use Encode; require Exporter; our @ISA = qw(Exporter); #our @EXPORT = qw(all); our @EXPORT_OK = qw(fix_line_ends html_to_creole creole_to_html creole_to_plain plain_to_ical ical_to_plain ical_to_xml html_to_plain fix_utf8 uri_encode compress base26); our %EXPORT_TAGS = ( 'all' => [ @EXPORT_OK ] ); sub fix_line_ends{ my $s=shift; $s=~s/\r?\n|\r/\n/g; return $s; } # convert 1..26 to a..z, 27 to aa, inspired by ConvertAA sub base26{ my $num = shift ; return '' if $num <= 0; my $s = ""; while($num){ $s= chr (--$num % 26 + ord "a").$s; $num = int $num/26; } return $s; } sub html_to_creole{ my $s=shift; #remove elements # $s=~s/[\r\f\n]+/\n/gi; # $s=~s/<\/p.*?>//gi; # $s=~s/<\/br.*?>//gi; $s=~s/\<\!\-\-[\s\S]*?\-\-\>//gi; $s=~s/.*?<\/script.*?>//gi; # $s=~s/<\/?span.*?>//gi; # $s=~s/<\/?font.*?>//gi; # $s=~s/<\/?meta.*?>//gi; # $s=~s/<\/?title.*?>//gi; # $s=~s/<\/?style.*?>//gi; # $s=~s/<\/?col.*?>//gi; # $s=~s/<\/?thead.*?>//gi; # $s=~s/<\/?tbody.*?>//gi; $s=~s/<\/?form.*?>//gi; $s=~s/<\/?select.*?>//gi; $s=~s/<\/?option.*?//gi; $s=~s/<\/?input.*?>//gi; # $s=~s/<\/?button.*?>//gi; # $s=~s/<\/?textarea.*?>//gi; $s=~s/<\/?script.*?>//gi; #table elements # $s=~s/\s*<\/?td.*?>//gi; # $s=~s/\s*<\/?th.*?>//gi; #remove line breaks $s=~s/[\r\n]+/ /gi; #formats $s=~s//{{$1\|}}/gi; $s=~s//{{$2\|$1}}/gi; $s=~s/]*?title="(.*?)".*?>/{{$1\|$2}}/gi; $s=~s/]*?src="(.*?)".*?>/{{$2\|$1}}/gi; $s=~s/<\/?img.*?>//gi; #replace line breaks from images $s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s=~s/(.*?)<\/i>/\/\/$1\/\//gi; $s=~s/<\/?i.*?>//gi; $s=~s/(.*?)<\/b>/\*\*$1\*\*/gi; # $s=~s/<\/?b.*?>//gi; $s=~s/(.*?)<\/strong>/\*\*$1\*\*/gi; $s=~s/(.*?)<\/em>/\/\/$1\/\//gi; $s=~s/((\W+|\w+)*?)<\/blockquote>/{{{$1}}}/gi; # $s=~s/((\W+|\w+)*?)<\/a>/\[\[$1\|$2\]\]$3/gi; $s=~s/(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi; $s=~s///gi; #replace line breaks from links $s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; # print STDERR Dumper($s) if ($s=~//== /gi; $s=~s/\s*/=== /gi; $s=~s/\s*/==== /gi; $s=~s/\s*/===== /gi; # $s=~s/\s*<\/h\d.*?>/\n/gi; # $s=~s//\\\\
/gi; # $s=~s/\s*//gi; # $s=~s/\s*<\/div>/\n/gi; # $s=~s//\n/gi; # $s=~s/<\/table>/\n/gi; # $s=~s/\s*//gi; # $s=~s/\s*<\/tr>//gi; # $s=~s/\s*/\n/gi; # $s=~s/\s*<\/ol>/\n/gi; # $s=~s/\s*/\n/gi; # $s=~s/\s*<\/ul>/\n/gi; # $s=~s/\s*/\n\* /gi; # $s=~s/\s*<\/li>//gi; # $s=~s/\s*\s*/\n\n/gi; # $s=~s/\s*\s*/\n /gi; my $tree=HTML::Parse::parse_html(''.$s.''); my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 2000); $s= $formatter->format($tree); #use Data::Dumper; print "asd:"; $s=~s/\//gi; return $s; } sub creole_to_html{ my $s=$_[0]||''; #$s=~s/\n\#\n/\n/g; #fix_line_ends($s); $s=~s/(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi; $s=~s///gi; $s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s=~s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s=~s/^\s+//g; $s=~s/\s+$//g; $s=Text::WikiCreole::creole_parse($s)||''; # $s=~s/

/\n/gi; # $s=~s/\{\{\{((\W+|\w+)+?)\}\}\}/

$1<\/blockquote>/g; # $s=~s/\{\{(.+?)\|(.*?)\}\}//g; # $s=~s/\[\[(.+?)\|(.*?)\]\]/$2<\/a>/g; # $s=~s/([^\:])\/\/(.*?[^\:])\/\//$1$2<\/em> /g; # $s=~s/\n=== (.*?)\n/

$1<\/h3>\n/g; # $s=~s/\n== (.*?)\n/

$1<\/h2>\n/g; #replace line breaks from images $s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s=~s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; #replace line breaks from links # $s=~s/\n= (.*?)\n/

$1<\/h1>\n/g; # $s=~s/\*\*(.*?)\*\*/$1<\/strong> /g; # $s=~s/^== (.*?)\n/

$1<\/h2>\n/g; # $s=~s/\n\* (.*?)([\r\n]+)/
  • $1<\/li>\n/g; # $s=~s/\n\- (.*?)\n/$1<\/lo>\n/g; # $s=~s/\n\n/

    /gi; # $s=~s/\n+/
    /gi; # $s=~s/\\s*

    |<$elem>|g; $s=~s|

    \s*||g; } return $s; } sub creole_to_plain{ my $s=shift; $s=~s/\/\n/gi; $s=~s/\{\{\{((\W+|\w+)+?)\}\}\}/

    $1<\/blockquote>/g; $s=~s/\{\{(.+?)\|(.*?)\}\}//g; $s=~s/\[\[(.+?)\|(.*?)\]\]/$2/g; $s=~s/\/\/([^\/\/]*?)\/\//$1<\/em> /g; $s=~s/\n=== (.*?)\n/\n

    $1<\/h3>\n/g; $s=~s/\n== (.*?)\n/\n

    $1<\/h2>\n/g; $s=~s/\*\*(.*?)\*\*/$1<\/strong> /g; $s=~s/^== (.*?)\n/

    $1<\/h2>\n/g; $s=~s/\n\* (.*?)\n/\n
  • $1<\/li>\n/g; $s=~s/\n\* (.*?)\n/\n
  • $1<\/li>\n/g; $s=~s/\n\- (.*?)\n/\n$1<\/lo>\n/g; $s=~s/\n\- (.*?)\n/\n$1<\/lo>\n/g; $s=~s/\n\n/\n

    /gi; $s=~s/\n/\n/gi; return $s; } sub html_to_plain{ my $s=shift; return '' unless (defined $s); my $tree=HTML::Parse::parse_html(''.$s.''); my $formatter = HTML::FormatText->new(leftmargin => 0, rightmargin => 2000); $s= $formatter->format($tree); return $s; } sub ical_to_plain{ return '' unless defined ($_[0]); $_[0]=~s/\\n/\n/gi; $_[0]=~s/ /\t/gi; $_[0]=~s/\\\./\./gi; $_[0]=~s/\\\,/\,/gi; $_[0]=~s/\\\\/\\/gi; return $_[0]; } sub plain_to_ical{ return '' unless defined ($_[0]); #remove images + links $_[0]=~s/\[\[.+?\|(.+?)\]\]/$1/g; $_[0]=~s/\{\{.+?\}\}//g; $_[0]=~s/^\s+//g; $_[0]=~s/\\/\\\\/gi; $_[0]=~s/\,/\\\,/gi; # $_[0]=~s/\./\\\./gi; $_[0]=~s/[\r\n]/\\n/gi; $_[0]=~s/\t/ /gi; return $_[0]; } sub plain_to_xml{ return '' unless defined ($_[0]); $_[0]=~s/\n\={1,6} (.*?)\s+/\n\[\[$1\]\]\n/gi; #remove images + links $_[0]=~s/\[\[.+?\|(.+?)\]\]/$1/g; $_[0]=~s/\{\{.+?\}\}//g; return encode_xml_element($_[0]); # $_[0]=~s/\ä/ä/gi; # $_[0]=~s/\ö/ö/gi; # $_[0]=~s/\ü/ü/gi; # $_[0]=~s/\Ä/Ä/gi; # $_[0]=~s/\Ö/Ö/gi; # $_[0]=~s/\Ü/Ü/gi; # $_[0]=~s/\ß/ß/gi; # $_[0]=~s/\&/\&/gi; # $_[0]=~s/\/\>/gi; # $_[0]=~s/\"/\"/gi; ## $_[0]=~s/\n//gi; ## $_[0]=~s/\&amp;/\&/gi; ## $_[0]=~s/\&amp;/+/gi; ## $_[0]=~s/\&/+/gi; ## $_[0]=~s/\&/+/gi; # return $_[0]; } sub fix_utf8{ $_[0] = decode( 'cp1252', $_[0] ); return $_[0]; } sub uri_encode{ $_[0] =~s/([^a-zA-Z0-9_\.\-])/sprintf("%%%02lx",ord($1))/esg; return $_[0]; } sub compress{ my $header=''; if($_[0]=~/(Content\-type\:[^\n]+[\n]+)/){ $header=$1; }else{ #return; } my $start=index($_[0],$header); return if ($start<0); my $header_length=length($header); $header =substr($_[0],0,$start+$header_length); # print $header."\n"; my $content=substr($_[0],$start+$header_length); # #remove multiple line breaks $content=~s/[\r\n]+[\s]*[\r\n]+/\n/g; #remove leading whitespaces $content=~s/[\r\n]+[\s]+/\n/g; #remove tailing whitespaces $content=~s/[\t ]*[\r\n]+/\n/g; #remove whitespaces inside tags $content=~s/([\n]\<[^\n]+)[\r\n]+/$1 /g; $content=~s/\"\s+\>/\"\>/g; #get closing tags closer $content=~s/[\r\n]+(\<[\/\!])/$1/g; $content=~s/(\>)[\r\n]+([^\<])/$1$2/g; #remove leading whitespaces #$content=~s/[\r\n]+([\d\S])/$1/g; #remove empty lines $content=~s/[\n\r]+/\n/g; #remove whitespaces between tags $content=~s/\>[\t ]+\<(^\/T)/\>\<$1/g; #multiple whitespaces $content=~s/[\t ]+/ /g; #restore content-type line break $_[0]=$header.$content; #$_[0]=~s/HTTP_CONTENT_TYPE/\n\n/; # return $_[0]; } #from XML::RSS.pm my %entity = ( nbsp => " ", iexcl => "¡", cent => "¢", pound => "£", curren => "¤", yen => "¥", brvbar => "¦", sect => "§", uml => "¨", copy => "©", ordf => "ª", laquo => "«", not => "¬", shy => "­", reg => "®", macr => "¯", deg => "°", plusmn => "±", sup2 => "²", sup3 => "³", acute => "´", micro => "µ", para => "¶", middot => "·", cedil => "¸", sup1 => "¹", ordm => "º", raquo => "»", frac14 => "¼", frac12 => "½", frac34 => "¾", iquest => "¿", Agrave => "À", Aacute => "Á", Acirc => "Â", Atilde => "Ã", Auml => "Ä", Aring => "Å", AElig => "Æ", Ccedil => "Ç", Egrave => "È", Eacute => "É", Ecirc => "Ê", Euml => "Ë", Igrave => "Ì", Iacute => "Í", Icirc => "Î", Iuml => "Ï", ETH => "Ð", Ntilde => "Ñ", Ograve => "Ò", Oacute => "Ó", Ocirc => "Ô", Otilde => "Õ", Ouml => "Ö", times => "×", Oslash => "Ø", Ugrave => "Ù", Uacute => "Ú", Ucirc => "Û", Uuml => "Ü", Yacute => "Ý", THORN => "Þ", szlig => "ß", agrave => "à", aacute => "á", acirc => "â", atilde => "ã", auml => "ä", aring => "å", aelig => "æ", ccedil => "ç", egrave => "è", eacute => "é", ecirc => "ê", euml => "ë", igrave => "ì", iacute => "í", icirc => "î", iuml => "ï", eth => "ð", ntilde => "ñ", ograve => "ò", oacute => "ó", ocirc => "ô", otilde => "õ", ouml => "ö", divide => "÷", oslash => "ø", ugrave => "ù", uacute => "ú", ucirc => "û", uuml => "ü", yacute => "ý", thorn => "þ", yuml => "ÿ", ); my $entities = join('|', keys %entity); sub encode_xml_element { my $text = shift; my $encoded_text = ''; while ( $text =~ s/(.*?)(\<\!\[CDATA\[.*?\]\]\>)//s ) { $encoded_text .= encode_xml_element_text($1) . $2; } $encoded_text .= encode_xml_element_text($text); return $encoded_text; } sub encode_xml_element_text { my $text = shift; $text =~ s/&(?!(#[0-9]+|#x[0-9a-fA-F]+|\w+);)/&/g; $text =~ s/&($entities);/$entity{$1}/g; $text =~ s/\/\>\;/g; return $text; } #do not delete last line! 1;