package markup; use strict; use warnings; no warnings 'redefine'; use Data::Dumper; use Text::WikiCreole(); use HTML::Parse(); use HTML::FormatText(); use Encode(); use HTML::Entities(); use Text::Markdown(); use log(); our @EXPORT_OK = qw(fix_line_ends html_to_creole creole_to_html creole_to_plain plain_to_ical ical_to_plain ical_to_xml html_to_plain fix_utf8 uri_encode compress base26); sub fix_line_ends ($) { my ($s) = @_; $s =~ s/\r?\n|\r/\n/g; return $s; } # convert 1..26 to a..z, 27 to aa, inspired by ConvertAA sub base26($) { my ($num) = @_; return '' if $num <= 0; my $s = ""; while ($num) { $s = chr( --$num % 26 + ord "a" ) . $s; $num = int $num / 26; } return $s; } sub html_to_creole($) { my ($s) = @_; #remove elements $s =~ s/\<\!\-\-[\s\S]*?\-\-\>//gi; $s =~ s/.*?<\/script.*?>//gi; $s =~ s/<\/?form.*?>//gi; $s =~ s/<\/?select.*?>//gi; $s =~ s/<\/?option.*?//gi; $s =~ s/<\/?input.*?>//gi; $s =~ s/<\/?script.*?>//gi; #remove line breaks $s =~ s/[\r\n]+/ /gi; #formats $s =~ s//{{$1\|}}/gi; $s =~ s//{{$2\|$1}}/gi; $s =~ s/]*?title="(.*?)".*?>/{{$1\|$2}}/gi; $s =~ s/]*?src="(.*?)".*?>/{{$2\|$1}}/gi; $s =~ s/<\/?img.*?>//gi; #replace line breaks from images $s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s =~ s/(.*?)<\/i>/\/\/$1\/\//gi; $s =~ s/<\/?i.*?>//gi; $s =~ s/(.*?)<\/b>/\*\*$1\*\*/gi; $s =~ s/(.*?)<\/strong>/\*\*$1\*\*/gi; $s =~ s/(.*?)<\/em>/\/\/$1\/\//gi; $s =~ s/((\W+|\w+)*?)<\/blockquote>/{{{$1}}}/gi; $s =~ s/(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi; $s =~ s///gi; #replace line breaks from links $s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s =~ s/[\s]+/ /gi; #line elements, increase head line level to avoid breaking single = chars $s =~ s/\s*/== /gi; $s =~ s/\s*/=== /gi; $s =~ s/\s*/==== /gi; $s =~ s/\s*/===== /gi; my $tree = HTML::Parse::parse_html( '' . $s . '' ); my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 2000 ); $s = $formatter->format($tree); $s =~ s/\(.*?)(\s*)<\/a>/\[\[$1\|$2\]\]$3/gi; $s =~ s///gi; $s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s =~ s/(\[\[[^\]\n]*?)\n([^\]]*?\]\])/$1$2/g; $s =~ s/^\s+//g; $s =~ s/\s+$//g; $s = Text::WikiCreole::creole_parse($s) || ''; #replace line breaks from images $s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; $s =~ s/(\{\{[^\}\n]*?)\n([^\}\n]*?\}\})/$1$2/g; #remove whitespaces and break lines at start or end of elements for my $elem ( 'p', 'li' ) { $s =~ s|<$elem>\s*

|<$elem>|g; $s =~ s|

\s*||g; } return $s; } sub markdown_to_html($){ my $text = $_[0] // ''; my $html = Text::Markdown::markdown($text); $html=~s!(\s)(https?://[^\s]+)(\s)!$1$2$3!g; return $html; } sub creole_to_plain($) { my ($s) = @_; $s =~ s/\/\n/gi; $s =~ s/\{\{\{((\W+|\w+)+?)\}\}\}/
$1<\/blockquote>/g; $s =~ s/\{\{(.+?)\|(.*?)\}\}//g; $s =~ s/\[\[(.+?)\|(.*?)\]\]/$2/g; $s =~ s/\/\/([^\/\/]*?)\/\//$1<\/em> /g; $s =~ s/\n=== (.*?)\n/\n

$1<\/h3>\n/g; $s =~ s/\n== (.*?)\n/\n

$1<\/h2>\n/g; $s =~ s/\*\*(.*?)\*\*/$1<\/strong> /g; $s =~ s/^== (.*?)\n/

$1<\/h2>\n/g; $s =~ s/\n\* (.*?)\n/\n
  • $1<\/li>\n/g; $s =~ s/\n\* (.*?)\n/\n
  • $1<\/li>\n/g; $s =~ s/\n\- (.*?)\n/\n$1<\/lo>\n/g; $s =~ s/\n\- (.*?)\n/\n$1<\/lo>\n/g; $s =~ s/\n\n/\n

    /gi; $s =~ s/\n/\n/gi; return $s; } sub html_to_plain ($) { my ($s) = @_; return '' unless defined $s; my $tree = HTML::Parse::parse_html( '' . $s . '' ); my $formatter = HTML::FormatText->new( leftmargin => 0, rightmargin => 2000 ); $s = $formatter->format($tree); return $s; } sub ical_to_plain ($) { return '' unless defined( $_[0] ); $_[0] =~ s/\\n/\n/gi; $_[0] =~ s/ /\t/gi; $_[0] =~ s/\\\./\./gi; $_[0] =~ s/\\\,/\,/gi; $_[0] =~ s/\\\\/\\/gi; return $_[0]; } sub plain_to_ical ($) { return '' unless defined( $_[0] ); #remove images + links $_[0] =~ s/\[\[.+?\|(.+?)\]\]/$1/g; $_[0] =~ s/\{\{.+?\}\}//g; $_[0] =~ s/^\s+//g; $_[0] =~ s/\\/\\\\/gi; $_[0] =~ s/\,/\\\,/gi; # $_[0]=~s/\./\\\./gi; $_[0] =~ s/[\r\n]/\\n/gi; $_[0] =~ s/\t/ /gi; return $_[0]; } sub plain_to_xml($) { return '' unless defined( $_[0] ); $_[0] =~ s/\n\={1,6} (.*?)\s+/\n\[\[$1\]\]\n/gi; #remove images + links $_[0] =~ s/\[\[.+?\|(.+?)\]\]/$1/g; $_[0] =~ s/\{\{.+?\}\}//g; return encode_xml_element( $_[0] ); } sub fix_utf8($) { $_[0] = Encode::decode( 'cp1252', $_[0] ); return $_[0]; } sub uri_encode ($) { $_[0] =~ s/([^a-zA-Z0-9_\.\-])/sprintf("%%%02lx",ord($1))/esg; return $_[0]; } sub compress ($) { my $header = ''; if ( $_[0] =~ /(Content\-type\:[^\n]+[\n]+)/ ) { $header = $1; } my $start = index( $_[0], $header ); return if ( $start < 0 ); my $header_length = length($header); $header = substr( $_[0], 0, $start + $header_length ); my $content = substr( $_[0], $start + $header_length ); #remove multiple line breaks $content =~ s/[\r\n]+[\s]*[\r\n]+/\n/g; #remove leading whitespaces $content =~ s/[\r\n]+[\s]+/\n/g; #remove tailing whitespaces $content =~ s/[\t ]*[\r\n]+/\n/g; #remove whitespaces inside tags $content =~ s/([\n]\<[^\n]+)[\r\n]+/$1 /g; $content =~ s/\"\s+\>/\"\>/g; #get closing tags closer $content =~ s/[\r\n]+(\<[\/\!])/$1/g; $content =~ s/(\>)[\r\n]+([^\<])/$1$2/g; #remove leading whitespaces #$content=~s/[\r\n]+([\d\S])/$1/g; #remove empty lines $content =~ s/[\n\r]+/\n/g; #remove whitespaces between tags $content =~ s/\>[\t ]+\<(^\/T)/\>\<$1/g; #multiple whitespaces $content =~ s/[\t ]+/ /g; #restore content-type line break $_[0] = $header . $content; #$_[0]=~s/HTTP_CONTENT_TYPE/\n\n/; # return $_[0]; } #from XML::RSS.pm my %entity = ( nbsp => " ", iexcl => "¡", cent => "¢", pound => "£", curren => "¤", yen => "¥", brvbar => "¦", sect => "§", uml => "¨", copy => "©", ordf => "ª", laquo => "«", not => "¬", shy => "­", reg => "®", macr => "¯", deg => "°", plusmn => "±", sup2 => "²", sup3 => "³", acute => "´", micro => "µ", para => "¶", middot => "·", cedil => "¸", sup1 => "¹", ordm => "º", raquo => "»", frac14 => "¼", frac12 => "½", frac34 => "¾", iquest => "¿", Agrave => "À", Aacute => "Á", Acirc => "Â", Atilde => "Ã", Auml => "Ä", Aring => "Å", AElig => "Æ", Ccedil => "Ç", Egrave => "È", Eacute => "É", Ecirc => "Ê", Euml => "Ë", Igrave => "Ì", Iacute => "Í", Icirc => "Î", Iuml => "Ï", ETH => "Ð", Ntilde => "Ñ", Ograve => "Ò", Oacute => "Ó", Ocirc => "Ô", Otilde => "Õ", Ouml => "Ö", times => "×", Oslash => "Ø", Ugrave => "Ù", Uacute => "Ú", Ucirc => "Û", Uuml => "Ü", Yacute => "Ý", THORN => "Þ", szlig => "ß", agrave => "à", aacute => "á", acirc => "â", atilde => "ã", auml => "ä", aring => "å", aelig => "æ", ccedil => "ç", egrave => "è", eacute => "é", ecirc => "ê", euml => "ë", igrave => "ì", iacute => "í", icirc => "î", iuml => "ï", eth => "ð", ntilde => "ñ", ograve => "ò", oacute => "ó", ocirc => "ô", otilde => "õ", ouml => "ö", divide => "÷", oslash => "ø", ugrave => "ù", uacute => "ú", ucirc => "û", uuml => "ü", yacute => "ý", thorn => "þ", yuml => "ÿ", ); my $entities = join( '|', keys %entity ); sub encode_xml_element($) { my ($text) = @_; my $encoded_text = ''; while ( $text =~ s/(.*?)(\<\!\[CDATA\[.*?\]\]\>)//s ) { $encoded_text .= encode_xml_element_text($1) . $2; } $encoded_text .= encode_xml_element_text($text); return $encoded_text; } sub encode_xml_element_text ($) { my ($text) = @_; $text =~ s/&(?!(#[0-9]+|#x[0-9a-fA-F]+|\w+);)/&/g; $text =~ s/&($entities);/$entity{$1}/g; $text =~ s/\/\>\;/g; return $text; } sub escapeHtml($) { my ($s) = @_; return HTML::Entities::encode_entities( $s, q{&<>"'} ); } #do not delete last line! 1;