roffit

#!/usr/bin/env perl
##########################################################################
#
# Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
#
# This software is licensed as described in the file LICENSE, which
# you should have received as part of this distribution.
#
# You may opt to use, copy, modify, merge, publish, distribute and/or sell
# copies of the Software, and permit persons to whom the Software is
# furnished to do so, under the terms of the COPYING file.
#
# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
# KIND, either express or implied.
#
# SPDX-License-Identifier: curl
#
##########################################################################
#
# Read an nroff file. Output a HTML file.
#
# Web:     https://daniel.haxx.se/projects/roffit

my $version = "0.15";

use strict;
#use warnings;
use HTML::Entities qw(encode_entities);
use File::Basename;
sub do_encode($) {
    return encode_entities(shift, q{<>&"'#});
}

my $InFH = \*STDIN;
my $OutFH = \*STDOUT;
my $debugFH = \*STDERR;

my %manpage;
my @out;

my $indentlevel=0; # logical levels, not columns
my @p;
my $within_tp;
my $standalone=1; # by default we make stand-alone HTML pages
my $pre;
my %anchor; # hash with all anchors

my @mandirs;
my $mandir;
my $hrefdir=".";
my $filename;
my $leavecase;

while($ARGV[0]) {
    if($ARGV[0] eq "--bare") {
        # don't include headers and stuff
        $standalone=0;
        shift @ARGV;
    }
    elsif($ARGV[0] eq "--version") {
        print "roffit $version (https://daniel.haxx.se/projects/roffit/)\n";
        exit;
    }
    elsif($ARGV[0] eq "--preserve-case") {
        $leavecase = 1;
        exit;
    }
    elsif($ARGV[0] =~ /^--mandir=(.*)/) {
        # check for other HTMLized man pages in these given dirs
        # can be specified multiple times
        push @mandirs, $1;
        $mandir = 1; # option used
        shift @ARGV;
    }
    elsif($ARGV[0] =~ /^--hrefdir=(.*)/) {
        # if another manpage is found in the --mandir, this is the dir we
        # prefix the HTML file name with in the output HTML
        $hrefdir=$1;
        shift @ARGV;
    }
    elsif($ARGV[0] =~ /^(--help|-h)/) {
        print $debugFH "Usage: roffit [options] [infile] < infile > outfile\n",
            "Options:\n",
            " --bare          do not put in HTML, HEAD, BODY tags\n",
            " --hrefdir=<dir> if a manpage is found in the --mandir, this is the\n",
            "                 dir we prefix the HTML file name with in the output\n",
            " --mandir=<dir>  check for other HTMLized man pages in these dirs\n",
            " --preserve-case leave case of section headers untouched\n",
            " --version       display roffit version and exit\n";
            
        exit;
    }
    else {
        $filename=$ARGV[0];
        last;
    }
}

sub strlen($) { defined($_[0]) ? length($_[0]) : 0; }

sub showp {
    my @p = @_;
    if(@p && join("", @p)) {
        push @out, "\n<p class=\"level$indentlevel\">" if(!$pre);
        push @out, @p;
        push @out, "</p>" if(!$pre);
    }
}

sub defaultcss {
    print $OutFH <<ENDOFCSS
<STYLE type="text/css">
pre {
  overflow: auto;
  margin: 0;
}

P.level0, pre.level0 {
 padding-left: 2em;
}

P.level1, pre.level1 {
 padding-left: 4em;
}

P.level2, pre.level2 {
 padding-left: 6em;
}

span.emphasis {
 font-style: italic;
}

span.bold {
 font-weight: bold;
}

span.manpage {
 font-weight: bold;
}

h2.nroffsh {
 background-color: #e0e0e0;
}

span.nroffip {
 font-weight: bold;
 font-size: 120%;
 font-family: monospace;
}

p.roffit {
 text-align: center;
 font-size: 80%;
}
</STYLE>
ENDOFCSS
    ;
}

sub text2name {
    my ($text) = @_;
    $text =~ s/^ *([^ ]*).*/$1/g;
    $text =~ s/[^a-zA-Z0-9-]//g;
    return $text;
}

# scan through the file and check for <span> sections we should convert
# to proper links. ignore preformatted text.
sub linkfile {
    my $preformatted;
    my @new;
    for(@out) {
        my $remaining = $_;
        my $processed = "";
        while(strlen($remaining)) {
            my $field;
            my $tail;
            $remaining =~ /(?:(.*?)(<\/?pre.*?>))?(.*)/s;
            # $1 is anything before the first encountered pre tag
            # $2 is that pre tag (starts with <pre or </pre)
            # $3 is anything remaining
            if(strlen($2)) {
                $remaining = (strlen($3) ? $3 : ""); # for the next iteration
                $preformatted = ("<pre" eq substr($2, 0, 4));
                if(!strlen($1) || !$preformatted) {
                    # $1 is empty or preformatted (eg foo</pre>bar)
                    $processed .= (strlen($1) ? $1 : "") . $2;
                    next;
                }
                # $1 is not empty and not preformatted
                $field = $1;
                $tail = $2;
            }
            else {
                # $2 is empty which means $1 is empty and $3 contains no pre
                # we can process the remaining part of the line all at once
                $remaining = "";
                if(!strlen($3) || $preformatted) {
                    # $3 is empty or preformatted from previous tag
                    $processed .= (strlen($3) ? $3 : "");
                    last;
                }
                # $3 is not empty and not preformatted
                $field = $3;
                $tail = "";
            }

            while($field =~ s/<span class=\"(manpage|emphasis|bold)\">([^<]*)<\/span>/[]/) {
                my ($style, $name)=($1, $2);

                my $l = text2name($name);

                my $link;
                if($anchor{$l}) {
                    $link="<a class=\"$style\" href=\"#$l\">$name</a>";
                }
                else {
                    if($mandir) {
                        my ($symbol, $section);
                        if ($name =~ /^ *([A-Za-z_0-9-]*) *\( *(\d+[A-Za-z]*) *\) *$/) {

                            # this looks like a reference to another man page, and
                            # we have asked for this feature! We check for the
                            # specified nroff file and not the HTML version, to
                            # avoid depending on which order the set of files are
                            # converted to HTML!
                            ($symbol, $section)=($1, lc $2);
                        }
                        elsif ($field =~ /\[\] *\( *(\d+[A-Za-z]*) *\)/) {
                            # Also look for references where the man page
                            # section is outside the span.
                            $section = lc $1;
                            ($symbol) = ($name =~ /^ *([A-Za-z_0-9-]+)/);
                        }
                        if ($symbol && $section) {
                            for my $d (@mandirs) {
                                my $file = "$d/$symbol.$section";

                                if(-r $file) {
                                    my $html = "$hrefdir/$symbol.html";
                                    # there is such a HTML file present, produce a
                                    # link to it!
                                    $link = "<a Class=\"$style\" href=\"$html\">$symbol</a>";
                                    last; # skip out of loop
                                }
                            }
                        }
                    }
                    if(!$link) {
                        $link = "<span Class=\"$style\">$name</span>";
                    }
                }
                $field =~ s/\[\]/$link/;
            }

            # convert https://, http:// and ftp:// URLs to <a href> links
            $field =~ s/(^|\W)((https|http|ftp):\/\/[a-z0-9\-._~%:\/?\#\[\]\@!\$&'()*+,;=]+)/$1<a href=\"$2\">$2<\/a>/gi;

            # convert (uppercase only) "RFC [number]" to a link
            $field =~ s/(^|\W)RFC ?(\d+)/$1<a href=\"https:\/\/www.ietf.org\/rfc\/rfc$2.txt\">RFC $2<\/a>/g;

            $processed .= $field . $tail;
        }
        push @new, $processed;
    }
    return @new;
}

sub handle_italic_bold ($) {
    $_ = shift;
    $_ =~ s/\\fI/<span class=\"emphasis\">/g;
    $_ =~ s/\\fB/<span class=\"bold\">/g;
    $_ =~ s/\\f[PR]/<\/span>/g;
    return $_;
}

sub split_quoted ($) {
    my $t = shift;
    my @tokens;
    my $quoted=0;
    for my $chunk (split /(?<![^\\]\\)"/, $t) {
        if ($quoted) {
            # "text in quotes" don't split
            push @tokens, $chunk;
        }
        else {
            # Outside of quotes, split on whitespace
            push @tokens, split(' ', $chunk);
        }
        $quoted = not $quoted;
    }
    return @tokens;
}

my %apply_face = (
    I => sub {
        return sprintf('<span class="emphasis">%s</span>', shift);
    },
    B => sub {
        return sprintf('<span class="bold">%s</span>', shift);
    },
    R => sub {
        return shift;
    }
);

sub parsefile {
    my ($filename)=@_;
    my $lineno;
    my $br_lastp;
    while(<$InFH>) {
        $lineno++;
        my $in = $_;
        my $out;

  #     print $debugFH "DEBUG IN: $_";

        $in =~ s/[\r\n]//g if(!$pre); # tear off newlines

        $in =~ s/\r//g; # remove carriage return always

        # substitue all '\-' sequences with '-'.
        # '\-' might be in the source file to specify a literal
        # '-' rather than a hyphen.
        $in =~ s/\\-/-/g;

        if($in =~ /^\'\\\"/) {
            # ignore this kind of weird comment
            $out ="";
        }
        elsif($in =~ /^\.([^ \t\n]*)(.*)/) {
            # this is a line starting with a dot, that means it is special
            my ($keyword, $rest) = ($1, $2);
            $out = "";
            
            # cut off initial spaces
            $rest =~ s/^\s+//;
            
            if ( $keyword eq q(\\") ) {
                # this is a comment, skip this line
            }
            elsif ( $keyword eq "TH" ) {
                # man page header:
                # curl 1 "22 Oct 2003" "Curl 7.10.8" "Curl Manual"

                # Treat pages that have "*(Dt":
                # .TH IDENT 1 \*(Dt GNU

                $rest =~ s,\Q\\*(Dt,,g;

                # Delete backslashes

                $rest =~ s,\\,,g;

                # Delete old RCS tags
                # .TH saidar 1 $Date:\ 2006/11/30\ 23:42:42\ $ i\-scream

                $rest =~ s,\$Date:\s+(.*?)\s+\$,$1,g;

                # NAME SECTION DATE VERSION MANUAL
                # section can be: 1 or 3C

                if ( $rest =~ /(\S+)\s+\"?(\d+\S?)\"?\s+\"([^\"]*)\" \"([^\"]*)\"(\"([^\"]*)\")?/ ) {
                    # strict matching only so far
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                    $manpage{'date'}    = $3;
                    $manpage{'version'} = $4;
                    $manpage{'manual'}  = $6;
                }
                # .TH html2text 1 2008-09-20 HH:MM:SS
                elsif ( $rest =~  m, (\S+) \s+ \"?(\d+\S?)\"? \s+ \"?([ \d:/-]+)\"? \s* (.*) ,x )
                {
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                    $manpage{'date'}    = $3;
                    $manpage{'manual'}  = $4;
                }
                # .TH program 1 description
                elsif ( $rest =~ /(\S+) \s+ \"?(\d+\S?)\"? \s+ (.+)/x )
                {
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                    $manpage{'manual'}  = $3;
                }
                # .TH program 1
                elsif ( $rest =~ /(\S+) \s+ \"?(\d+\S?)\"? /x )
                {
                    $manpage{'name'}    = $1;
                    $manpage{'section'} = $2;
                }
            }
            elsif($keyword =~ /^S[HS]$/i) {
                # SS is treated the same as SH
                # Section Header
                showp(@p);
                @p="";
                if($pre) {
                    push @out, "</pre>\n";
                    $pre = 0;
                }

                my $name = text2name($rest);
                $anchor{$name}=1;

                $rest =~ s/\"//g; # cut off quotes
                $rest = do_encode($rest);
                $rest = ucfirst(lc($rest)) if(!$leavecase);
                $out = "<a name=\"$name\"></a><h2 class=\"nroffsh\">$rest</h2>";
                $indentlevel=0;
                $within_tp=0;
            }
            elsif($keyword =~ /^B$/i) {
                $rest =~ s/\"//g; # cut off quotes
                $rest = do_encode($rest);

                # This is pretty lame, but since a .B section and itself
                # contain a \fI[blabla]\fP section, we cut off all the \f
                # occurances within the .B text to make it easier for ourself.
                $rest =~ s/\\f[IPR]//g;

                push @p, "<span class=\"bold\">$rest</span> ";
                if($pre) {
                    push @p, "\n";
                }
            }
            elsif($keyword =~ /^I$/i) {
                $rest =~ s/\"//g; # cut off quotes
                $rest = do_encode($rest);
                push @p, "<span class=\"emphasis\">$rest</span> ";
            }
            elsif($keyword =~ /^RS$/i) {
                # the start of another indent-level. for inlined tables
                # within an "IP"
                showp(@p);
                @p="";
                $indentlevel++;
            }
            elsif($keyword =~ /^RE$/i) {
                # end of the RS section
                showp(@p);
                @p="";
                $indentlevel--;
            }
            elsif($keyword =~ /^NF$/i) {
                # We let nf start a <pre> section .nf = no-fill, you use for
                # graphs or text that you don't want spaces to be ignored

                showp(@p);
                @p="";
                if (!$pre) {
                    # avoid nesting pre sections
                    push @out, "<pre class=\"level$indentlevel\">\n";
                    $pre=1
                }
            }
            elsif($keyword =~ /^TP$/i) {
                # Used within an "RS" section to make a new line. The first
                # TP as a column indicator, but we decide to do that
                # controlling in the CSS instead.
                $within_tp=1;
                showp(@p);
                @p="";                
            }
            elsif($keyword =~ /^IP$/i) {
                # start of a new paragraph coming up
                showp(@p);
                @p="";

                my $name= text2name($rest);
                $anchor{$name}=1;

                $rest =~ s/\"//g; # cut off quotes
		$rest =~ s/\\f[IBPR]//g; # Un-format it (Use CSS on nroffip class instead)
                $rest = do_encode($rest);
                
                $indentlevel-- if ($indentlevel);
                push @p, "<a name=\"$name\"></a><span class=\"nroffip\">$rest</span> ";
                # make this a single-line title
                showp(@p);
                @p="";
                $indentlevel++;
                $within_tp=0;
            }
            elsif($keyword =~ /^ad$/i) {
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^sp$/i) {
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^lp$/i) {
                # marks end of a paragraph
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^pp$/i) {
                # PP ends a TP section, but some TP sections don't use it
                # Often used to separate paragraphs
                $within_tp=0;
                showp(@p);
                @p="";
            }
            elsif($keyword =~ /^fi$/i) {
                # .fi = fill-in, extra space will be ignored and text that is
                # entered like this, the fill-in command will continue until
                # you enter a .nf command and vice-versa

                showp(@p);
                @p="";
                if($pre) {
                    # if this is the end of a .nf (<pre>) section
                    push @out, "</pre>\n";
                    $pre=0; # disable pre again
                }

            }
            elsif($keyword =~ /^(de|ft|\.)$/i) {
                # no idea what these do!
            }
            elsif($keyword =~ /^so$/i) {
                # This keyword refers to a different man page, named in the
                # $rest.
                # Recurse and read that instead if possible, otherwise output
                # generic message saying where to find the information.
                if ($mandir) {
                    for my $d (@mandirs) {
                        my $sofilename = $d . "/" . basename($rest);
                        if (-r $sofilename) {
                            # Recurse!
                            close($InFH);
                            open($InFH, "<".$sofilename);
                            parsefile($sofilename);
                            return;
                        }
                    }
                }

                # If we're here, we didn't recurse.
                push @out, "See the $rest man page.\n";
            }
	    elsif($keyword eq "br") {
                push @p, "<br>";
	    }
            elsif($keyword =~ /^([RIB])([RIB])$/) {
                # Alternating font faces
                my @ff = ($1, $2);

                if($rest) {
                    my @all = split_quoted($rest);
                    my $fi = 0;
                    my @applied;
                    for(@all) {
                        my $encoded = do_encode($_);
                        push @applied, $apply_face{$ff[$fi]}->($encoded);
                        $fi = $fi ? 0 : 1;
                    }
                    push @applied, " ";
                    push @p, join("", @applied);
                }
            }
            elsif ($keyword =~ /^ti/i) {
                push @p, "&nbsp;&nbsp;&nbsp;&nbsp;";
            }
            else {
                showp(@p);
                @p="";
                printf $debugFH "Error: unknown keyword \"$keyword\" on %s:$lineno\n",
                $filename?$filename:"<unknown>";
            }
        }
        else {
            # text line, decode \-stuff
            my $txt = $in;

            $txt =~ s/\\&//g;
	    # Must come after previous substitution or it could munge some entities.
	    $txt = do_encode($txt);
            $txt =~ s/\\-/-/g;
            $txt =~ s/\\ /&nbsp\;/g;
            $txt =~ s/\\(?:'|&#39;)/&acute\;/g;
            $txt =~ s/\\\(co/&copy\;/g;
            $txt =~ s/\\\(aq/&apos\;/g;
            $txt =~ s/\\\(cq/&apos\;/g;
            $txt =~ s/\\\(dq/&quot\;/g;

            # \e is the "escape character", defaults to backslash
            $txt =~ s/\\e/&bsol;/g;

            # \/ and \, are "italic corrections" that we can just remove
            $txt =~ s/\\[,\/]//g;

            $txt = handle_italic_bold $txt;

            # replace backslash [something] with just [something]
            $txt =~ s/\\(.)/$1/g;

            # convert remaining backslashes to &bsol;
            $txt =~ s/\\/&bsol\;/g;
            if(($txt =~ /^[ \t\r\n]*$/) && (!$pre)) {
                # no contents, marks end of a paragraph
                showp(@p);
                @p="";
            }
            else {
                $txt =~ s/^ /\&nbsp\;/g;
                $txt =~ s/^\n/\&nbsp\;\n/ if($pre);
                push @p, "$txt";
                push @p, " " if(!$pre);
            }
            $out ="";
        }

        if($out) {
            push @out, $out;
   #         print $debugFH "DEBUG OUT: $out\n";
        }
        else {
   #         print $debugFH "DEBUG OUT: [withheld]\n";
        }
    }
    showp(@p);
}

if($filename) {
    # if specified, otherwise stick with stdin
    open($InFH, "<".$filename);
}

parsefile($filename);

my @conv = linkfile();

my $title=sprintf("%s man page",
                  $manpage{'name'}?$manpage{'name'}:"secret");

if($standalone) {
    print $OutFH <<MOO
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
 "http://www.w3.org/TR/html4/loose.dtd">
<html><head>
<title>$title</title>
<meta name="generator" content="roffit">
MOO
    ;
    defaultcss();
    print "</head><body>\n";
}
else {
    print "<!-- generated with roffit -->\n";
}

print $OutFH @conv;
print $OutFH <<ROFFIT
<p class="roffit">
 This HTML page was made with <a href="https://daniel.haxx.se/projects/roffit/">roffit</a>.
ROFFIT
    ;

if($standalone) {
    print "</body></html>\n";
}