#!/usr/bin/perl -w

# Copyright © 2001 Colin Watson
# Copyright © 2008 Jordà Polo
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, you can find it on the World Wide
# Web at http://www.gnu.org/copyleft/gpl.html, or write to the Free
# Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
# MA 02110-1301, USA.

# Should be run from the top level of the Lintian source tree or with
# LINTIAN_ROOT set appropriately.  You need copies of all the relevant manuals
# installed in the standard places locally (packages debian-policy,
# developers-reference, doc-base, python, lintian, menu and libpkg-guide).

use strict;
use warnings;

use File::Basename;
use POSIX qw(strftime);

BEGIN {
    my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'};
    if (not $LINTIAN_ROOT) {
        use Cwd ();
        $ENV{'LINTIAN_ROOT'} = $LINTIAN_ROOT = Cwd::cwd();
    } else {
        chdir $LINTIAN_ROOT or die "Cannot chdir to $LINTIAN_ROOT: $!\n";
    }
}

my $LINTIAN_ROOT = $ENV{'LINTIAN_ROOT'};

# For each manual, we need:
#  * Location of the manual index on the local filesystem
#  * Base URL for the eventual target of the reference (or empty string if no
#    public URL is available)
#  * Regex to match the possible references
#  * Mapping from regex fields to reference fields (array of arrays of
#    keywords: url, section title; the position of each keyword in the array
#    defines which is its corresponding group in the regex)
#
# Optionally, if there are subsections that aren't available in the index, an
# additional regex can be defined to match possible references on other pages
# of the manual.

my $title_re = qr/<title\s?>(.+?)<\/title\s?>/i;
my $link_re = qr/<link href="(.+?)" rel="[\w]+" title="([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)">/;
my $index_re = qr/<a href="(.+?)">([A-Z]|[A-Z]?[\d\.]+?)\.?\s+([\w\s[:punct:]]+?)<\/a>/;
my $fields = [ [ 'url' ], [ 'section' ], [ 'title' ] ];
my $dbk_index_re = qr/([\d.]+?)\.\s+<a\s*href="(.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i;
my $dbk_fields = [ [ 'section' ], [ 'url' ], [ 'title' ] ];

my %manuals = (
    'policy' => [
        '/usr/share/doc/debian-policy/policy.html/index.html',
        'http://www.debian.org/doc/debian-policy/',
        $link_re, $fields
    ],
    'menu-policy' => [
        '/usr/share/doc/debian-policy/menu-policy.html/index.html',
        'http://www.debian.org/doc/packaging-manuals/menu-policy/',
        $link_re, $fields
    ],
    'perl-policy' => [
        '/usr/share/doc/debian-policy/perl-policy.html/index.html',
        'http://www.debian.org/doc/packaging-manuals/perl-policy/',
        $link_re, $fields
    ],
    'python-policy' => [
        '/usr/share/doc/python/python-policy.html/index.html',
        'http://www.debian.org/doc/packaging-manuals/python-policy/',
        $link_re, $fields
    ],
    'java-policy' => [
        '/usr/share/doc/java-common/debian-java-policy/index.html',
        'http://www.debian.org/doc/packaging-manuals/java-policy/',
        $dbk_index_re, $dbk_fields
    ],
    'vim-policy' => [
        '/usr/share/doc/vim-doc/vim-policy.html/index.html',
        'http://pkg-vim.alioth.debian.org/vim-policy.html/',
        $dbk_index_re, $dbk_fields
    ],
    'lintian' => [
        '/usr/share/doc/lintian/lintian.html/index.html',
        'http://lintian.debian.org/manual/',
        $dbk_index_re, $dbk_fields
    ],
    'devref' => [
        '/usr/share/doc/developers-reference/index.html',
        'http://www.debian.org/doc/developers-reference/',
        $index_re, $fields,
        qr/<h[45] class="title"><a id="(.+?)"><\/a>([\d\.]+?)\.? ([\w\s[:punct:]]+?)<\/h[45]>/
    ],
    'menu' => [
        '/usr/share/doc/menu/html/index.html',
        'http://www.debian.org/doc/packaging-manuals/menu.html/',
        $index_re, $fields
    ],
    'doc-base' => [
        '/usr/share/doc/doc-base/doc-base.html/index.html', '',
        $index_re, $fields
    ],
    'debconf-spec' => [
        '/usr/share/doc/debian-policy/debconf_specification.html',
        'http://www.debian.org/doc/packaging-manuals/debconf_specification.html',
        qr/<a href="(#.+?)">([\w\s[:punct:]]+?)<\/a>/,
        [ [ 'section', 'url' ], [ 'title' ] ]
    ],
    'fhs' => [
        '/usr/share/doc/debian-policy/fhs/fhs-2.3.html',
        'http://www.pathname.com/fhs/pub/fhs-2.3.html',
        qr/<a\s*href="(#.+?)"\s*>([\w\s[:punct:]]+?)<\/a\s*>/i,
        [ [ 'section', 'url' ], [ 'title' ] ]
    ],
);

# extract_refs -- Extract manual references from HTML file.
#
# This function takes the output file handle, the path to the page, and the
# regex to match, and prints references to stdout. The second argument is used
# to decide whether to look for the title (0) or not (1). It returns a list of
# pages linked by the extracted references.
sub extract_refs {
    my ($fh, $manual, $title, $page, $url, $ref_re, $fields) = @_;
    my @linked_pages = ();

    open(PAGE, '<', $page) or die "Couldn't open $page: $!";

    # Read until there are 2 newlines. This hack is needed since some lines in
    # the Developer's Reference are cut in the middle of <a>...</a>.
    local $/ = "\n\n";

    while (<PAGE>) {
        if (not $title and m/$title_re/) {
            $title = 1;
            my @out = ( $manual, '', $1, $url );
            print $fh join('::', @out) . "\n";
        }

        while (m/$ref_re/g) {
            my %ref;
            for (my $i = 0; $i < scalar @{$fields}; $i++) {
                foreach my $c (@{$fields->[$i]}) {
                    my $v = $i + 1;
                    $ref{$c} = eval '$' . $v;
                }
            }

            if ($ref{url} =~ m/^(.+?\.html)#?/i) {
                push(@linked_pages, $1) if not grep(m/$1/, @linked_pages);
            }

            # If the extracted URL part doesn't look like a URL, assume it is
            # an anchor and convert to URL accordingly.
            if ($ref{url} and not $ref{url} =~ m/(?:#|\.html$)/i) {
                $ref{url} = basename($page) . "#$ref{url}";
            }

            $ref{section} =~ s/^\#(.+)$/\L$1/;
            $ref{title} =~ s/\s+/ /g;
            $ref{title} =~ s,<span[^>]*>(.*?)</span ?>,$1,ig;
            $ref{title} =~ s,<code[^>]*>(.*?)</code ?>,<code>$1</code>,ig;
            $ref{url} = "$url$ref{url}";
            $ref{url} = '' if not $url;

            my @out = ( $manual, $ref{section}, $ref{title}, $ref{url} );
            print $fh join('::', @out) . "\n";
        }
    }

    close(PAGE);

    return @linked_pages;
}

# Create a new reference file.
open(OUT, '>', 'data/output/manual-references.new')
    or die "Cannot create data/output/manual-references.new: $!\n";
my $date = strftime('%Y-%m-%d', localtime);
print OUT <<"HEADER";
# Data about titles, sections, and URLs of manuals, used to expand references
# in tag descriptions and add links for HTML output.  Each line of this file
# has four fields separated by double colons:
#
#     <manual> :: <section> :: <title> :: <url>
#
# If <section> is empty, that line specifies the title and URL for the whole
# manual.  If <url> is empty, that manual is not available on the web.
#
# Last updated: $date

HEADER

for my $manual (sort keys %manuals) {
    my ($index, $url, $ref_re, $fields, $sub_re) = @{$manuals{$manual}};

    if (not -f $index) {
        die "Manual '$manual' not installed, aborting.\n";
    }

    # Extract references from the index.
    my @subpages
        = extract_refs(\*OUT, $manual, 0, $index, $url, $ref_re, $fields);

    # Extract additional subsection references if not available in the index.
    next if not $sub_re;
    foreach my $pagename (@subpages) {
        my $page = dirname($index) . "/$pagename";
        extract_refs(\*OUT, $manual, 1, $page, $url, $sub_re, $fields);
    }
}

# Replace the old reference file.
close OUT or die "Cannot flush data/output/manual-references.new: $!\n";
rename('data/output/manual-references.new',
       'data/output/manual-references')
    or die "Cannot rename data/output/manual-references: $!\n";

# Local Variables:
# indent-tabs-mode: nil
# cperl-indent-level: 4
# End:
# vim: syntax=perl sw=4 sts=4 sr et
