Listing 1. sman-index-prog.pl converts man pages to XML for indexing. #!/usr/bin/perl -w use strict; use File::Find; my ($cnt, @files) = (0, get_man_files()); warn scalar @files, " man pages to index...\n"; for my $f (@files) { warn "processing $cnt\n" unless ++$cnt % 20; my ($hashref) = parse_man($f); my $xml = make_xml($hashref); my $size = length $xml; # NOTE: Fails if UTF print "Path-Name: $f\n", "Document-Type: XML*\n", "Content-Length: $size\n\n", $xml; } sub get_man_files { # get english manfiles my @files; chomp(my $man_path = $ENV{MANPATH} || `manpath` || '/usr/share/man'); find( sub { my $n = $File::Find::name; push @files, $n if -f $n && $n =~ m!man/man.*\.! }, split /:/, $man_path ); return @files; } sub make_xml { # output xml version of hash my ($metas) = @_; # escapes vals as side-effect my $xml = join ("\n", map { "<$_>" . escape($metas->{$_}) . "" } keys %$metas); my $pre = qq{\n}; return qq{$pre$xml\n}; } sub escape { # modifies scalar you pass! return "" unless defined($_[0]); s/&/&/g, s//>/g for $_[0]; return $_[0]; } sub parse_man { # this is the bulk my ($file) = @_; my ($manpage, $cur_content) = ('', ''); my ($cur_section,%h) = qw(NOSECTION); open FH, "man $file | col -b |" or die "Failed to run man: $!"; my ($line1, $lineM) = (scalar() || "", ""); while ( ) { # parse manpage into sections $line1 = $_ if $line1 =~ /^\s*$/; $manpage .= $lineM = $_ unless /^\s*$/; if (s/^(\w(\s|\w)+)// || s/^\s*(NAME)/$1/i){ chomp( my $sec = $1 ); # section title $h{$cur_section} .= $cur_content; $cur_content = ""; $cur_section = $sec; # new section name } $cur_content .= $_ unless /^\s*$/; } $h{$cur_section} .= $cur_content; # examine NAME, HEADer, FOOTer, (and # maybe the filename too). close(FH) or die "Failed close on pipe to man"; @h{qw(A_AHEAD A_BFOOT)} = ($line1, $lineM); my ($mn, $ms, $md) = ("","","",""); # NAME mn, DESCRIPTION md, & SECTION ms for(sort keys(%h)) { # A_AHEAD & A_BFOOT first my ($k, $v) = ($_, $h{$_}); # copy key&val if (/^A_(AHEAD|BFOOT)$/) { #get sec or cmd # look for the 'section' in ()'s if ($v =~ /\(([^)]+)\)\s*$/) {$ms||= $1;} } elsif($k =~ s/^\s*(NOSECTION|NAME)\s*//) { my $namestr = $v || $k; # 'cmd - a desc' if ($namestr =~ /(\S.*)\s+--?\s*(.*)/) { $mn ||= $1 || ""; $md ||= $2 || ""; } else { # that regex could fail. $md ||= $namestr || $v; } } } if (!$ms && $file =~ m!/man/man([^/]*)/!) { $ms = $1; # get sec from path if not found } ($mn = $file) =~ s!(^.*/)|(\.gz$)!! unless $mn; my %metas; @metas{qw(swishtitle sec desc page)} = ($mn, $ms, $md, $manpage); return ( \%metas ); # return ref to 5-key hash. }