#!/usr/bin/perl -w

# Listing 10: best-search.pl. This program performs a Web search through
# HTML and text files while ignoring directories containing .nosearch
# files, allowing for three types of searches ("and", "or", and phrase),
# and ignoring the contents of HTML tags.
#
# CGI parameters:
#   pattern - the word(s) or phrase to look for (required)
#   type    - "and", "or", or "phrase" (required)

use strict;
use diagnostics;

use File::Find;
use CGI;
use CGI::Carp qw|fatalsToBrowser|;

# Which directory should start the search?
my $search_root = "/usr/local/apache/htdocs";

# What is the beginning of every URL?
my $url_origin = "http://localhost";

# What directories should we avoid?  Keyed by directory path; the value
# is 1 (skip) or 0 (search) and is cached the first time find_matches
# sees an entry inside that directory.
my %ignore_directory = ();

# Create an instance of CGI
my $query = CGI->new;

# Send a MIME header
print $query->header("text/html");

# Get the text pattern for which to search
my $pattern = $query->param("pattern");

# What kind of search will we run?  A missing parameter becomes the
# empty string so that later string comparisons never see undef.
my $search_type = $query->param("type");
$search_type = "" unless defined $search_type;

# How many matches did we find?
my $total_matches = 0;

# Make sure that $pattern holds at least one non-whitespace character.
# (A plain truth test would wrongly reject the pattern "0", and a
# whitespace-only pattern would produce an empty word list.)
unless (defined $pattern and $pattern =~ m/\S/) {
    print $query->start_html(-title => "No pattern named");
    print "<P>You must enter a pattern!</P>";
    print $query->end_html;
    exit;
}

# Break the pattern into words.  split ' ' discards leading whitespace,
# so we never end up with an empty first word.
my @words = split ' ', $pattern;

# Compile the regular expressions once, up front, instead of once per
# file.  User input is always passed through quotemeta, so characters
# such as "(" or "*" are matched literally rather than treated as regex
# syntax.
my @matchers = ();

if ($search_type eq "phrase") {
    # Words of the phrase may be separated by any run of whitespace
    # (including newlines) in the document.
    my $phrase = join '\s+', map { quotemeta($_) } @words;
    @matchers = (qr/\b$phrase\b/i);
}
elsif ($search_type eq "and" or $search_type eq "or") {
    @matchers = map { qr/\b\Q$_\E\b/i } @words;
}
else {
    # Reject an unknown search type once, rather than warning for
    # every file that File::Find visits.
    warn qq{Unknown search_type "$search_type"!\n};
    print $query->start_html(-title => "Unknown search type");
    print qq{<P>The search type must be "and", "or", or "phrase".</P>};
    print $query->end_html;
    exit;
}

# Start the HTML output.  Everything that came from the request is
# HTML-escaped before it is echoed back to the browser.
my $safe_type    = $query->escapeHTML($search_type);
my $safe_pattern = $query->escapeHTML($pattern);

print $query->start_html(-title => "Search results");
print qq{<P>Results of a $safe_type search for "$safe_pattern":</P>\n };

# Start an unordered list
print "<ul>\n";

# Walk the document tree; find_matches prints one list item per match
find(\&find_matches, $search_root);

# End the unordered list
print "</ul>\n";

# Get the pluralization right
my $match_or_matches = ($total_matches == 1 ? "match" : "matches");

print "<P>$total_matches $match_or_matches found.</P>\n";
print $query->end_html;

# ------------------------------------------------------------
# Subroutine that searches through files for matches.
#
# Called by File::Find once per directory entry, with $_ set to the
# entry's name, $File::Find::dir set to the directory containing it,
# and the current working directory changed to that directory.
# Prints a list item linking to each matching file and increments
# $total_matches.  Returns nothing.
sub find_matches {

    my $dir = $File::Find::dir;

    # Decide (once per directory) whether this directory is ignorable:
    # it is if its parent was ignorable, if it contains .nosearch, or
    # if there is a .nosearch one directory up.  File::Find visits a
    # directory's own entry before the directory's contents, so the
    # parent's verdict is already cached when we get here.
    unless (exists $ignore_directory{$dir}) {
        (my $parent = $dir) =~ s{/[^/]*\z}{};

        $ignore_directory{$dir} =
            ($ignore_directory{$parent}
             or (-e ".nosearch")
             or (-e "../.nosearch")) ? 1 : 0;
    }

    # Return if this directory is ignorable -- including on the very
    # call that discovered it
    return if $ignore_directory{$dir};

    # Get the filename
    my $filename = $_;

    # Make sure that this is an HTML or text file, not a directory
    # that happens to carry one of those suffixes
    return unless ($filename =~ m/\.html?\z/i or $filename =~ m/\.te?xt\z/i);
    return unless -f $filename;

    # Open the file
    my $fh;
    unless (open $fh, '<', $filename) {
        warn qq{Unable to open "$filename": $! };
        return;
    }

    # Slurp the whole file; $/ is only undefined inside this block
    my $contents = do { local $/; <$fh> };
    close $fh;

    # An empty file reads as undef in slurp mode
    $contents = "" unless defined $contents;

    # Remove HTML tags, including tags that span several lines
    $contents =~ s/<[^>]+>//g;

    return unless contents_match($contents);

    # Turn the filename into a viable URL by swapping the filesystem
    # prefix for the URL prefix (anchored, and matched literally)
    my $url = "$dir/$filename";
    $url =~ s/\A\Q$search_root\E/$url_origin/;

    # Print the link, escaping both the attribute and the link text
    my $safe_url  = $query->escapeHTML($url);
    my $safe_name = $query->escapeHTML($filename);

    print qq{<li><a href="$safe_url">$safe_name</a></li>\n};
    $total_matches++;

    return;
}

# ------------------------------------------------------------
# Returns 1 if $contents satisfies the current search, 0 otherwise.
#
#   "or"     - at least one of @matchers must match
#   "and"    - every one of @matchers must match
#   "phrase" - @matchers holds a single regex, which must match
sub contents_match {
    my ($contents) = @_;

    if ($search_type eq "or") {
        # Finding even one match is sufficient
        foreach my $matcher (@matchers) {
            return 1 if $contents =~ $matcher;
        }
        return 0;
    }

    # "and" and "phrase": a single miss disqualifies the document
    foreach my $matcher (@matchers) {
        return 0 unless $contents =~ $matcher;
    }
    return 1;
}