# wordsinpara11 - iterates over all Balzac writings included in the Gutenberg Project # and lists paragraphs that have a lot of words used to describe the physical # appearance of characters, either their body or their clothes. # # This supplants wordsinpara9, this version has a more linear design, # it builds the regex in a separate routine prior to matching, then collects a list # of files to search in, then iterating over these files, it opens each file and # reads it into a string in a separate routine, and then passes a reference to # this stringto the actual search routine. # this version changes the regex search loop in extract_paragraphs to make # it simpler so that it can also be used to extract all paragraphs whether # they have keywords or not. The loop creates one record per paragraph # containing the within-file begin and end offsets for each paragraph along # with a list of all keywords found in the paragraph. # The decision on whether to extract the paragraph or not is deferred to the # next block, in which all paragraphs with keyword counts within the chosen # range are extracted. use MakeRegex; use Data::Dumper; $Data::Dumper::Indent = 0; #print "\n\nBalzacs Interiors\n\n"; print "\n\nDickens Food\n\n"; $upperlimit = 10000; $lowerlimit = 4; print "$lowerlimit < Keywords-In-Text < $upperlimit\n\n"; #$dir = "c:\\Balzac\\"; $dir = "c:\\dickens\\"; #print "directory: $dir\n\n"; #$titles_file = "balzactitles.tit"; $titles_file = "dickenstitles.tit"; $titles = init_titles($titles_file,$dir); #print "titles: ",Dumper($titles),"\n"; #******************************************* # keyword stuff #$keywords_file = "interiors.wrd"; $keywords_file = "foodall.wrd"; $regex = build_regex($keywords_file,$dir); #$regex_actual = '\n\s*\n'; # get all paragraphs #$regex = \$regex_actual; print "regex: $$regex\n"; #************************************************* $bytesread_tot = 0; # make a list of files to search @tosearch = (); opendir(DIR,$dir) or die "Can't open $dir: $! \n"; @files = readdir(DIR); foreach $file (@files) { if (-f "$dir\\$file") { $tosearch = $dir . $file; push(@tosearch,$tosearch); } } closedir(DIR); print "tosearch: ",Dumper(\@tosearch),"\n"; # search and collect info foreach $file (@tosearch) { $bigstring = file2string($file); $paragraphs = extract_paragraphs($regex,$bigstring,$upperlimit,$lowerlimit,$titles); foreach $paragraph (@$paragraphs) { #print Dumper($paragraph), "\n\n"; print_paragraph($paragraph,$titles); } } exit; #********************************************** # subroutines begin here sub file2string { my($text_file) = @_; my($bigstring,$bytesread); open(INFILE,$text_file) or die "Can't open file $text_file\n"; $bigstring = ""; $bytesread = read(INFILE,$bigstring,3000000); print "bytes read: $bytesread\n"; $bytesread_tot += $bytesread; return \$bigstring; } sub extract_paragraphs { my($regex_ref,$bigstring_ref,$upperlimit,$lowerlimit,$titles) = @_; my($infile,$bytesread,$toregex,$regex,@conc); my($cur,$prev,$pos_prev,$pos_cur,@akeyword,@keywords,$item); my($para,$offset,$len,$para_text,@record,$wordcount,$i); my($record); # iterate over paras in string/file local($bigstring) = $$bigstring_ref; $regex = $$regex_ref; @conc = (); $prev = ''; $pos_prev = 0; $bigstring =~ /($regex)/gi; $cur = $1; @para = (); @words = (); $pos_prev = 0; while ($pos_cur = pos($bigstring)) { trimword($cur); if (isparagraph($cur)) { push(@conc,[$pos_prev,$pos_cur,@words]); $pos_prev = $pos_cur; @para = (); @words = (); } elsif (isword($cur)) { push(@words,$cur); } $bigstring =~ /($regex)/gi; $cur = $1; } #print "conc: ",Dumper(\@conc),"\n"; # if para has number of keywords in range, record it. @keywords = (); foreach $para (@conc) { my($wordcount) = scalar(@$para) - 2; if (($wordcount >= $lowerlimit) && ($wordcount <= $upperlimit)) { $record = record_para($para); push(@keywords,[@$record]); } } return \@keywords; } sub record_para { my($para) = @_; my($offset,$len,@record,$para_text,$wordcount); $offset = $para->[0]; $len = $para->[1] - $para->[0]; $wordcount = scalar(@$para) - 2; $para_text = substr($bigstring,$offset,$len); @record = ($wordcount,$text_file,$offset,$len); for ($i = 2; $i < $wordcount + 2; $i++) { push(@record,$para->[$i]); } push(@record,$para_text); return \@record; } sub stringfile2regex { # turn strings in a text file into a regular expression my($infile) = @_; #print "infile: $infile\n"; open(INFILE,$infile) or die "Can't open $infile"; my(@list) = (); while($line = ) { chlomp($line); push(@list,$line); } #print "\nkeywords: ",Dumper(\@list),"\n"; my($regex) = MakeRegex::make_regex(@list); #print "\nkeywords-regex: $regex\n\n"; return $regex; } sub isnumber { return($_[0] =~ /^\d+$/); } sub selectit { my($cur,$after,$conc) = @_; my($topush) = [$cur,$after]; push(@$conc,$topush); } sub trimword { #remove excess characters before or after a word chomp($_[0]); $_[0] =~ s/^\W+//; $_[0] =~ s/\W+$//; } sub isword { if (length($_[0]) > 1) { return 1; } else { return 0; }; } sub isparagraph { if (length($_[0]) <= 1) { return 1; } else { return 0; }; } sub chlomp { #remove excess white space and the beginning and end of strings, # and newlines at the end chomp($_[0]); $_[0] =~ s/^\s+//; $_[0] =~ s/\s+$//; } sub init_titles { my($filename,$dir) = @_; my($path) = $dir . $filename; my($line,%file2title,$file,$title); open(INFILE,$path) or die "Can't open $path"; while ($line = ) { #print "$line\n"; $line =~ /^\s*(\w+.txt)\s*(.+?)\[.*$/; $file = $1; $title = $2; chlomp($title); #print "file: $file title: $title \n"; $file2title{$file} = $title; } return \%file2title; } sub build_regex { my($keywords_file,$dir) = @_; my($toregex,$regex); # build regex out of strings in string file $toregex = $dir . $keywords_file; $regex = stringfile2regex($toregex); # add to regex to handle the space or punctuation preceding and following words # and to handle blank lines that indicate para breaks $regex = '\W+' . $regex . '\W+|\n\s*\n'; return \$regex; } sub print_paragraph { my($para,$titles) = @_; my($i,$text_pos); my($filename) = $para->[1]; chlomp($filename); $filename = lc($filename); print "From: $titles->{$filename}\n"; print "(keyword count: $para->[0], file: $para->[1], offset: $para->[2], length: $para->[3])\n"; $text_pos = scalar(@$para) - 1; print "[keywords: "; for ($i = 4; $i < $text_pos; $i++ ) { print "$para->[$i],"; } print "]\n\n"; print "$para->[$text_pos]\n\n"; }