Index: tools/native_search/NativeTWikiSearch.pm =================================================================== --- tools/native_search/NativeTWikiSearch.pm (revision 0) +++ tools/native_search/NativeTWikiSearch.pm (revision 0) @@ -0,0 +1,13 @@ +# Copyright (C) 2007 WikiRing http://wikiring.com All Rights Reserved +# Author: Crawford Currie +# Perl interface to NativeTWikiSearch xs module +package NativeTWikiSearch; + +require Exporter; +require DynaLoader; +@ISA = qw(Exporter DynaLoader); +@EXPORT = qw( cgrep ); + +bootstrap NativeTWikiSearch; + +1; Index: tools/native_search/Makefile.PL =================================================================== --- tools/native_search/Makefile.PL (revision 0) +++ tools/native_search/Makefile.PL (revision 0) @@ -0,0 +1,7 @@ +# Makefile for NativeTWikiSearch module +use ExtUtils::MakeMaker; +WriteMakefile( + 'NAME' => 'NativeTWikiSearch', + 'OBJECT' => 'NativeTWikiSearch.o', + 'LIBS' => [ "-lpcreposix" ], +); Index: tools/native_search/NativeTWikiSearch.xs =================================================================== --- tools/native_search/NativeTWikiSearch.xs (revision 0) +++ tools/native_search/NativeTWikiSearch.xs (revision 0) @@ -0,0 +1,210 @@ +/* Copyright (C) 2007 WikiRing http://wikiring.com All Rights Reserved + * Author: Crawford Currie + * Fast grep function designed for use from Perl. Does not suffer from + * limitations of `grep` viz. cost of spawning a subprocess, and + * limits on command-line length. + */ +#include "EXTERN.h" +#include "perl.h" +#include "XSUB.h" +#include + +#define LINEBUFSIZE 4192 +#define ERRBUFSIZE 256 +#define MATCHBUFSIZE 1 + +/* Copy the static match buffer into heap memory, resizing as required */ +char** _backup(int mc, char** m, char** r) { + int curlen = 0; + char** newR; + if (!mc) { + return r; + } + if (r) { + while (r[curlen]) { + curlen++; + } + } + newR = (char**)safemalloc(sizeof(char*) * (curlen + mc + 1)); + if (curlen) { + memcpy(newR, r, sizeof(char*) * curlen); + } + memcpy(&newR[curlen], m, sizeof(char*) * mc); + newR[curlen + mc] = (char*)NULL; + if (r) { + safefree(r); + } + + return newR; +} + +/* Do a grep. Arguments are provided in argv, options first, then the + * pattern, then the file names. -i (case insensitive) and -l (report + * matching file names only) are the only options supported. */ +char** cgrep(char** argv) { + char** argptr = argv; + int reflags = REG_NOSUB; + int justFiles = 0; + FILE* f; + regex_t pattern; + regmatch_t match; + char linebuf[LINEBUFSIZE]; + char* matchCache[MATCHBUFSIZE]; + int matchCacheSize = 0; + char** result = (char**)NULL; + int resultSize; + char* fname; + + while (*argptr) { + char* arg = *(argptr++); + if (strcmp(arg, "-i") == 0) { + reflags |= REG_ICASE; + safefree(arg); + } else if (strcmp(arg, "-l") == 0) { + justFiles = 1; + safefree(arg); + } else { + int ern; + if (ern = regcomp(&pattern, arg, reflags)) { + char erb[ERRBUFSIZE]; + regerror(ern, &pattern, erb, ERRBUFSIZE); + warn(erb); + safefree(arg); + return (char**)NULL; + } + safefree(arg); + break; + } + } + while (*argptr) { + fname = *(argptr++); + f = fopen(fname, "r"); + if (f) { + int ern; + int mi; + int size; + char ch = 0; + while (ch >= 0) { + int chc = 0; + while ((ch = fgetc(f)) >= 0) { + if (ch == '\n' || chc == LINEBUFSIZE - 1) { + break; /* got a lineful */ + } + linebuf[chc++] = ch; + } + linebuf[chc] = '\0'; + if ((ern = regexec(&pattern, linebuf, 1, &match, 0)) == 0) { + /* Successful match */ + if (matchCacheSize == MATCHBUFSIZE) { + result = _backup(matchCacheSize, matchCache, result); + matchCacheSize = 0; + } + mi = matchCacheSize++; + size = strlen(fname); + if (!justFiles) { + size += 1 + strlen(linebuf); + } + matchCache[mi] = (char*)safemalloc(size + 1); + strcpy(matchCache[mi], fname); + if (!justFiles) { + strcat(matchCache[mi], ":"); + strcat(matchCache[mi], linebuf); + /* go to next matching line in this file */ + } + if (justFiles) { + break; /* go to next file */ + } + } + } + fclose(f); + safefree(fname); + } else { + warn("Open failed"); + } + } + safefree(argv); + result = _backup(matchCacheSize, matchCache, result); + return result; +} + +/* Next two functions taken from + * http://search.cpan.org/src/TBUSCH/Lucene-0.06/Av_CharPtrPtr.cpp + * and modified + */ +char ** XS_unpack_charPtrPtr(SV* rv ) +{ + AV *av; + SV **ssv; + char **s; + int avlen; + int x; + + if( SvROK( rv ) && (SvTYPE(SvRV(rv)) == SVt_PVAV) ) + av = (AV*)SvRV(rv); + else { + warn("XS_unpack_charPtrPtr: rv was not an AV ref"); + return( (char**)NULL ); + } + + /* is it empty? */ + avlen = av_len(av); + if( avlen < 0 ){ + warn("XS_unpack_charPtrPtr: array was empty"); + return( (char**)NULL ); + } + + /* av_len+2 == number of strings, plus 1 for an end-of-array sentinel. + */ + s = (char **)safemalloc( sizeof(char*) * (avlen + 2) ); + if( s == NULL ){ + warn("XS_unpack_charPtrPtr: unable to malloc char**"); + return( (char**)NULL ); + } + for( x = 0; x <= avlen; ++x ){ + ssv = av_fetch( av, x, 0 ); + if( ssv != NULL ){ + if( SvPOK( *ssv ) ){ + s[x] = (char *)safemalloc( SvCUR(*ssv) + 1 ); + if( s[x] == NULL ) + warn("XS_unpack_charPtrPtr: unable to malloc char*"); + else + strcpy( s[x], SvPV( *ssv, PL_na ) ); + } + else + warn("XS_unpack_charPtrPtr: array elem %d was not a string.", x ); + } + else + s[x] = (char*)NULL; + } + s[x] = (char*)NULL; /* sentinel */ + return( s ); +} + +/* Used by the OUTPUT typemap for char**. + * Will convert a C char** to a Perl AV*, freeing the char** and the strings + * stored in it + */ +void XS_pack_charPtrPtr(SV* st, char **s, int n) +{ + AV *av = newAV(); + SV *sv; + char **c; + + for( c = s; *c != NULL; ++c ){ + sv = newSVpv( *c, 0 ); + safefree(*c); + av_push( av, sv ); + } + sv = newSVrv( st, NULL ); /* upgrade stack SV to an RV */ + SvREFCNT_dec( sv ); /* discard */ + SvRV( st ) = (SV*)av; /* make stack RV point at our AV */ + safefree(s); +} + +MODULE = FastSearch PACKAGE = FastSearch + +char** +cgrep(argv) + char ** argv + PREINIT: + int count_charPtrPtr; Index: test/bin/make_big.pl =================================================================== --- test/bin/make_big.pl (revision 0) +++ test/bin/make_big.pl (revision 0) @@ -0,0 +1,119 @@ +#!/usr/bin/perl +# Copyright (C) WikiRing 2007 +# Author: Crawford Currie +# Generate large test data. This script is primarily designed for running +# stand-alone to generate large test data. It requires /usr/lib/dict +# to be installed. + +use strict; + +sub usage { + print STDERR < +Options: + -webs - generate webs (default 1) + -topics - generate additional topics in + each web (default 0) + -size - target number of words to put in + each additional topic (default 501) + -base - base for new web and topic names + (default IncredibleHulk) + +This script must be run while cd'ed to the root directory +of a TWiki install. It checks for data and pub dirs and +refuses to run without them. + +Using as the base for new web names, generates + new webs using _default as the basis. The standard +topics from _default are always included, and will +generate additional topics in each generated web. +Additional topics are named using , are plain text, +and are generated using words picked from +/usr/share/dict. Each new topic will contain words +taken sequentially from the dictionary. Generated topics +have no histories and no meta-data, just text. + +Web names are generated from by appending decimal +numbers to generate unique web names. Topic names are +generated the same way. You are recommended to use unique +web names to make a later rm -r as safe as possible. + +USAGE + exit 1; +} + +unless (-w "data" && -w "pub") { + usage(); +} + +my $dict = '/usr/share/dict/words'; +my $dict_fh; + +# Get $n words from the dictionary +sub getWords { + my ($n) = @_; + local $/ = "\n"; + my $words = ''; + my $word; + if (!$dict_fh) { + open($dict_fh, "<$dict") || die $!; + } + while ($n) { + while ($n && ($word = <$dict_fh>)) { + $words .= $word; + $n--; + } + last unless $n; + close($dict_fh); + open($dict_fh, "<$dict"); + } + + return $words; +} + +my %opts = ( + webs => 1, + topics => 0, + size => 501, + base => 'IncredibleHulk', +); + +while (my $arg = shift @ARGV) { + if ($arg =~ /^-(\w+)$/) { + $opts{$1} = shift @ARGV; + } else { + print STDERR "Unrecognised option $arg"; + usage(); + } +} + +my $newWebs = 0; +my $nextWeb = 0; +while ($newWebs < $opts{webs}) { + while (-e "data/$opts{base}$nextWeb") { + $nextWeb++; + } + my $web = "$opts{base}$nextWeb"; + # Create the web + mkdir("data/$web"); + `cp data/_default/*.txt data/$web`; + my $newTopics = 0; + my $nextTopic = 0; + while ($newTopics < $opts{topics}) { + while (-e "data/$web/$opts{base}$nextTopic.txt") { + $nextTopic++; + } + my $topic = "$opts{base}$nextTopic"; + open(TOPIC, ">data/$web/$topic.txt") || die $!; + my $t = time(); + print TOPIC <{type} || ''; - - # I18N: 'grep' must use locales if needed, - # for case-insensitive searching. See TWiki::setupLocale. - my $program = ''; - # FIXME: For Cygwin grep, do something about -E and -F switches - # - best to strip off any switches after first space in - # EgrepCmd etc and apply those as argument 1. - if( $type eq 'regex' ) { - $program = $TWiki::cfg{RCS}{EgrepCmd}; - } else { - $program = $TWiki::cfg{RCS}{FgrepCmd}; - } - - $program =~ s/%CS{(.*?)\|(.*?)}%/$options->{casesensitive}?$1:$2/ge; - $program =~ s/%DET{(.*?)\|(.*?)}%/$options->{files_without_match}?$2:$1/ge; - my $sDir = $TWiki::cfg{DataDir}.'/'.$this->{web}.'/'; - my $seen = {}; - # process topics in sets, fix for Codev.ArgumentListIsTooLongForSearch - my $maxTopicsInSet = 512; # max number of topics for a grep call - my @take = @$topics; - my @set = splice( @take, 0, $maxTopicsInSet ); - my $sandbox = $this->{session}->{sandbox}; - while( @set ) { - @set = map { "$sDir/$_.txt" } @set; - my ($matches, $exit ) = $sandbox->sysCommand( - $program, - TOKEN => $searchString, - FILES => \@set); - foreach my $match ( split( /\r?\n/, $matches )) { - if( $match =~ m/([^\/]*)\.txt(:(.*))?$/ ) { - push( @{$seen->{$1}}, $3 ); + my $matches = ''; + my %seen; + # Use the WikiRing native search if it is available, it is faster + # than forking grep. + eval 'use NativeTWikiSearch qw(cgrep)'; + unless ($@) { + my @fs; + push(@fs, "-i") unless $options->{casesensitive}; + push(@fs, "-l") if $options->{files_without_match}; + push(@fs, $searchString); + push(@fs, map { "$sDir/$_.txt" } @$topics); + my $matches = NativeTWikiSearch::cgrep(\@fs); + if (defined($matches)) { + for (@$matches) { + if (/([^\/]*)\.txt(:(.*))?$/) { + push( @{$seen{$1}}, $3 ); + } } } - @set = splice( @take, 0, $maxTopicsInSet ); + } elsif (exists $ENV{MOD_PERL}) { + # Use pure-perl grep if MOD_PERL, as the fork() used by TWiki::Sandbox + # is horribly inefficient with mod_perl + local $/ = "\n"; + if ($type eq 'regex') { + $searchString =~ s!/!\\/!g; + } else { + $searchString =~ s/(\W)/\\$1/g; + } + my $match_code = "/$searchString/o"; + $match_code .= 'i' unless ($options->{casesensitive}); + my $doMatch = eval "sub { $match_code }"; + FILE: + foreach my $file ( @$topics ) { + next unless open(FILE, "$sDir/$file.txt"); + while () { + if (&$doMatch()) { + push( @{$seen{$file}}, $_ ); + next FILE if $options->{files_without_match}; + } + } + } + } else { + # I18N: 'grep' must use locales if needed, + # for case-insensitive searching. See TWiki::setupLocale. + my $program = ''; + # FIXME: For Cygwin grep, do something about -E and -F switches + # - best to strip off any switches after first space in + # EgrepCmd etc and apply those as argument 1. + if( $type eq 'regex' ) { + $program = $TWiki::cfg{RCS}{EgrepCmd}; + } else { + $program = $TWiki::cfg{RCS}{FgrepCmd}; + } + + $program =~ s/%CS{(.*?)\|(.*?)}%/$options->{casesensitive}?$1:$2/ge; + $program =~ s/%DET{(.*?)\|(.*?)}%/$options->{files_without_match}?$2:$1/ge; + # process topics in sets, fix for Codev.ArgumentListIsTooLongForSearch + my $maxTopicsInSet = 512; # max number of topics for a grep call + my @take = @$topics; + my @set = splice( @take, 0, $maxTopicsInSet ); + my $sandbox = $this->{session}->{sandbox}; + while( @set ) { + @set = map { "$sDir/$_.txt" } @set; + my ($m, $exit ) = $sandbox->sysCommand( + $program, + TOKEN => $searchString, + FILES => \@set); + $matches .= $m; + @set = splice( @take, 0, $maxTopicsInSet ); + } + $matches =~ s/([^\/]*)\.txt(:(.*))?$/push( @{$seen{$1}}, $3 ); ''/gem; } - return $seen; + return \%seen; } =pod