Index: tools/native_search/NativeTWikiSearch.pm
===================================================================
--- tools/native_search/NativeTWikiSearch.pm	(revision 0)
+++ tools/native_search/NativeTWikiSearch.pm	(revision 0)
@@ -0,0 +1,13 @@
+# Copyright (C) 2007 WikiRing http://wikiring.com All Rights Reserved
+# Author: Crawford Currie
+# Perl interface to NativeTWikiSearch xs module
+package NativeTWikiSearch;
+
+require Exporter;
+require DynaLoader;
+@ISA = qw(Exporter DynaLoader);
+@EXPORT = qw( cgrep );
+
+bootstrap NativeTWikiSearch;
+
+1;
Index: tools/native_search/Makefile.PL
===================================================================
--- tools/native_search/Makefile.PL	(revision 0)
+++ tools/native_search/Makefile.PL	(revision 0)
@@ -0,0 +1,7 @@
+# Makefile for NativeTWikiSearch module
+use ExtUtils::MakeMaker;
+WriteMakefile(
+   'NAME' => 'NativeTWikiSearch',
+   'OBJECT' => 'NativeTWikiSearch.o',
+   'LIBS' => [ "-lpcreposix" ],
+);
Index: tools/native_search/NativeTWikiSearch.xs
===================================================================
--- tools/native_search/NativeTWikiSearch.xs	(revision 0)
+++ tools/native_search/NativeTWikiSearch.xs	(revision 0)
@@ -0,0 +1,210 @@
+/* Copyright (C) 2007 WikiRing http://wikiring.com All Rights Reserved
+ * Author: Crawford Currie
+ * Fast grep function designed for use from Perl. Does not suffer from
+ * limitations of `grep` viz. cost of spawning a subprocess, and
+ * limits on command-line length.
+ */
+#include "EXTERN.h"
+#include "perl.h"
+#include "XSUB.h"
+#include <pcreposix.h>
+
+#define LINEBUFSIZE  4192
+#define ERRBUFSIZE   256
+#define MATCHBUFSIZE 1
+
+/* Copy the static match buffer into heap memory, resizing as required */
+char** _backup(int mc, char** m, char** r) {
+    int curlen = 0;
+    char** newR;
+    if (!mc) {
+        return r;
+    }
+    if (r) {
+        while (r[curlen]) {
+            curlen++;
+        }
+    }
+    newR = (char**)safemalloc(sizeof(char*) * (curlen + mc + 1));
+    if (curlen) {
+        memcpy(newR, r, sizeof(char*) * curlen);
+    }
+    memcpy(&newR[curlen], m, sizeof(char*) * mc);
+    newR[curlen + mc] = (char*)NULL;
+    if (r) {
+        safefree(r);
+    }
+
+    return newR; 
+}
+
+/* Do a grep. Arguments are provided in argv, options first, then the
+ * pattern, then the file names. -i (case insensitive) and -l (report
+ * matching file names only) are the only options supported. */
+char** cgrep(char** argv) {
+    char** argptr = argv;
+    int reflags = REG_NOSUB;
+    int justFiles = 0;
+    FILE* f;
+    regex_t pattern;
+    regmatch_t match;
+    char linebuf[LINEBUFSIZE];
+    char* matchCache[MATCHBUFSIZE];
+    int matchCacheSize = 0;
+    char** result = (char**)NULL;
+    int resultSize;
+    char* fname;
+
+    while (*argptr) {
+        char* arg = *(argptr++);
+        if (strcmp(arg, "-i") == 0) {
+            reflags |= REG_ICASE;
+            safefree(arg);
+        } else if (strcmp(arg, "-l") == 0) {
+            justFiles = 1;
+            safefree(arg);
+        } else {
+            int ern;
+            if (ern = regcomp(&pattern, arg, reflags)) {
+                char erb[ERRBUFSIZE];
+                regerror(ern, &pattern, erb, ERRBUFSIZE);
+                warn(erb);
+                safefree(arg);
+                return (char**)NULL;
+            }
+            safefree(arg);
+            break;
+        }
+    }
+    while (*argptr) {
+        fname = *(argptr++);
+        f = fopen(fname, "r");
+        if (f) {
+            int ern;
+            int mi;
+            int size;
+            char ch = 0;
+            while (ch >= 0) {
+                int chc = 0;
+                while ((ch = fgetc(f)) >= 0) {
+                    if (ch == '\n' || chc == LINEBUFSIZE - 1) {
+                        break; /* got a lineful */
+                    }
+                    linebuf[chc++] = ch;
+                }
+                linebuf[chc] = '\0';
+                if ((ern = regexec(&pattern, linebuf, 1, &match, 0)) == 0) {
+                    /* Successful match */
+                    if (matchCacheSize == MATCHBUFSIZE) {
+                        result = _backup(matchCacheSize, matchCache, result);
+                        matchCacheSize = 0;
+                    }
+                    mi = matchCacheSize++;
+                    size = strlen(fname);
+                    if (!justFiles) {
+                        size += 1 + strlen(linebuf);
+                    }
+                    matchCache[mi] = (char*)safemalloc(size + 1);
+                    strcpy(matchCache[mi], fname);
+                    if (!justFiles) {
+                        strcat(matchCache[mi], ":");
+                        strcat(matchCache[mi], linebuf);
+                        /* go to next matching line in this file */
+                    }
+                    if (justFiles) {
+                        break; /* go to next file */
+                    }
+                }
+            }        
+            fclose(f);
+            safefree(fname);
+        } else {
+            warn("Open failed");
+        }
+    }
+    safefree(argv);
+    result = _backup(matchCacheSize, matchCache, result);
+    return result;
+}
+
+/* Next two functions taken from
+ * http://search.cpan.org/src/TBUSCH/Lucene-0.06/Av_CharPtrPtr.cpp
+ * and modified
+ */
+char ** XS_unpack_charPtrPtr(SV* rv )
+{
+	AV *av;
+	SV **ssv;
+	char **s;
+	int avlen;
+	int x;
+
+	if( SvROK( rv ) && (SvTYPE(SvRV(rv)) == SVt_PVAV) )
+		av = (AV*)SvRV(rv);
+	else {
+		warn("XS_unpack_charPtrPtr: rv was not an AV ref");
+		return( (char**)NULL );
+	}
+
+	/* is it empty? */
+	avlen = av_len(av);
+	if( avlen < 0 ){
+		warn("XS_unpack_charPtrPtr: array was empty");
+		return( (char**)NULL );
+	}
+
+	/* av_len+2 == number of strings, plus 1 for an end-of-array sentinel.
+	 */
+	s = (char **)safemalloc( sizeof(char*) * (avlen + 2) );
+	if( s == NULL ){
+		warn("XS_unpack_charPtrPtr: unable to malloc char**");
+		return( (char**)NULL );
+	}
+	for( x = 0; x <= avlen; ++x ){
+		ssv = av_fetch( av, x, 0 );
+		if( ssv != NULL ){
+			if( SvPOK( *ssv ) ){
+				s[x] = (char *)safemalloc( SvCUR(*ssv) + 1 );
+				if( s[x] == NULL )
+					warn("XS_unpack_charPtrPtr: unable to malloc char*");
+				else
+					strcpy( s[x], SvPV( *ssv, PL_na ) );
+			}
+			else
+				warn("XS_unpack_charPtrPtr: array elem %d was not a string.", x );
+		}
+		else
+			s[x] = (char*)NULL;
+	}
+	s[x] = (char*)NULL; /* sentinel */
+	return( s );
+}
+
+/* Used by the OUTPUT typemap for char**.
+ * Will convert a C char** to a Perl AV*, freeing the char** and the strings
+ * stored in it
+ */
+void XS_pack_charPtrPtr(SV* st, char **s, int n)
+{
+	AV *av = newAV();
+	SV *sv;
+	char **c;
+
+	for( c = s; *c != NULL; ++c ){
+		sv = newSVpv( *c, 0 );
+        safefree(*c);
+		av_push( av, sv );
+	}
+	sv = newSVrv( st, NULL );	/* upgrade stack SV to an RV */
+	SvREFCNT_dec( sv );         /* discard */
+	SvRV( st ) = (SV*)av;       /* make stack RV point at our AV */
+    safefree(s);
+}
+
+MODULE = FastSearch     PACKAGE = FastSearch
+
+char**
+cgrep(argv)
+	char ** argv
+    PREINIT:
+        int count_charPtrPtr;
Index: test/bin/make_big.pl
===================================================================
--- test/bin/make_big.pl	(revision 0)
+++ test/bin/make_big.pl	(revision 0)
@@ -0,0 +1,119 @@
+#!/usr/bin/perl
+# Copyright (C) WikiRing 2007
+# Author: Crawford Currie
+# Generate large test data. This script is primarily designed for running
+# stand-alone to generate large test data. It requires /usr/lib/dict
+# to be installed.
+
+use strict;
+
+sub usage {
+    print STDERR <<USAGE;
+Usage: $0 <options>
+Options:
+    -webs <w> - generate <w> webs (default 1)
+    -topics <t> - generate <t> additional topics in
+     each web (default 0)
+    -size <s> - target number of words to put in
+     each additional topic (default 501)
+    -base <base> - base for new web and topic names
+     (default IncredibleHulk)
+
+This script must be run while cd'ed to the root directory
+of a TWiki install. It checks for data and pub dirs and
+refuses to run without them.
+
+Using <base> as the base for new web names, generates
+<w> new webs using _default as the basis. The standard
+topics from _default are always included, and will
+generate <t> additional topics in each generated web.
+Additional topics are named using <base>, are plain text,
+and are generated using words picked from
+/usr/share/dict. Each new topic will contain <s> words
+taken sequentially from the dictionary. Generated topics
+have no histories and no meta-data, just text.
+
+Web names are generated from <base> by appending decimal
+numbers to generate unique web names. Topic names are
+generated the same way. You are recommended to use unique
+web names to make a later rm -r as safe as possible.
+
+USAGE
+    exit 1;
+}
+
+unless (-w "data" && -w "pub") {
+    usage();
+}
+
+my $dict = '/usr/share/dict/words';
+my $dict_fh;
+
+# Get $n words from the dictionary
+sub getWords {
+    my ($n) = @_;
+    local $/ = "\n";
+    my $words = '';
+    my $word;
+    if (!$dict_fh) {
+        open($dict_fh, "<$dict") || die $!;
+    }
+    while ($n) {
+        while ($n && ($word = <$dict_fh>)) {
+            $words .= $word;
+            $n--;
+        }
+        last unless $n;
+        close($dict_fh);
+        open($dict_fh, "<$dict");
+    }
+
+    return $words;
+}
+
+my %opts = (
+    webs => 1,
+    topics => 0,
+    size => 501,
+    base => 'IncredibleHulk',
+);
+
+while (my $arg = shift @ARGV) {
+    if ($arg =~ /^-(\w+)$/) {
+        $opts{$1} = shift @ARGV;
+    } else {
+        print STDERR "Unrecognised option $arg";
+        usage();
+    }
+}
+
+my $newWebs = 0;
+my $nextWeb = 0;
+while ($newWebs < $opts{webs}) {
+    while (-e "data/$opts{base}$nextWeb") {
+        $nextWeb++;
+    }
+    my $web = "$opts{base}$nextWeb";
+    # Create the web
+    mkdir("data/$web");
+    `cp data/_default/*.txt data/$web`;
+    my $newTopics = 0;
+    my $nextTopic = 0;
+    while ($newTopics < $opts{topics}) {
+        while (-e "data/$web/$opts{base}$nextTopic.txt") {
+            $nextTopic++;
+        }
+        my $topic = "$opts{base}$nextTopic";
+        open(TOPIC, ">data/$web/$topic.txt") || die $!;
+        my $t = time();
+        print TOPIC <<FLUFF;
+%META:TOPICINFO{author="TWikiContributor" date="$t" format="1.1" version="1"}%
+FLUFF
+        print TOPIC getWords($opts{size});
+        close(TOPIC);
+        $newTopics++;
+        print "Generated topic $topic                             \r";
+    }
+    print "Generated web $web                                     \n";
+    $newWebs++;
+}

Property changes on: test/bin/make_big.pl
___________________________________________________________________
Name: svn:executable
   + *

Index: lib/TWiki/Store/RcsFile.pm
===================================================================
--- lib/TWiki/Store/RcsFile.pm	(revision 12498)
+++ lib/TWiki/Store/RcsFile.pm	(working copy)
@@ -328,43 +328,80 @@
     my( $this, $searchString, $topics, $options ) = @_;
     ASSERT(defined $options) if DEBUG;
     my $type = $options->{type} || '';
-
-    # I18N: 'grep' must use locales if needed,
-    # for case-insensitive searching.  See TWiki::setupLocale.
-    my $program = '';
-    # FIXME: For Cygwin grep, do something about -E and -F switches
-    # - best to strip off any switches after first space in
-    # EgrepCmd etc and apply those as argument 1.
-    if( $type eq 'regex' ) {
-        $program = $TWiki::cfg{RCS}{EgrepCmd};
-    } else {
-        $program = $TWiki::cfg{RCS}{FgrepCmd};
-    }
-
-    $program =~ s/%CS{(.*?)\|(.*?)}%/$options->{casesensitive}?$1:$2/ge;
-    $program =~ s/%DET{(.*?)\|(.*?)}%/$options->{files_without_match}?$2:$1/ge;
-
     my $sDir = $TWiki::cfg{DataDir}.'/'.$this->{web}.'/';
-    my $seen = {};
-    # process topics in sets, fix for Codev.ArgumentListIsTooLongForSearch
-    my $maxTopicsInSet = 512; # max number of topics for a grep call
-    my @take = @$topics;
-    my @set = splice( @take, 0, $maxTopicsInSet );
-    my $sandbox = $this->{session}->{sandbox};
-    while( @set ) {
-        @set = map { "$sDir/$_.txt" } @set;
-        my ($matches, $exit ) = $sandbox->sysCommand(
-            $program,
-            TOKEN => $searchString,
-            FILES => \@set);
-        foreach my $match ( split( /\r?\n/, $matches )) {
-            if( $match =~ m/([^\/]*)\.txt(:(.*))?$/ ) {
-                push( @{$seen->{$1}}, $3 );
+    my $matches = '';
+    my %seen;
+    # Use the WikiRing native search if it is available, it is faster
+    # than forking grep.
+    eval 'use NativeTWikiSearch qw(cgrep)';
+    unless ($@) {
+        my @fs;
+        push(@fs, "-i") unless $options->{casesensitive};
+        push(@fs, "-l") if $options->{files_without_match};
+        push(@fs, $searchString);
+        push(@fs, map { "$sDir/$_.txt" } @$topics);
+        my $matches = NativeTWikiSearch::cgrep(\@fs);
+        if (defined($matches)) {
+            for (@$matches) {
+                if (/([^\/]*)\.txt(:(.*))?$/) {
+                    push( @{$seen{$1}}, $3 );
+                }
             }
         }
-        @set = splice( @take, 0, $maxTopicsInSet );
+    } elsif (exists $ENV{MOD_PERL}) {
+        # Use pure-perl grep if MOD_PERL, as the fork() used by TWiki::Sandbox
+        # is horribly inefficient with mod_perl
+        local $/ = "\n";
+        if ($type eq 'regex') {
+            $searchString =~ s!/!\\/!g;
+        } else {
+            $searchString =~ s/(\W)/\\$1/g;
+        }
+        my $match_code = "/$searchString/o";
+        $match_code .= 'i' unless ($options->{casesensitive});
+        my $doMatch = eval "sub { $match_code }";
+      FILE:
+        foreach my $file ( @$topics ) {
+            next unless open(FILE, "$sDir/$file.txt");
+            while (<FILE>) {
+                if (&$doMatch()) {
+                    push( @{$seen{$file}}, $_ );
+                    next FILE if $options->{files_without_match};
+                }
+            }
+        }
+    } else {
+        # I18N: 'grep' must use locales if needed,
+        # for case-insensitive searching.  See TWiki::setupLocale.
+        my $program = '';
+        # FIXME: For Cygwin grep, do something about -E and -F switches
+        # - best to strip off any switches after first space in
+        # EgrepCmd etc and apply those as argument 1.
+        if( $type eq 'regex' ) {
+            $program = $TWiki::cfg{RCS}{EgrepCmd};
+        } else {
+            $program = $TWiki::cfg{RCS}{FgrepCmd};
+        }
+
+        $program =~ s/%CS{(.*?)\|(.*?)}%/$options->{casesensitive}?$1:$2/ge;
+        $program =~ s/%DET{(.*?)\|(.*?)}%/$options->{files_without_match}?$2:$1/ge;
+        # process topics in sets, fix for Codev.ArgumentListIsTooLongForSearch
+        my $maxTopicsInSet = 512; # max number of topics for a grep call
+        my @take = @$topics;
+        my @set = splice( @take, 0, $maxTopicsInSet );
+        my $sandbox = $this->{session}->{sandbox};
+        while( @set ) {
+            @set = map { "$sDir/$_.txt" } @set;
+            my ($m, $exit ) = $sandbox->sysCommand(
+                $program,
+                TOKEN => $searchString,
+                FILES => \@set);
+            $matches .= $m;
+            @set = splice( @take, 0, $maxTopicsInSet );
+        }
+        $matches =~ s/([^\/]*)\.txt(:(.*))?$/push( @{$seen{$1}}, $3 ); ''/gem;
     }
-    return $seen;
+    return \%seen;
 }
 
 =pod