> I want to do a fast case insensitive memory search for a string. At
> the moment I am using:
> for(j = 0; j < l; j++) { // l = length of file
> file[j] = tolower(file[j]); // make the document lowercase
> } // char by char
> where fileend is file+l and p is a pointer to a string and pend is a
> pointer to the end of that string.
That's apparently the wrong approach. I'd recommend taking a look at any CS
book; there are nice examples there. TAOCP is very nice to read too.
As an example, here is my _old_ :-) app, written for DOS, for searching
a dictionary. It took 1.5 seconds on an ancient laptop (24M/Cyrix 180)
to search for "zymotic" in a ~6 MB file (after a few launches the file was
cached). There were asm() pieces in the original code; however, with GCC
there's no longer any need for them -- -O2 reduces the time quite nicely...
[Pattern must be shorter than number of bits in "unsigned long", or
try "unsigned long long"]
[On my C400 it runs fine]
# ./a.out Mueller7GPL.koi zYmOtiC
------- translating -------
at offset: 5714140
zymotic _a. 1> <*zymotic*> 2> <*zymotic*>; zymotic diseases <*zymotic*> <*diseases*>
---------------
100.00%
Elapsed time: 2600
all's OK
[It's quite hard to read, but it shows the point]
[Yes, I was depressed when I wrote this ;-)]
[There's almost no error checking, yet it should work]
#include <sys/types.h>
#include <ctype.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <math.h>
#include <string.h>
#include <time.h>
/*
 * Report a fatal error through perror() and make the enclosing function
 * return -1.  x must be a string literal (it is pasted onto the prefix).
 * Wrapped in do { } while (0) so it behaves as a single statement, e.g.
 * between an `if` and its `else`.
 */
#define ERROR(x) do { perror("(fatal) " x); return -1; } while (0)

/*
 * Half of the read-buffer size.  An enumeration constant (rather than a
 * `const unsigned` object) so that it is a true integer constant expression
 * and may legally size a file-scope array in C.
 */
enum { bufl = (1 << 15) - 5000 };

/* Read buffer the input file is scanned through. */
unsigned char ch[bufl * 2];

/* Rename to `debug` to enable the bit-vector tracing in the search loop. */
#define nodebug

/* Search pattern; filled in at run time from the command line. */
char pattern[32];

/* Number of characters of pattern[] that are in use. */
char pl;

/*
 * Shift-or state words: r is the state after the current input byte, pr the
 * state before it.  A 0 bit means "still matching"; only bit 0 starts clear.
 */
unsigned long r, pr = ~1ul;

/*
 * Per-byte mismatch masks: bit (j + 1) of sss[c] is set when byte value c
 * does not match pattern[j].
 */
unsigned long sss[0x100];
/*
 * Debug helper: print the low (pl + 3) bits of v on one line, least
 * significant bit first.
 *
 * The digits are inverted on purpose: a clear bit is printed as '1' and a
 * set bit as '0', because in the shift-or state words a 0 bit means "match".
 */
void binp(unsigned long v) {
    for (int i = 0; i < pl + 3; i++) {
        printf("%c", (!(v & 1)) | 0x30);
        v >>= 1;
    }
    printf("\n");
}
/*
 * Fill sss[] with the case-insensitive mismatch masks for pattern[0..pl-1]:
 * bit (j + 1) of sss[c] is set when byte c equals neither pattern[j] nor its
 * other-case form.
 */
static void build_masks(void) {
    for (unsigned c = 0; c < 0x100; c++) {
        unsigned long mask = 0ul;
        for (int j = 0; j < pl; j++) {
            /* Compare as unsigned bytes so values >= 0x80 can match too. */
            unsigned pc = (unsigned char)pattern[j];
            int differs = pc != c;
            if (islower((int)c))
                differs &= (pc != (unsigned)toupper((int)c));
            if (isupper((int)c))
                differs &= (pc != (unsigned)tolower((int)c));
            /* Widen before shifting: j + 1 may exceed the width of int. */
            mask |= (unsigned long)differs << (j + 1);
        }
        sss[c] = mask;
    }
}

/*
 * Usage: prog FILE WORD
 *
 * Searches FILE, ignoring case, for "\nWORD  " (WORD at the start of a line,
 * followed by two spaces) using the shift-or algorithm.  For every hit it
 * prints the offset of the last matched byte, the matched text, and the rest
 * of that line with anything between '[' and ']' left out.
 *
 * Returns 0 on success, -1 on a usage, pattern-length or I/O error.
 */
int main(int argc,char*argv[]) {
    if (argc != 3) {
        printf("Not enough parameters, exiting\n");
        return -1;
    }

    /*
     * The full pattern is '\n' + WORD + two spaces.  It has to fit in
     * pattern[] and, because bit pl of the state word signals a match, pl
     * must be smaller than the bit width of unsigned long (8-bit bytes
     * assumed).
     */
    size_t word_len = strlen(argv[2]);
    size_t max_pl = sizeof sss[0] * 8 - 1;
    if (max_pl > sizeof pattern)
        max_pl = sizeof pattern;
    if (word_len + 3 > max_pl) {
        fprintf(stderr, "(fatal) pattern too long (at most %lu characters)\n",
                (unsigned long)(max_pl - 3));
        return -1;
    }
    pattern[0] = '\n';
    memcpy(&pattern[1], argv[2], word_len);
    pattern[word_len + 1] = ' ';
    pattern[word_len + 2] = ' ';
    pl = (char)(word_len + 3);

    int inf = open(argv[1], O_RDONLY);
    if (inf == -1)
        ERROR("opening input file");

    /* File length is only needed for the progress percentage. */
    off_t fl = lseek(inf, 0, SEEK_END);
    if (fl == (off_t)-1)
        ERROR("counting file's length");
    if (lseek(inf, 0, SEEK_SET) == (off_t)-1)
        ERROR("counting file's length");

    build_masks();

    unsigned long s = 0;    /* bytes consumed by earlier read() calls */
    size_t keep = 0;        /* bytes of the previous chunk kept at ch[0..] */
    int print = 0, ob = 0;  /* echoing a hit's line / inside [...] */

    clock_t start = clock();
    printf("\n\t\t------- translating -------\n");

    for (;;) {
        /*
         * New data goes after the retained tail, so a match that straddles
         * two reads can still be printed from the buffer.
         */
        ssize_t got = read(inf, ch + keep, sizeof ch - keep);
        if (got < 0)
            ERROR("reading file");
        if (got == 0)
            break;

        size_t lim = keep + (size_t)got;
        for (size_t i = keep; i < lim; i++) {
            if (print) {
                if (ch[i] == '[')
                    ob = 1;
                if (!ob)
                    printf("%c", ch[i]);
                if (ch[i] == ']')
                    ob = 0;
                if (ch[i] == '\n') {
                    print = 0;
                    printf("---------------\n");
                }
            }

            /* Shift-or step: a bit stays 0 only while the prefix matches. */
            r = (pr << 1) | sss[ch[i]];
#ifdef debug
            printf("\t\t-------\n");
            printf("r: ");binp(r);
            printf("p: ");binp(pr);
            printf("s: ");binp(sss[ch[i]]);
#endif
            pr = r;

            if (!(r & (1ul << pl))) {
                printf("at offset: %lu\n", s + (unsigned long)(i - keep));
                print = 1;
                /*
                 * Echo the matched text except its final byte.  At least pl
                 * bytes have been consumed by now and min(pl, consumed) of
                 * them are retained, so i + 1 >= pl always holds here.
                 */
                fwrite(ch + (i + 1 - (size_t)pl), 1, (size_t)pl - 1, stdout);
            }
        }

        s += (unsigned long)got;
        if (fl > 0)
            printf("%5.2f%%\t\t\t\r", (double)s * 100.0 / (double)fl);

        /* Keep the last pl bytes (or everything, if fewer) for the next pass. */
        keep = lim < (size_t)pl ? lim : (size_t)pl;
        memmove(ch, ch + (lim - keep), keep);
    }

    clock_t end = clock();
    /* clock() counts CPU time in units of 1 / CLOCKS_PER_SEC. */
    printf("\nElapsed time: %.2f s\n", (double)(end - start) / CLOCKS_PER_SEC);

    close(inf);
    printf("all's OK\n");
    return 0;
}