|
- /*===========================================================================
- Copyright (c) 1998-2000, The Santa Cruz Operation
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- *Redistributions of source code must retain the above copyright notice,
- this list of conditions and the following disclaimer.
-
- *Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- *Neither name of The Santa Cruz Operation nor the names of its contributors
- may be used to endorse or promote products derived from this software
- without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
- IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT falseT LIMITED TO,
- THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
- LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- CONSEQUENTIAL DAMAGES (INCLUDING, BUT falseT LIMITED TO, PROCUREMENT OF
- SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- INTERRUPTION)
- HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
- =========================================================================*/
-
-
- #include <ctype.h>
- #include <stdio.h>
- #include <stdlib.h>
- #if SHARE
- # include <sys/types.h>
- # include <sys/ipc.h>
- # include <sys/shm.h>
- # define ERR -1
- #endif
- #include "invlib.h"
- #include "global.h"
-
- #include <assert.h>
-
- #define DEBUG 0 /* debugging code and realloc messages */
- #define BLOCKSIZE 2 * BUFSIZ /* logical block size */
- #define POSTINC 10000 /* posting buffer size increment */
- #define SEP ' ' /* sorted posting field separator */
- #define SETINC 100 /* posting set size increment */
- #define STATS 0 /* print statistics */
- #define SUPERINC 10000 /* super index size increment */
- #define TERMMAX 512 /* term max size */
- #define FMTVERSION 1 /* inverted index format version */
- #define ZIPFSIZE 200 /* zipf curve size */
-
- #if DEBUG
- /* FIXME HBB 20010705: nowhere in the source is `invbreak' ever set to
- * a value other than the (silent) initialization to zero. Pretty
- * useless, that looks */
- int invbreak;
- #endif
-
- static int boolready(void);
- static int invnewterm(void);
- static void invstep(INVCONTROL *invcntl);
- static void invcannotalloc(unsigned n);
- static void invcannotopen(char *file);
- static void invcannotwrite(char *file);
-
- #if STATS
- int showzipf; /* show postings per term distribution */
- #endif
-
- static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
- static unsigned int setsize1, setsize2;
- static long numitems, totterm, zerolong;
- static char *indexfile, *postingfile;
- static FILE *outfile, *fpost;
- static size_t supersize = SUPERINC, supintsize;
- static unsigned int numpost, numlogblk, amtused, nextpost;
- static unsigned int lastinblk, numinvitems;
- static POSTING *POST, *postptr;
- static unsigned long *SUPINT, *supint, nextsupfing;
- static char *SUPFING, *supfing;
- static char thisterm[TERMMAX];
-
- typedef union logicalblk {
- long invblk[BLOCKSIZE / sizeof(long)];
- char chrblk[BLOCKSIZE];
- } t_logicalblk;
-
- static t_logicalblk logicalblk;
-
- #if DEBUG || STATS
- static long totpost;
- #endif
-
- #if STATS
- static int zipf[ZIPFSIZE + 1];
- #endif
-
- long invmake(char *invname, char *invpost, FILE *infile) {
- unsigned char *s;
- long num;
- int i;
- long fileindex = 0; /* initialze, to avoid warning */
- unsigned postsize = POSTINC * sizeof(*POST);
- unsigned long *intptr;
- char line[TERMMAX];
- long tlong;
- PARAM param;
- POSTING posting;
- char temp[BLOCKSIZE];
- #if STATS
- int j;
- unsigned maxtermlen = 0;
- #endif
- /* output file */
- if((outfile = vpfopen(invname, "w+b")) == NULL) {
- invcannotopen(invname);
- return (0);
- }
- indexfile = invname;
- fseek(outfile, BUFSIZ, SEEK_SET);
-
- /* posting file */
- if((fpost = vpfopen(invpost, "wb")) == NULL) {
- invcannotopen(invpost);
- return (0);
- }
- postingfile = invpost;
- nextpost = 0;
- /* get space for the postings list */
- if((POST = malloc(postsize)) == NULL) {
- invcannotalloc(postsize);
- return (0);
- }
- postptr = POST;
- /* get space for the superfinger (superindex) */
- if((SUPFING = malloc(supersize)) == NULL) {
- invcannotalloc(supersize);
- return (0);
- }
- supfing = SUPFING;
- /* FIXME HBB: magic number alert (40) */
- supintsize = supersize / 40u;
- /* also for the superfinger index */
- if((SUPINT = malloc(supintsize * sizeof(*SUPINT))) == NULL) {
- invcannotalloc(supintsize * sizeof(*SUPINT));
- return (0);
- }
- supint = SUPINT;
- supint++; /* leave first term open for a count */
- /* initialize using an empty term */
- strcpy(thisterm, "");
- *supint++ = 0;
- *supfing++ = ' ';
- *supfing++ = '\0';
- nextsupfing = 2;
- #if DEBUG || STATS
- totpost = 0L;
- #endif
- totterm = 0L;
- numpost = 1;
-
- /* set up as though a block had come and gone, i.e., set up for new block */
- /* 3 longs needed for: numinvitems, next block, and previous block */
- amtused = 3 * sizeof(long);
- numinvitems = 0;
- numlogblk = 0;
- lastinblk = sizeof(t_logicalblk);
-
- /* now loop as long as more to read (till eof) */
- while(fgets(line, TERMMAX, infile) != NULL) {
- #if DEBUG || STATS
- ++totpost;
- #endif
- s = strchr(line, SEP);
- if(s != NULL) {
- *s = '\0';
- } else {
- continue;
- }
- #if STATS
- if((i = strlen(line)) > maxtermlen) { maxtermlen = i; }
- #endif
- #if DEBUG
- printf("%ld: %s ", totpost, line);
- fflush(stdout);
- #endif
- if(strcmp(thisterm, line) == 0) {
- if((postptr + 10) > (POST + (postsize / sizeof(*POST)))) {
- i = postptr - POST;
- postsize += POSTINC * sizeof(*POST);
- if((POST = realloc(POST, postsize)) == NULL) {
- invcannotalloc(postsize);
- return (0);
- }
- postptr = i + POST;
- #if DEBUG
- printf("reallocated post space to %u, totpost=%ld\n", postsize, totpost);
- #endif
- }
- numpost++;
- } else {
- /* have a new term */
- if(!invnewterm()) { return (0); }
- strcpy(thisterm, line);
- numpost = 1;
- postptr = POST;
- fileindex = 0;
- }
- /* get the new posting */
- num = *++s - '!';
- i = 1;
- do {
- num = BASE * num + *++s - '!';
- } while(++i < PRECISION);
- posting.lineoffset = num;
- while(++fileindex < nsrcfiles && num > srcoffset[fileindex]) {
- ;
- }
- posting.fileindex = --fileindex;
- posting.type = *++s;
- ++s;
- if(*s != '\n') {
- num = *++s - '!';
- while(*++s != '\n') {
- num = BASE * num + *s - '!';
- }
- posting.fcnoffset = num;
- } else {
- posting.fcnoffset = 0;
- }
- *postptr++ = posting;
- #if DEBUG
- printf("%ld %ld %ld %ld\n",
- posting.fileindex,
- posting.fcnoffset,
- posting.lineoffset,
- posting.type);
- fflush(stdout);
- #endif
- }
- if(!invnewterm()) { return (0); }
- /* now clean up final block */
- logicalblk.invblk[0] = numinvitems;
- /* loops pointer around to start */
- logicalblk.invblk[1] = 0;
- logicalblk.invblk[2] = numlogblk - 1;
- if(fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) { goto cannotwrite; }
- numlogblk++;
- /* write out block to save space. what in it doesn't matter */
- if(fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) { goto cannotwrite; }
- /* finish up the super finger */
- *SUPINT = numlogblk;
- /* add to the offsets the size of the offset pointers */
- intptr = (SUPINT + 1);
- i = (char *)supint - (char *)SUPINT;
- while(intptr < supint)
- *intptr++ += i;
- /* write out the offsets (1 for the N at start) and the super finger */
- if(fwrite(SUPINT, sizeof(*SUPINT), numlogblk + 1, outfile) == 0 ||
- fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
- goto cannotwrite;
- }
- /* save the size for reference later */
- nextsupfing = sizeof(long) + sizeof(long) * numlogblk + (supfing - SUPFING);
- /* make sure the file ends at a logical block boundary. This is
- necessary for invinsert to correctly create extended blocks
- */
- i = nextsupfing % sizeof(t_logicalblk);
- /* write out junk to fill log blk */
- if(fwrite(temp, sizeof(t_logicalblk) - i, 1, outfile) == 0 ||
- fflush(outfile) == EOF) { /* rewind doesn't check for write failure */
- goto cannotwrite;
- }
- /* write the control area */
- rewind(outfile);
- param.version = FMTVERSION;
- param.filestat = 0;
- param.sizeblk = sizeof(t_logicalblk);
- param.startbyte = (numlogblk + 1) * sizeof(t_logicalblk) + BUFSIZ;
- ;
- param.supsize = nextsupfing;
- param.cntlsize = BUFSIZ;
- param.share = 0;
- if(fwrite(¶m, sizeof(param), 1, outfile) == 0) { goto cannotwrite; }
- for(i = 0; i < 10; i++) /* for future use */
- if(fwrite(&zerolong, sizeof(zerolong), 1, outfile) == 0) { goto cannotwrite; }
-
- /* make first block loop backwards to last block */
- if(fflush(outfile) == EOF) { /* fseek doesn't check for write failure */
- goto cannotwrite;
- }
- /* get to second word first block */
- fseek(outfile, BUFSIZ + 2 * sizeof(long), SEEK_SET);
- tlong = numlogblk - 1;
- if(fwrite(&tlong, sizeof(tlong), 1, outfile) == 0 || fclose(outfile) == EOF) {
- cannotwrite:
- invcannotwrite(invname);
- return (0);
- }
- if(fclose(fpost) == EOF) {
- invcannotwrite(postingfile);
- return (0);
- }
- --totterm; /* don't count null term */
- #if STATS
- printf("logical blocks = %d, postings = %ld, terms = %ld, max term length = %d\n",
- numlogblk,
- totpost,
- totterm,
- maxtermlen);
- if(showzipf) {
- printf("\n************* ZIPF curve ****************\n");
- for(j = ZIPFSIZE; j > 1; j--)
- if(zipf[j]) break;
- for(i = 1; i < j; ++i) {
- printf("%3d -%6d ", i, zipf[i]);
- if(i % 6 == 0) putchar('\n');
- }
- printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
- }
- #endif
- /* free all malloc'd memory */
- free(POST);
- free(SUPFING);
- free(SUPINT);
- return (totterm);
- }
-
- /* add a term to the data base */
-
- static int invnewterm(void) {
- int backupflag, i, j, holditems, gooditems, howfar;
- unsigned int maxback, len, numwilluse, wdlen;
- char *tptr, *tptr3;
-
- union {
- unsigned long packword[2];
- ENTRY e;
- } iteminfo;
-
- gooditems = 0; /* initialize, to avoid warning */
- totterm++;
- #if STATS
- /* keep zipfian info on the distribution */
- if(numpost <= ZIPFSIZE)
- zipf[numpost]++;
- else
- zipf[0]++;
- #endif
- len = strlen(thisterm);
- /* length of term rounded up to long boundary */
- wdlen = (len + (sizeof(long) - 1)) / sizeof(long);
- /* each term needs 2 longs for its iteminfo and
- * 1 long for its offset */
- numwilluse = (wdlen + 3) * sizeof(long);
- /* new block if at least 1 item in block */
- if(numinvitems && numwilluse + amtused > sizeof(t_logicalblk)) {
- /* set up new block */
- if(supfing + 500u > SUPFING + supersize) {
- i = supfing - SUPFING;
- supersize += 20000u;
- if((SUPFING = realloc(SUPFING, supersize)) == NULL) {
- invcannotalloc(supersize);
- return (0);
- }
- supfing = i + SUPFING;
- #if DEBUG
- printf("reallocated superfinger space to %d, totpost=%ld\n",
- supersize,
- totpost);
- #endif
- }
- /* check that room for the offset as well */
- /* FIXME HBB: magic number alert (10) */
- if((numlogblk + 10) > supintsize) {
- i = supint - SUPINT;
- supintsize += SUPERINC;
- if((SUPINT = realloc(SUPINT, supintsize * sizeof(*SUPINT))) == NULL) {
- invcannotalloc(supintsize * sizeof(*SUPINT));
- return (0);
- }
- supint = i + SUPINT;
- #if DEBUG
- printf("reallocated superfinger offset to %d, totpost = %ld\n",
- supintsize * sizeof(*SUPINT),
- totpost);
- #endif
- }
- /* See if backup is efficatious */
- backupflag = 0;
- maxback = (int)strlen(thisterm) / 10;
- holditems = numinvitems;
- if(maxback > numinvitems) maxback = numinvitems - 2;
- howfar = 0;
- while(maxback-- > 1) {
- howfar++;
- iteminfo.packword[0] =
- logicalblk.invblk[--holditems * 2 + (sizeof(long) - 1)];
- if((i = iteminfo.e.size / 10) < maxback) {
- maxback = i;
- backupflag = howfar;
- gooditems = holditems;
- }
- }
- /* see if backup will occur */
- if(backupflag) { numinvitems = gooditems; }
- logicalblk.invblk[0] = numinvitems;
- /* set forward pointer pointing to next */
- logicalblk.invblk[1] = numlogblk + 1;
- /* set back pointer to last block */
- logicalblk.invblk[2] = numlogblk - 1;
- if(fwrite(logicalblk.chrblk, 1, sizeof(t_logicalblk), outfile) == 0) {
- invcannotwrite(indexfile);
- return (0);
- }
- /* 3 longs needed for: numinvitems, next block, and previous block */
- amtused = 3 * sizeof(long);
- numlogblk++;
- /* check if had to back up, if so do it */
- if(backupflag) {
- char *tptr2;
-
- /* find out where the end of the new block is */
- iteminfo.packword[0] = logicalblk.invblk[numinvitems * 2 + 1];
- tptr3 = logicalblk.chrblk + iteminfo.e.offset;
- /* move the index for this block */
- for(i = 3; i <= (backupflag * 2 + 2); i++)
- logicalblk.invblk[i] = logicalblk.invblk[numinvitems * 2 + i];
- /* move the word into the super index */
- iteminfo.packword[0] = logicalblk.invblk[3];
- iteminfo.packword[1] = logicalblk.invblk[4];
- tptr2 = logicalblk.chrblk + iteminfo.e.offset;
- strncpy(supfing, tptr2, (int)iteminfo.e.size);
- *(supfing + iteminfo.e.size) = '\0';
- #if DEBUG
- printf("backup %d at term=%s to term=%s\n", backupflag, thisterm, supfing);
- #endif
- *supint++ = nextsupfing;
- nextsupfing += strlen(supfing) + 1;
- supfing += strlen(supfing) + 1;
- /* now fix up the logical block */
- tptr = logicalblk.chrblk + lastinblk;
- lastinblk = sizeof(t_logicalblk);
- tptr2 = logicalblk.chrblk + lastinblk;
- j = tptr3 - tptr;
- while(tptr3 > tptr)
- *--tptr2 = *--tptr3;
- lastinblk -= j;
- amtused += ((2 * sizeof(long)) * backupflag + j);
- for(i = 3; i < (backupflag * 2 + 2); i += 2) {
- iteminfo.packword[0] = logicalblk.invblk[i];
- iteminfo.e.offset += (tptr2 - tptr3);
- logicalblk.invblk[i] = iteminfo.packword[0];
- }
- numinvitems = backupflag;
- } else { /* no backup needed */
- numinvitems = 0;
- lastinblk = sizeof(t_logicalblk);
- /* add new term to superindex */
- strcpy(supfing, thisterm);
- supfing += strlen(thisterm) + 1;
- *supint++ = nextsupfing;
- nextsupfing += strlen(thisterm) + 1;
- }
- }
- /* HBB 20010501: Fixed bug by replacing magic number '8' by
- * what it actually represents. */
- lastinblk -= (numwilluse - 2 * sizeof(long));
- iteminfo.e.offset = lastinblk;
- iteminfo.e.size = len;
- iteminfo.e.space = 0;
- iteminfo.e.post = numpost;
- strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
- amtused += numwilluse;
- logicalblk.invblk[(lastinblk / sizeof(long)) + wdlen] = nextpost;
- if((i = postptr - POST) > 0) {
- if(fwrite(POST, sizeof(*POST), i, fpost) == 0) {
- invcannotwrite(postingfile);
- return (0);
- }
- nextpost += i * sizeof(*POST);
- }
- logicalblk.invblk[3 + 2 * numinvitems++] = iteminfo.packword[0];
- logicalblk.invblk[2 + 2 * numinvitems] = iteminfo.packword[1];
- return (1);
- }
-
- /*
- * If 'invname' ends with the 'from' substring, it is replaced inline with the
- * 'to' substring (which must be of the exact same length), and the function
- * returns 0. Otherwise, returns -1.
- */
-
- static int invflipname(char *invname, const char *from, const char *to) {
- char *temp, *i = NULL;
-
- assert(strlen(from) == strlen(to));
-
- temp = invname - 1;
- while((temp = strstr(temp + 1, from)))
- i = temp;
- if(!i || i[strlen(from)] != '\0') return -1;
- while(*to)
- *i++ = *to++;
- return 0;
- }
-
- /* small helper function to centralize handling of binary opening
- * for reading, and use of the 'stat" flag */
- static FILE *open_for_reading(char *name, int stat) {
- return vpfopen(name, ((stat == 0) ? "rb" : "r+b"));
- }
-
- /* handle opening of a file under a possibly "flipped" name */
- /* If db created without '-f', but now invoked with '-f cscope.out',
- * we need to check for 'cscope.in.out', rather than 'cscope.out.in':
- * I.e, hack around our own violation of the inverse db naming convention */
- /* more silliness: if you create the db with '-f cscope', then try to open
- * it without '-f cscope', you'll fail unless we check for 'cscope.out.in'
- * here. */
- static FILE *open_file_with_flipped_name(char *name, const char *flip_in,
- const char *flip_out, int stat) {
- if(!invflipname(name, flip_in, flip_out)) {
- FILE *fptr = open_for_reading(name, stat);
- if(!fptr) /* flip back for error message */
- invflipname(name, flip_out, flip_in);
- return fptr;
- };
- return 0;
- }
-
- static FILE *open_file_with_possibly_flipped_name(char *name, const char *flip1,
- const char *flip2, int stat) {
- FILE *fptr = open_for_reading(name, stat);
-
- if(!fptr) fptr = open_file_with_flipped_name(name, flip2, flip1, stat);
- if(!fptr) fptr = open_file_with_flipped_name(name, flip1, flip2, stat);
- return fptr;
- }
-
- int invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat) {
- int read_index;
-
- invcntl->invfile =
- open_file_with_possibly_flipped_name(invname, INVNAME, INVNAME2, stat);
- if(!invcntl->invfile) {
- invcannotopen(invname);
- return (-1);
- }
- if(fread(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile) == 0) {
- fprintf(stderr, PROGRAM_NAME ": empty inverted file\n");
- fclose(invcntl->invfile);
- return (-1);
- }
- if(invcntl->param.version != FMTVERSION) {
- fprintf(stderr,
- PROGRAM_NAME
- ": cannot read old index format; use -U option to force database to rebuild\n");
- fclose(invcntl->invfile);
- return (-1);
- }
- assert(invcntl->param.sizeblk == sizeof(t_logicalblk));
-
- if(stat == 0 && invcntl->param.filestat == INVALONE) {
- fprintf(stderr, PROGRAM_NAME ": inverted file is locked\n");
- fclose(invcntl->invfile);
- return (-1);
- }
-
- invcntl->postfile =
- open_file_with_possibly_flipped_name(invpost, INVPOST, INVPOST2, stat);
- if(!invcntl->postfile) {
- invcannotopen(invpost);
- fclose(invcntl->invfile);
- return (-1);
- }
-
- /* allocate core for a logical block */
- if((invcntl->logblk = malloc((size_t)invcntl->param.sizeblk)) == NULL) {
- invcannotalloc((size_t)invcntl->param.sizeblk);
- fclose(invcntl->postfile);
- fclose(invcntl->invfile);
- return (-1);
- }
- /* allocate for and read in superfinger */
- read_index = 1;
- invcntl->iindex = NULL;
- #if SHARE
- if(invcntl->param.share == 1) {
- key_t shm_key;
- struct shmid_ds shm_buf;
- int shm_id;
-
- /* see if the shared segment exists */
- shm_key = ftok(invname, 2);
- shm_id = shmget(shm_key, 0, 0);
- /* Failure simply means (hopefully) that segment doesn't exists */
- if(shm_id == -1) {
- /* Have to give general write permission due to AMdahl not having protected
- * segments */
- shm_id =
- shmget(shm_key, invcntl->param.supsize + sizeof(long), IPC_CREAT | 0666);
- if(shm_id == -1) perror("Could not create shared memory segment");
- } else
- read_index = 0;
-
- if(shm_id != -1) {
- invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY));
- if(invcntl->iindex == (char *)ERR) {
- fprintf(stderr, PROGRAM_NAME ": shared memory link failed\n");
- invcntl->iindex = NULL;
- read_index = 1;
- }
- }
- }
- #endif
- if(invcntl->iindex == NULL) /* FIXME HBB: magic number alert (4, sizeof(long)) */
- invcntl->iindex = malloc((size_t)invcntl->param.supsize + 4 * sizeof(long));
- if(invcntl->iindex == NULL) {
- invcannotalloc((size_t)invcntl->param.supsize);
- free(invcntl->logblk);
- fclose(invcntl->postfile);
- fclose(invcntl->invfile);
- return (-1);
- }
- if(read_index) {
- fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
- fread(invcntl->iindex, (int)invcntl->param.supsize, 1, invcntl->invfile);
- }
- invcntl->numblk = -1;
- if(boolready() == -1) {
- fclose(invcntl->postfile);
- fclose(invcntl->invfile);
- return (-1);
- }
- /* write back out the control block if anything changed */
- invcntl->param.filestat = stat;
- if(stat > invcntl->param.filestat) {
- rewind(invcntl->invfile);
- fwrite(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile);
- }
- return (1);
- }
-
- /** invclose must be called to wrap things up and deallocate core **/
- void invclose(INVCONTROL *invcntl) {
- /* write out the control block in case anything changed */
- if(invcntl->param.filestat > 0) {
- invcntl->param.filestat = 0;
- rewind(invcntl->invfile);
- fwrite(&invcntl->param, 1, sizeof(invcntl->param), invcntl->invfile);
- }
- if(invcntl->param.filestat == INVALONE) {
- /* write out the super finger */
- fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
- fwrite(invcntl->iindex, 1, (int)invcntl->param.supsize, invcntl->invfile);
- }
- fclose(invcntl->invfile);
- fclose(invcntl->postfile);
- #if SHARE
- if(invcntl->param.share > 0) {
- shmdt(invcntl->iindex);
- invcntl->iindex = NULL;
- }
- #endif
- if(invcntl->iindex != NULL) free(invcntl->iindex);
- free(invcntl->logblk);
- }
-
- /** invstep steps the inverted file forward one item **/
- static void invstep(INVCONTROL *invcntl) {
- if(invcntl->keypnt < (invcntl->logblk->invblk[0] - 1)) {
- invcntl->keypnt++;
- return;
- }
-
- /* move forward a block else wrap */
- invcntl->numblk =
- invcntl->logblk->invblk[1]; /* was: *(int *)(invcntl->logblk + sizeof(long))*/
-
- /* now read in the block */
- fseek(invcntl->invfile,
- invcntl->numblk * invcntl->param.sizeblk + invcntl->param.cntlsize,
- SEEK_SET);
- fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile);
- invcntl->keypnt = 0;
- }
-
- /** invforward moves forward one term in the inverted file **/
- int invforward(INVCONTROL *invcntl) {
- invstep(invcntl);
- /* skip things with 0 postings */
- /* FIXME HBB: magic number alert! (3) */
- while(((ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt)->post == 0) {
- invstep(invcntl);
- }
- /* Check for having wrapped - reached start of inverted file! */
- if((invcntl->numblk == 0) && (invcntl->keypnt == 0)) return (0);
- return (1);
- }
-
- /** invterm gets the present term from the present logical block **/
- long invterm(INVCONTROL *invcntl, char *term) {
- ENTRY *entryptr;
-
- /* FIXME HBB: magic number alert! (3) */
- entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
- strncpy(term, invcntl->logblk->chrblk + entryptr->offset, (int)entryptr->size);
- *(term + entryptr->size) = '\0';
- return (entryptr->post);
- }
-
- /** invfind searches for an individual item in the inverted file **/
- long invfind(INVCONTROL *invcntl, char *searchterm) /* term being searched for */
- {
- int imid, ilow, ihigh;
- long num;
- int i;
- unsigned long *intptr, *intptr2;
- ENTRY *entryptr;
-
- /* make sure it is initialized via invready */
- if(invcntl->invfile == 0) return (-1L);
-
- /* now search for the appropriate finger block */
- intptr = (unsigned long *)invcntl->iindex;
-
- ilow = 0;
- ihigh = *intptr++ - 1;
- while(ilow <= ihigh) {
- imid = (ilow + ihigh) / 2;
- intptr2 = intptr + imid;
- i = strcmp(searchterm, (invcntl->iindex + *intptr2));
- if(i < 0)
- ihigh = imid - 1;
- else if(i > 0)
- ilow = ++imid;
- else {
- ilow = imid + 1;
- break;
- }
- }
- /* be careful about case where searchterm is after last in this block */
- imid = (ilow) ? ilow - 1 : 0;
-
- /* fetch the appropriate logical block if not in core */
- /* note always fetch it if the file is busy */
- if((imid != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) {
- fseek(invcntl->invfile,
- (imid * invcntl->param.sizeblk) + invcntl->param.cntlsize,
- SEEK_SET);
- invcntl->numblk = imid;
- fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile);
- }
-
- srch_ext:
- /* now find the term in this block. tricky this */
- intptr = (unsigned long *)invcntl->logblk->invblk;
-
- ilow = 0;
- ihigh = *intptr - 1;
- intptr += 3;
- num = 0;
- while(ilow <= ihigh) {
- imid = (ilow + ihigh) / 2;
- entryptr = (ENTRY *)intptr + imid;
- i = strncmp(searchterm,
- invcntl->logblk->chrblk + entryptr->offset,
- (int)entryptr->size);
- if(i == 0) i = strlen(searchterm) - entryptr->size;
- if(i < 0)
- ihigh = imid - 1;
- else if(i > 0)
- ilow = ++imid;
- else {
- num = entryptr->post;
- break;
- }
- }
- /* be careful about case where searchterm is after last in this block */
- if(imid >= invcntl->logblk->invblk[0]) {
- invcntl->keypnt = invcntl->logblk->invblk[0];
- invstep(invcntl);
- /* note if this happens the term could be in extended block */
- if(invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk)
- goto srch_ext;
- } else
- invcntl->keypnt = imid;
- return (num);
- }
-
- #if DEBUG
-
- /** invdump dumps the block the term parameter is in **/
- void invdump(INVCONTROL *invcntl, char *term) {
- long i, j, n, *longptr;
- ENTRY *entryptr;
- char temp[512], *ptr;
-
- /* dump superindex if term is "-" */
- if(*term == '-') {
- j = atoi(term + 1);
- longptr = (long *)invcntl->iindex;
- n = *longptr++;
- printf("Superindex dump, num blocks=%ld\n", n);
- longptr += j;
- while((longptr <= ((long *)invcntl->iindex) + n) && invbreak == 0) {
- printf("%2ld %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr);
- longptr++;
- }
- return;
- } else if(*term == '#') {
- j = atoi(term + 1);
- /* fetch the appropriate logical block */
- invcntl->numblk = j;
- fseek(invcntl->invfile,
- (j * invcntl->param.sizeblk) + invcntl->param.cntlsize,
- SEEK_SET);
- fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile);
- } else
- i = abs((int)invfind(invcntl, term));
- longptr = invcntl->logblk->invblk;
- n = *longptr++;
- printf("Entry term to invdump=%s, postings=%ld, forwrd ptr=%ld, back ptr=%ld\n",
- term,
- i,
- *(longptr),
- *(longptr + 1));
- /* FIXME HBB: magic number alert! (3) */
- entryptr = (ENTRY *)(invcntl->logblk->invblk + 3);
- printf("%ld terms in this block, block=%ld\n", n, invcntl->numblk);
- printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
- for(j = 0; j < n && invbreak == 0; j++) {
- ptr = invcntl->logblk->chrblk + entryptr->offset;
- strncpy(temp, ptr, (int)entryptr->size);
- temp[entryptr->size] = '\0';
- ptr +=
- (sizeof(long) * (long)((entryptr->size + (sizeof(long) - 1)) / sizeof(long)));
- printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n",
- j,
- temp,
- entryptr->post,
- entryptr->size,
- entryptr->offset,
- entryptr->space,
- *(long *)ptr);
- entryptr++;
- }
- }
- #endif
-
- static int boolready(void) {
- numitems = 0;
- if(item1 != NULL) free(item1);
- setsize1 = SETINC;
- if((item1 = malloc(SETINC * sizeof(*item1))) == NULL) {
- invcannotalloc(SETINC);
- return (-1);
- }
- if(item2 != NULL) free(item2);
- setsize2 = SETINC;
- if((item2 = malloc(SETINC * sizeof(*item2))) == NULL) {
- invcannotalloc(SETINC);
- return (-1);
- }
- item = item1;
- enditem = item;
- return (0);
- }
-
- void boolclear(void) {
- numitems = 0;
- item = item1;
- enditem = item;
- }
-
- POSTING *boolfile(INVCONTROL *invcntl, long *num, int boolarg) {
- ENTRY *entryptr;
- FILE *file;
- void *ptr;
- unsigned long *ptr2;
- POSTING *newitem = NULL; /* initialize, to avoid warning */
- POSTING posting;
- unsigned u;
- POSTING *newsetp = NULL, *set1p;
- long newsetc, set1c, set2c;
-
- /* FIXME HBB: magic number alert! (3) */
- entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
- ptr = invcntl->logblk->chrblk + entryptr->offset;
- ptr2 = ((unsigned long *)ptr) + (entryptr->size + (sizeof(long) - 1)) / sizeof(long);
- *num = entryptr->post;
- switch(boolarg) {
- case bool_OR:
- case falseT:
- if(*num == 0) {
- *num = numitems;
- return (item);
- }
- }
- /* make room for the new set */
- u = 0;
- switch(boolarg) {
- case AND:
- case falseT:
- newsetp = item;
- break;
-
- case bool_OR:
- u = enditem - item;
- /* FALLTHROUGH */
- case REVERSEfalseT:
- u += *num;
- if(item == item2) {
- if(u > setsize1) {
- u += SETINC;
- if((item1 = realloc(item1, u * sizeof(*item1))) == NULL) {
- invcannotalloc(u * sizeof(*item1));
- boolready();
- *num = -1;
- return (NULL);
- }
- setsize1 = u;
- }
- newitem = item1;
- } else {
- if(u > setsize2) {
- u += SETINC;
- if((item2 = realloc(item2, u * sizeof(*item2))) == NULL) {
- invcannotalloc(u * sizeof(*item2));
- boolready();
- *num = -1;
- return (NULL);
- }
- setsize2 = u;
- }
- newitem = item2;
- }
- newsetp = newitem;
- }
- file = invcntl->postfile;
- fseek(file, *ptr2, SEEK_SET);
- fread(&posting, sizeof(posting), 1, file);
- newsetc = 0;
- switch(boolarg) {
- case bool_OR:
- /* while something in both sets */
- set1p = item;
- newsetp = newitem;
- for(set1c = 0, set2c = 0; set1c < numitems && set2c < *num; newsetc++) {
- if(set1p->lineoffset < posting.lineoffset) {
- *newsetp++ = *set1p++;
- set1c++;
- } else if(set1p->lineoffset > posting.lineoffset) {
- *newsetp++ = posting;
- fread(&posting, (int)sizeof(posting), 1, file);
- set2c++;
- } else if(set1p->type < posting.type) {
- *newsetp++ = *set1p++;
- set1c++;
- } else if(set1p->type > posting.type) {
- *newsetp++ = posting;
- fread(&posting, (int)sizeof(posting), 1, file);
- set2c++;
- } else { /* identical postings */
- *newsetp++ = *set1p++;
- set1c++;
- fread(&posting, (int)sizeof(posting), 1, file);
- set2c++;
- }
- }
- /* find out what ran out and move the rest in */
- if(set1c < numitems) {
- newsetc += numitems - set1c;
- while(set1c++ < numitems) {
- *newsetp++ = *set1p++;
- }
- } else {
- while(set2c++ < *num) {
- *newsetp++ = posting;
- newsetc++;
- fread(&posting, (int)sizeof(posting), 1, file);
- }
- }
- item = newitem;
- break; /* end of bool_OR */
- }
- numitems = newsetc;
- *num = newsetc;
- enditem = (POSTING *)newsetp;
- return ((POSTING *)item);
- }
-
- static void invcannotalloc(unsigned n) {
- fprintf(stderr, PROGRAM_NAME ": cannot allocate %u bytes\n", n);
- }
-
- static void invcannotopen(char *file) {
- fprintf(stderr, PROGRAM_NAME ": cannot open file %s\n", file);
- }
-
- static void invcannotwrite(char *file) {
- perror(PROGRAM_NAME); /* must be first to preserve errno */
- fprintf(stderr, PROGRAM_NAME ": write to file %s failed\n", file);
- }
|