2023-07-27 14:04:50 -04:00
|
|
|
/*===========================================================================
|
2023-08-04 14:34:51 -04:00
|
|
|
Copyright (c) 1998-2000, The Santa Cruz Operation
|
2023-07-27 14:04:50 -04:00
|
|
|
All rights reserved.
|
2023-08-04 14:34:51 -04:00
|
|
|
|
2023-07-27 14:04:50 -04:00
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
|
|
|
|
*Redistributions of source code must retain the above copyright notice,
|
|
|
|
this list of conditions and the following disclaimer.
|
|
|
|
|
|
|
|
*Redistributions in binary form must reproduce the above copyright notice,
|
|
|
|
this list of conditions and the following disclaimer in the documentation
|
|
|
|
and/or other materials provided with the distribution.
|
|
|
|
|
|
|
|
*Neither name of The Santa Cruz Operation nor the names of its contributors
|
|
|
|
may be used to endorse or promote products derived from this software
|
2023-08-04 14:34:51 -04:00
|
|
|
without specific prior written permission.
|
2023-07-27 14:04:50 -04:00
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
|
2023-08-04 15:09:58 -04:00
|
|
|
IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
|
2023-07-27 14:04:50 -04:00
|
|
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
2023-08-04 15:09:58 -04:00
|
|
|
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
|
2023-07-27 14:04:50 -04:00
|
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
2023-08-04 15:09:58 -04:00
|
|
|
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
2023-07-27 14:04:50 -04:00
|
|
|
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
INTERRUPTION)
|
|
|
|
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
|
2023-08-04 14:34:51 -04:00
|
|
|
DAMAGE.
|
2023-07-27 14:04:50 -04:00
|
|
|
=========================================================================*/
|
|
|
|
|
|
|
|
|
|
|
|
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if SHARE
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#define ERR -1
#endif
#include "invlib.h"
#include "global.h"

#include <assert.h>
|
|
|
|
|
2023-08-04 13:49:03 -04:00
|
|
|
/*
 * Build-time configuration for the inverted-index code.
 *
 * BLOCKSIZE is parenthesized so that it expands as a single value in any
 * expression (e.g. "x / BLOCKSIZE" or "x % BLOCKSIZE").
 */
#define DEBUG		0		/* debugging code and realloc messages */
#define BLOCKSIZE	(2 * BUFSIZ)	/* logical block size */
#define POSTINC		10000		/* posting buffer size increment */
#define SEP		' '		/* sorted posting field separator */
#define SETINC		100		/* posting set size increment */
#define STATS		0		/* print statistics */
#define SUPERINC	10000		/* super index size increment */
#define TERMMAX		512		/* term max size */
#define FMTVERSION	1		/* inverted index format version */
#define ZIPFSIZE	200		/* zipf curve size */
|
2023-07-27 14:04:50 -04:00
|
|
|
|
|
|
|
#if DEBUG
|
|
|
|
/* FIXME HBB 20010705: nowhere in the source is `invbreak' ever set to
|
|
|
|
* a value other than the (silent) initialization to zero. Pretty
|
|
|
|
* useless, that looks */
|
2023-08-04 13:49:03 -04:00
|
|
|
int invbreak;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
|
|
|
|
2023-08-04 13:49:03 -04:00
|
|
|
static int boolready(void);
|
|
|
|
static int invnewterm(void);
|
|
|
|
static void invstep(INVCONTROL *invcntl);
|
|
|
|
static void invcannotalloc(unsigned n);
|
|
|
|
static void invcannotopen(char *file);
|
|
|
|
static void invcannotwrite(char *file);
|
2023-07-27 14:04:50 -04:00
|
|
|
|
|
|
|
#if STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
int showzipf; /* show postings per term distribution */
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
|
|
|
|
2023-08-04 13:49:03 -04:00
|
|
|
static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
|
|
|
|
static unsigned int setsize1, setsize2;
|
|
|
|
static long numitems, totterm, zerolong;
|
|
|
|
static char *indexfile, *postingfile;
|
|
|
|
static FILE *outfile, *fpost;
|
|
|
|
static size_t supersize = SUPERINC, supintsize;
|
2023-07-27 14:04:50 -04:00
|
|
|
static unsigned int numpost, numlogblk, amtused, nextpost;
|
|
|
|
static unsigned int lastinblk, numinvitems;
|
2023-08-04 13:49:03 -04:00
|
|
|
static POSTING *POST, *postptr;
|
|
|
|
static unsigned long *SUPINT, *supint, nextsupfing;
|
|
|
|
static char *SUPFING, *supfing;
|
|
|
|
static char thisterm[TERMMAX];
|
2023-07-27 14:04:50 -04:00
|
|
|
typedef union logicalblk {
|
2023-08-04 13:49:03 -04:00
|
|
|
long invblk[BLOCKSIZE / sizeof(long)];
|
|
|
|
char chrblk[BLOCKSIZE];
|
2023-07-27 14:04:50 -04:00
|
|
|
} t_logicalblk;
|
|
|
|
static t_logicalblk logicalblk;
|
|
|
|
|
|
|
|
#if DEBUG || STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
static long totpost;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#if STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
static int zipf[ZIPFSIZE + 1];
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
|
|
|
|
|
|
|
long
|
|
|
|
invmake(char *invname, char *invpost, FILE *infile)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
unsigned char *s;
|
|
|
|
long num;
|
|
|
|
int i;
|
|
|
|
long fileindex = 0; /* initialze, to avoid warning */
|
|
|
|
unsigned postsize = POSTINC * sizeof(*POST);
|
|
|
|
unsigned long *intptr;
|
|
|
|
char line[TERMMAX];
|
|
|
|
long tlong;
|
|
|
|
PARAM param;
|
|
|
|
POSTING posting;
|
|
|
|
char temp[BLOCKSIZE];
|
2023-07-27 14:04:50 -04:00
|
|
|
#if STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
int j;
|
|
|
|
unsigned maxtermlen = 0;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
/* output file */
|
|
|
|
if ((outfile = vpfopen(invname, "w+b")) == NULL) {
|
|
|
|
invcannotopen(invname);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
indexfile = invname;
|
|
|
|
fseek(outfile, BUFSIZ, SEEK_SET);
|
|
|
|
|
|
|
|
/* posting file */
|
|
|
|
if ((fpost = vpfopen(invpost, "wb")) == NULL) {
|
|
|
|
invcannotopen(invpost);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
postingfile = invpost;
|
|
|
|
nextpost = 0;
|
|
|
|
/* get space for the postings list */
|
|
|
|
if ((POST = malloc(postsize)) == NULL) {
|
|
|
|
invcannotalloc(postsize);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
postptr = POST;
|
|
|
|
/* get space for the superfinger (superindex) */
|
|
|
|
if ((SUPFING = malloc(supersize)) == NULL) {
|
|
|
|
invcannotalloc(supersize);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
supfing = SUPFING;
|
|
|
|
/* FIXME HBB: magic number alert (40) */
|
|
|
|
supintsize = supersize / 40u;
|
|
|
|
/* also for the superfinger index */
|
|
|
|
if ((SUPINT = malloc(supintsize * sizeof(*SUPINT))) == NULL) {
|
|
|
|
invcannotalloc(supintsize * sizeof(*SUPINT));
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
supint = SUPINT;
|
|
|
|
supint++; /* leave first term open for a count */
|
|
|
|
/* initialize using an empty term */
|
|
|
|
strcpy(thisterm, "");
|
|
|
|
*supint++ = 0;
|
|
|
|
*supfing++ = ' ';
|
|
|
|
*supfing++ = '\0';
|
|
|
|
nextsupfing = 2;
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG || STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
totpost = 0L;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
totterm = 0L;
|
|
|
|
numpost = 1;
|
|
|
|
|
|
|
|
/* set up as though a block had come and gone, i.e., set up for new block */
|
|
|
|
/* 3 longs needed for: numinvitems, next block, and previous block */
|
|
|
|
amtused = 3 * sizeof(long);
|
|
|
|
numinvitems = 0;
|
|
|
|
numlogblk = 0;
|
|
|
|
lastinblk = sizeof(t_logicalblk);
|
|
|
|
|
|
|
|
/* now loop as long as more to read (till eof) */
|
|
|
|
while (fgets(line, TERMMAX, infile) != NULL) {
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG || STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
++totpost;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
s = strchr(line, SEP);
|
|
|
|
if (s != NULL) {
|
|
|
|
*s = '\0';
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
continue;
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
#if STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
if ((i = strlen(line)) > maxtermlen) {
|
|
|
|
maxtermlen = i;
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
|
|
|
#if DEBUG
|
2023-08-04 13:49:03 -04:00
|
|
|
printf("%ld: %s ", totpost, line);
|
|
|
|
fflush(stdout);
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
if (strcmp(thisterm, line) == 0) {
|
|
|
|
if ((postptr + 10) > (POST + (postsize / sizeof(*POST)))) {
|
|
|
|
i = postptr - POST;
|
|
|
|
postsize += POSTINC * sizeof(*POST);
|
|
|
|
if ((POST = realloc(POST, postsize)) == NULL) {
|
|
|
|
invcannotalloc(postsize);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
postptr = i + POST;
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG
|
2023-08-04 13:49:03 -04:00
|
|
|
printf("reallocated post space to %u, totpost=%ld\n",
|
|
|
|
postsize, totpost);
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
}
|
|
|
|
numpost++;
|
|
|
|
} else {
|
|
|
|
/* have a new term */
|
|
|
|
if (!invnewterm()) {
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
strcpy(thisterm, line);
|
|
|
|
numpost = 1;
|
|
|
|
postptr = POST;
|
|
|
|
fileindex = 0;
|
|
|
|
}
|
|
|
|
/* get the new posting */
|
|
|
|
num = *++s - '!';
|
|
|
|
i = 1;
|
|
|
|
do {
|
|
|
|
num = BASE * num + *++s - '!';
|
|
|
|
} while (++i < PRECISION);
|
|
|
|
posting.lineoffset = num;
|
|
|
|
while (++fileindex < nsrcfiles && num > srcoffset[fileindex]) {
|
|
|
|
;
|
|
|
|
}
|
|
|
|
posting.fileindex = --fileindex;
|
|
|
|
posting.type = *++s;
|
|
|
|
++s;
|
|
|
|
if (*s != '\n') {
|
|
|
|
num = *++s - '!';
|
|
|
|
while (*++s != '\n') {
|
|
|
|
num = BASE * num + *s - '!';
|
|
|
|
}
|
|
|
|
posting.fcnoffset = num;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
posting.fcnoffset = 0;
|
|
|
|
}
|
|
|
|
*postptr++ = posting;
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG
|
2023-08-04 13:49:03 -04:00
|
|
|
printf("%ld %ld %ld %ld\n", posting.fileindex,
|
|
|
|
posting.fcnoffset, posting.lineoffset, posting.type);
|
|
|
|
fflush(stdout);
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
}
|
|
|
|
if (!invnewterm()) {
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
/* now clean up final block */
|
|
|
|
logicalblk.invblk[0] = numinvitems;
|
|
|
|
/* loops pointer around to start */
|
|
|
|
logicalblk.invblk[1] = 0;
|
|
|
|
logicalblk.invblk[2] = numlogblk - 1;
|
|
|
|
if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
numlogblk++;
|
|
|
|
/* write out block to save space. what in it doesn't matter */
|
|
|
|
if (fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) {
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
/* finish up the super finger */
|
|
|
|
*SUPINT = numlogblk;
|
|
|
|
/* add to the offsets the size of the offset pointers */
|
|
|
|
intptr = (SUPINT + 1);
|
|
|
|
i = (char *)supint - (char *)SUPINT;
|
|
|
|
while (intptr < supint)
|
|
|
|
*intptr++ += i;
|
|
|
|
/* write out the offsets (1 for the N at start) and the super finger */
|
|
|
|
if (fwrite(SUPINT, sizeof(*SUPINT), numlogblk + 1, outfile) == 0 ||
|
|
|
|
fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
/* save the size for reference later */
|
|
|
|
nextsupfing = sizeof(long) + sizeof(long) * numlogblk + (supfing - SUPFING);
|
2023-08-04 14:34:51 -04:00
|
|
|
/* make sure the file ends at a logical block boundary. This is
|
|
|
|
necessary for invinsert to correctly create extended blocks
|
2023-08-04 13:49:03 -04:00
|
|
|
*/
|
|
|
|
i = nextsupfing % sizeof(t_logicalblk);
|
|
|
|
/* write out junk to fill log blk */
|
|
|
|
if (fwrite(temp, sizeof(t_logicalblk) - i, 1, outfile) == 0 ||
|
|
|
|
fflush(outfile) == EOF) { /* rewind doesn't check for write failure */
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
/* write the control area */
|
|
|
|
rewind(outfile);
|
|
|
|
param.version = FMTVERSION;
|
|
|
|
param.filestat = 0;
|
|
|
|
param.sizeblk = sizeof(t_logicalblk);
|
|
|
|
param.startbyte = (numlogblk + 1) * sizeof(t_logicalblk) + BUFSIZ;;
|
|
|
|
param.supsize = nextsupfing;
|
|
|
|
param.cntlsize = BUFSIZ;
|
|
|
|
param.share = 0;
|
|
|
|
if (fwrite(¶m, sizeof(param), 1, outfile) == 0) {
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
for (i = 0; i < 10; i++) /* for future use */
|
|
|
|
if (fwrite(&zerolong, sizeof(zerolong), 1, outfile) == 0) {
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* make first block loop backwards to last block */
|
|
|
|
if (fflush(outfile) == EOF) { /* fseek doesn't check for write failure */
|
|
|
|
goto cannotwrite;
|
|
|
|
}
|
|
|
|
/* get to second word first block */
|
|
|
|
fseek(outfile, BUFSIZ + 2 * sizeof(long), SEEK_SET);
|
|
|
|
tlong = numlogblk - 1;
|
|
|
|
if (fwrite(&tlong, sizeof(tlong), 1, outfile) == 0 ||
|
|
|
|
fclose(outfile) == EOF) {
|
|
|
|
cannotwrite:
|
|
|
|
invcannotwrite(invname);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
if (fclose(fpost) == EOF) {
|
|
|
|
invcannotwrite(postingfile);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
--totterm; /* don't count null term */
|
2023-07-27 14:04:50 -04:00
|
|
|
#if STATS
|
2023-08-04 13:49:03 -04:00
|
|
|
printf("logical blocks = %d, postings = %ld, terms = %ld, max term length = %d\n",
|
|
|
|
numlogblk, totpost, totterm, maxtermlen);
|
|
|
|
if (showzipf) {
|
|
|
|
printf("\n************* ZIPF curve ****************\n");
|
|
|
|
for (j = ZIPFSIZE; j > 1; j--)
|
2023-08-04 14:34:51 -04:00
|
|
|
if (zipf[j])
|
2023-08-04 13:49:03 -04:00
|
|
|
break;
|
|
|
|
for (i = 1; i < j; ++i) {
|
|
|
|
printf("%3d -%6d ", i, zipf[i]);
|
|
|
|
if (i % 6 == 0) putchar('\n');
|
|
|
|
}
|
|
|
|
printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
/* free all malloc'd memory */
|
|
|
|
free(POST);
|
|
|
|
free(SUPFING);
|
|
|
|
free(SUPINT);
|
|
|
|
return(totterm);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/* add a term to the data base */
|
|
|
|
|
|
|
|
static int
|
|
|
|
invnewterm(void)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
int backupflag, i, j, holditems, gooditems, howfar;
|
2023-07-27 14:04:50 -04:00
|
|
|
unsigned int maxback, len, numwilluse, wdlen;
|
2023-08-04 13:49:03 -04:00
|
|
|
char *tptr, *tptr3;
|
2023-07-27 14:04:50 -04:00
|
|
|
|
|
|
|
union {
|
2023-08-04 13:49:03 -04:00
|
|
|
unsigned long packword[2];
|
|
|
|
ENTRY e;
|
2023-07-27 14:04:50 -04:00
|
|
|
} iteminfo;
|
|
|
|
|
2023-08-04 13:49:03 -04:00
|
|
|
gooditems = 0; /* initialize, to avoid warning */
|
2023-07-27 14:04:50 -04:00
|
|
|
totterm++;
|
|
|
|
#if STATS
|
|
|
|
/* keep zipfian info on the distribution */
|
|
|
|
if (numpost <= ZIPFSIZE)
|
2023-08-04 13:49:03 -04:00
|
|
|
zipf[numpost]++;
|
2023-07-27 14:04:50 -04:00
|
|
|
else
|
2023-08-04 13:49:03 -04:00
|
|
|
zipf[0]++;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
|
|
|
len = strlen(thisterm);
|
|
|
|
/* length of term rounded up to long boundary */
|
|
|
|
wdlen = (len + (sizeof(long) - 1)) / sizeof(long);
|
|
|
|
/* each term needs 2 longs for its iteminfo and
|
|
|
|
* 1 long for its offset */
|
|
|
|
numwilluse = (wdlen + 3) * sizeof(long);
|
|
|
|
/* new block if at least 1 item in block */
|
|
|
|
if (numinvitems && numwilluse + amtused > sizeof(t_logicalblk)) {
|
2023-08-04 13:49:03 -04:00
|
|
|
/* set up new block */
|
|
|
|
if (supfing + 500u > SUPFING + supersize) {
|
|
|
|
i = supfing - SUPFING;
|
|
|
|
supersize += 20000u;
|
|
|
|
if ((SUPFING = realloc(SUPFING, supersize)) == NULL) {
|
|
|
|
invcannotalloc(supersize);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
supfing = i + SUPFING;
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG
|
2023-08-04 14:34:51 -04:00
|
|
|
printf("reallocated superfinger space to %d, totpost=%ld\n",
|
2023-08-04 13:49:03 -04:00
|
|
|
supersize, totpost);
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
}
|
|
|
|
/* check that room for the offset as well */
|
|
|
|
/* FIXME HBB: magic number alert (10) */
|
|
|
|
if ((numlogblk + 10) > supintsize) {
|
|
|
|
i = supint - SUPINT;
|
|
|
|
supintsize += SUPERINC;
|
|
|
|
if ((SUPINT = realloc(SUPINT, supintsize * sizeof(*SUPINT))) == NULL) {
|
|
|
|
invcannotalloc(supintsize * sizeof(*SUPINT));
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
supint = i + SUPINT;
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG
|
2023-08-04 13:49:03 -04:00
|
|
|
printf("reallocated superfinger offset to %d, totpost = %ld\n", supintsize * sizeof(*SUPINT), totpost);
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
}
|
|
|
|
/* See if backup is efficatious */
|
|
|
|
backupflag = 0;
|
|
|
|
maxback = (int) strlen(thisterm) / 10;
|
|
|
|
holditems = numinvitems;
|
|
|
|
if (maxback > numinvitems)
|
|
|
|
maxback = numinvitems - 2;
|
|
|
|
howfar = 0;
|
|
|
|
while (maxback-- > 1) {
|
|
|
|
howfar++;
|
|
|
|
iteminfo.packword[0] =
|
|
|
|
logicalblk.invblk[--holditems * 2 + (sizeof(long) - 1)];
|
|
|
|
if ((i = iteminfo.e.size / 10) < maxback) {
|
|
|
|
maxback = i;
|
|
|
|
backupflag = howfar;
|
|
|
|
gooditems = holditems;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* see if backup will occur */
|
|
|
|
if (backupflag) {
|
|
|
|
numinvitems = gooditems;
|
|
|
|
}
|
|
|
|
logicalblk.invblk[0] = numinvitems;
|
|
|
|
/* set forward pointer pointing to next */
|
2023-08-04 14:34:51 -04:00
|
|
|
logicalblk.invblk[1] = numlogblk + 1;
|
2023-08-04 13:49:03 -04:00
|
|
|
/* set back pointer to last block */
|
|
|
|
logicalblk.invblk[2] = numlogblk - 1;
|
|
|
|
if (fwrite(logicalblk.chrblk, 1, sizeof(t_logicalblk), outfile) == 0) {
|
|
|
|
invcannotwrite(indexfile);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
/* 3 longs needed for: numinvitems, next block, and previous block */
|
|
|
|
amtused = 3 * sizeof(long);
|
|
|
|
numlogblk++;
|
|
|
|
/* check if had to back up, if so do it */
|
|
|
|
if (backupflag) {
|
|
|
|
char *tptr2;
|
2023-08-04 14:34:51 -04:00
|
|
|
|
2023-08-04 13:49:03 -04:00
|
|
|
/* find out where the end of the new block is */
|
|
|
|
iteminfo.packword[0] = logicalblk.invblk[numinvitems*2+1];
|
|
|
|
tptr3 = logicalblk.chrblk + iteminfo.e.offset;
|
|
|
|
/* move the index for this block */
|
|
|
|
for (i = 3; i <= (backupflag * 2 + 2); i++)
|
|
|
|
logicalblk.invblk[i] = logicalblk.invblk[numinvitems*2+i];
|
|
|
|
/* move the word into the super index */
|
|
|
|
iteminfo.packword[0] = logicalblk.invblk[3];
|
|
|
|
iteminfo.packword[1] = logicalblk.invblk[4];
|
|
|
|
tptr2 = logicalblk.chrblk + iteminfo.e.offset;
|
|
|
|
strncpy(supfing, tptr2, (int) iteminfo.e.size);
|
|
|
|
*(supfing + iteminfo.e.size) = '\0';
|
2023-07-27 14:04:50 -04:00
|
|
|
#if DEBUG
|
2023-08-04 13:49:03 -04:00
|
|
|
printf("backup %d at term=%s to term=%s\n",
|
|
|
|
backupflag, thisterm, supfing);
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
*supint++ = nextsupfing;
|
|
|
|
nextsupfing += strlen(supfing) + 1;
|
|
|
|
supfing += strlen(supfing) + 1;
|
|
|
|
/* now fix up the logical block */
|
|
|
|
tptr = logicalblk.chrblk + lastinblk;
|
|
|
|
lastinblk = sizeof(t_logicalblk);
|
|
|
|
tptr2 = logicalblk.chrblk + lastinblk;
|
|
|
|
j = tptr3 - tptr;
|
|
|
|
while (tptr3 > tptr)
|
|
|
|
*--tptr2 = *--tptr3;
|
|
|
|
lastinblk -= j;
|
|
|
|
amtused += ((2 * sizeof(long)) * backupflag + j);
|
|
|
|
for (i = 3; i < (backupflag * 2 + 2); i += 2) {
|
|
|
|
iteminfo.packword[0] = logicalblk.invblk[i];
|
|
|
|
iteminfo.e.offset += (tptr2 - tptr3);
|
|
|
|
logicalblk.invblk[i] = iteminfo.packword[0];
|
|
|
|
}
|
|
|
|
numinvitems = backupflag;
|
|
|
|
} else { /* no backup needed */
|
|
|
|
numinvitems = 0;
|
|
|
|
lastinblk = sizeof(t_logicalblk);
|
|
|
|
/* add new term to superindex */
|
|
|
|
strcpy(supfing, thisterm);
|
|
|
|
supfing += strlen(thisterm) + 1;
|
|
|
|
*supint++ = nextsupfing;
|
|
|
|
nextsupfing += strlen(thisterm) + 1;
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
/* HBB 20010501: Fixed bug by replacing magic number '8' by
|
|
|
|
* what it actually represents. */
|
|
|
|
lastinblk -= (numwilluse - 2 * sizeof(long));
|
|
|
|
iteminfo.e.offset = lastinblk;
|
|
|
|
iteminfo.e.size = len;
|
|
|
|
iteminfo.e.space = 0;
|
|
|
|
iteminfo.e.post = numpost;
|
|
|
|
strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
|
|
|
|
amtused += numwilluse;
|
|
|
|
logicalblk.invblk[(lastinblk/sizeof(long))+wdlen] = nextpost;
|
|
|
|
if ((i = postptr - POST) > 0) {
|
2023-08-04 13:49:03 -04:00
|
|
|
if (fwrite(POST, sizeof(*POST), i, fpost) == 0) {
|
|
|
|
invcannotwrite(postingfile);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
nextpost += i * sizeof(*POST);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
logicalblk.invblk[3+2*numinvitems++] = iteminfo.packword[0];
|
|
|
|
logicalblk.invblk[2+2*numinvitems] = iteminfo.packword[1];
|
|
|
|
return(1);
|
|
|
|
}
|
|
|
|
|
2023-08-04 14:34:51 -04:00
|
|
|
/*
 * If 'invname' ends with the 'from' substring, it is replaced inline with the
 * 'to' substring (which must be of the exact same length), and the function
 * returns 0.  Otherwise, 'invname' is left untouched and the function
 * returns -1.
 *
 * An empty 'from' matches the end of any name: nothing is changed and 0 is
 * returned.
 */
static int
invflipname(char * invname, const char *from, const char *to)
{
    size_t namelen, fliplen, k;
    char *tail;

    assert(strlen(from) == strlen(to));

    namelen = strlen(invname);
    fliplen = strlen(from);
    if (fliplen > namelen)
        return -1;

    /* only a match at the very end of the name counts */
    tail = invname + (namelen - fliplen);
    if (strcmp(tail, from) != 0)
        return -1;

    /* never write past the matched suffix, nor read past the end of 'to' */
    for (k = 0; k < fliplen && to[k] != '\0'; k++)
        tail[k] = to[k];
    return 0;
}
|
|
|
|
|
|
|
|
/* Open 'name' in binary mode through vpfopen(): read-only when 'stat' is
 * zero, read/update otherwise.  Returns whatever vpfopen() returns. */
static FILE *
open_for_reading(char *name, int stat)
{
    const char *mode;

    if (stat == 0)
        mode = "rb";
    else
        mode = "r+b";
    return vpfopen(name, mode);
}
|
|
|
|
|
|
|
|
/* Open a file under a possibly "flipped" name.
 *
 * If the db was created without '-f', but cscope is now invoked with
 * '-f cscope.out', we need to check for 'cscope.in.out', rather than
 * 'cscope.out.in': i.e., hack around our own violation of the inverse db
 * naming convention.
 *
 * More silliness: if you create the db with '-f cscope', then try to open
 * it without '-f cscope', you'll fail unless we check for 'cscope.out.in'
 * here.
 *
 * 'name' is rewritten in place: its 'flip_in' ending becomes 'flip_out'.
 * It stays rewritten when the open succeeds and is restored when it fails.
 * Returns the opened stream, or a null pointer when 'name' does not end in
 * 'flip_in' or the flipped name cannot be opened. */
static FILE *
open_file_with_flipped_name(char *name, const char *flip_in, const char *flip_out, int stat)
{
    FILE *stream;

    /* nothing to flip: the name does not end with flip_in */
    if (invflipname(name, flip_in, flip_out) != 0)
        return NULL;

    stream = open_for_reading(name, stat);
    if (stream == NULL) {
        /* flip back for error message */
        invflipname(name, flip_out, flip_in);
    }
    return stream;
}
|
|
|
|
|
|
|
|
/* Try to open 'name' as given; failing that, with a trailing 'flip2'
 * replaced by 'flip1'; failing that, with a trailing 'flip1' replaced by
 * 'flip2'.  Returns the first stream that opens, or a null pointer. */
static FILE *
open_file_with_possibly_flipped_name(char *name, const char *flip1, const char *flip2, int stat)
{
    FILE *stream;

    stream = open_for_reading(name, stat);
    if (stream != NULL)
        return stream;

    stream = open_file_with_flipped_name(name, flip2, flip1, stat);
    if (stream != NULL)
        return stream;

    return open_file_with_flipped_name(name, flip1, flip2, stat);
}
|
|
|
|
|
|
|
|
int
|
|
|
|
invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
int read_index;
|
|
|
|
|
|
|
|
invcntl->invfile = open_file_with_possibly_flipped_name(invname, INVNAME, INVNAME2, stat);
|
|
|
|
if (! invcntl->invfile) {
|
|
|
|
invcannotopen(invname);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
if (fread(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile) == 0) {
|
|
|
|
fprintf(stderr, "%s: empty inverted file\n", argv0);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
if (invcntl->param.version != FMTVERSION) {
|
|
|
|
fprintf(stderr, "%s: cannot read old index format; use -U option to force database to rebuild\n", argv0);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
assert(invcntl->param.sizeblk == sizeof(t_logicalblk));
|
|
|
|
|
|
|
|
if (stat == 0 && invcntl->param.filestat == INVALONE) {
|
|
|
|
fprintf(stderr, "%s: inverted file is locked\n", argv0);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
invcntl->postfile = open_file_with_possibly_flipped_name(invpost, INVPOST, INVPOST2, stat);
|
|
|
|
if (! invcntl->postfile) {
|
|
|
|
invcannotopen(invpost);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* allocate core for a logical block */
|
|
|
|
if ((invcntl->logblk = malloc((size_t) invcntl->param.sizeblk)) == NULL) {
|
|
|
|
invcannotalloc((size_t) invcntl->param.sizeblk);
|
|
|
|
fclose(invcntl->postfile);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
/* allocate for and read in superfinger */
|
|
|
|
read_index = 1;
|
|
|
|
invcntl->iindex = NULL;
|
2023-07-27 14:04:50 -04:00
|
|
|
#if SHARE
|
2023-08-04 13:49:03 -04:00
|
|
|
if (invcntl->param.share == 1) {
|
|
|
|
key_t shm_key;
|
|
|
|
struct shmid_ds shm_buf;
|
|
|
|
int shm_id;
|
|
|
|
|
|
|
|
/* see if the shared segment exists */
|
|
|
|
shm_key = ftok(invname, 2);
|
|
|
|
shm_id = shmget(shm_key, 0, 0);
|
|
|
|
/* Failure simply means (hopefully) that segment doesn't exists */
|
|
|
|
if (shm_id == -1) {
|
|
|
|
/* Have to give general write permission due to AMdahl not having protected segments */
|
|
|
|
shm_id = shmget(shm_key, invcntl->param.supsize + sizeof(long), IPC_CREAT | 0666);
|
|
|
|
if (shm_id == -1)
|
|
|
|
perror("Could not create shared memory segment");
|
|
|
|
} else
|
|
|
|
read_index = 0;
|
|
|
|
|
|
|
|
if (shm_id != -1) {
|
|
|
|
invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY));
|
|
|
|
if (invcntl->iindex == (char *)ERR) {
|
|
|
|
fprintf(stderr, "%s: shared memory link failed\n", argv0);
|
|
|
|
invcntl->iindex = NULL;
|
|
|
|
read_index = 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
if (invcntl->iindex == NULL)
|
|
|
|
/* FIXME HBB: magic number alert (4, sizeof(long)) */
|
|
|
|
invcntl->iindex = malloc((size_t) invcntl->param.supsize + 4 *sizeof(long));
|
|
|
|
if (invcntl->iindex == NULL) {
|
|
|
|
invcannotalloc((size_t) invcntl->param.supsize);
|
|
|
|
free(invcntl->logblk);
|
|
|
|
fclose(invcntl->postfile);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
if (read_index) {
|
|
|
|
fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
|
|
|
|
fread(invcntl->iindex, (int) invcntl->param.supsize, 1,
|
|
|
|
invcntl->invfile);
|
|
|
|
}
|
|
|
|
invcntl->numblk = -1;
|
|
|
|
if (boolready() == -1) {
|
|
|
|
fclose(invcntl->postfile);
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
/* write back out the control block if anything changed */
|
|
|
|
invcntl->param.filestat = stat;
|
|
|
|
if (stat > invcntl->param.filestat ) {
|
|
|
|
rewind(invcntl->invfile);
|
|
|
|
fwrite(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile);
|
|
|
|
}
|
|
|
|
return(1);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/** invclose must be called to wrap things up and deallocate core **/
|
|
|
|
void
|
|
|
|
invclose(INVCONTROL *invcntl)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
/* write out the control block in case anything changed */
|
|
|
|
if (invcntl->param.filestat > 0) {
|
|
|
|
invcntl->param.filestat = 0;
|
|
|
|
rewind(invcntl->invfile);
|
|
|
|
fwrite(&invcntl->param, 1,
|
|
|
|
sizeof(invcntl->param), invcntl->invfile);
|
|
|
|
}
|
|
|
|
if (invcntl->param.filestat == INVALONE) {
|
|
|
|
/* write out the super finger */
|
|
|
|
fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
|
|
|
|
fwrite(invcntl->iindex, 1,
|
|
|
|
(int) invcntl->param.supsize, invcntl->invfile);
|
|
|
|
}
|
|
|
|
fclose(invcntl->invfile);
|
|
|
|
fclose(invcntl->postfile);
|
2023-07-27 14:04:50 -04:00
|
|
|
#if SHARE
|
2023-08-04 13:49:03 -04:00
|
|
|
if (invcntl->param.share > 0) {
|
|
|
|
shmdt(invcntl->iindex);
|
|
|
|
invcntl->iindex = NULL;
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
if (invcntl->iindex != NULL)
|
|
|
|
free(invcntl->iindex);
|
|
|
|
free(invcntl->logblk);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/** invstep steps the inverted file forward one item **/
|
|
|
|
static void
|
|
|
|
invstep(INVCONTROL *invcntl)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
if (invcntl->keypnt < (invcntl->logblk->invblk[0] - 1)) {
|
2023-08-04 14:34:51 -04:00
|
|
|
invcntl->keypnt++;
|
2023-08-04 13:49:03 -04:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* move forward a block else wrap */
|
2023-08-04 14:34:51 -04:00
|
|
|
invcntl->numblk = invcntl->logblk->invblk[1]; /* was: *(int *)(invcntl->logblk + sizeof(long))*/
|
2023-08-04 13:49:03 -04:00
|
|
|
|
|
|
|
/* now read in the block */
|
|
|
|
fseek(invcntl->invfile,
|
|
|
|
invcntl->numblk*invcntl->param.sizeblk + invcntl->param.cntlsize,
|
|
|
|
SEEK_SET);
|
|
|
|
fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
|
2023-08-04 14:34:51 -04:00
|
|
|
invcntl->invfile);
|
|
|
|
invcntl->keypnt = 0;
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/** invforward moves forward one term in the inverted file **/
|
|
|
|
int
|
|
|
|
invforward(INVCONTROL *invcntl)
|
|
|
|
{
|
2023-08-04 14:34:51 -04:00
|
|
|
invstep(invcntl);
|
2023-08-04 13:49:03 -04:00
|
|
|
/* skip things with 0 postings */
|
|
|
|
/* FIXME HBB: magic number alert! (3) */
|
|
|
|
while (((ENTRY * )(invcntl->logblk->invblk + 3) + invcntl->keypnt)->post == 0) {
|
2023-08-04 14:34:51 -04:00
|
|
|
invstep(invcntl);
|
2023-08-04 13:49:03 -04:00
|
|
|
}
|
|
|
|
/* Check for having wrapped - reached start of inverted file! */
|
|
|
|
if ((invcntl->numblk == 0) && (invcntl->keypnt == 0))
|
|
|
|
return(0);
|
|
|
|
return(1);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/** invterm gets the present term from the present logical block **/
|
|
|
|
long
|
|
|
|
invterm(INVCONTROL *invcntl, char *term)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
ENTRY * entryptr;
|
|
|
|
|
|
|
|
/* FIXME HBB: magic number alert! (3) */
|
|
|
|
entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
|
|
|
|
strncpy(term, invcntl->logblk->chrblk + entryptr->offset,
|
|
|
|
(int) entryptr->size);
|
|
|
|
*(term + entryptr->size) = '\0';
|
|
|
|
return(entryptr->post);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
/** invfind searches for an individual item in the inverted file **/
|
|
|
|
long
|
|
|
|
invfind(INVCONTROL *invcntl, char *searchterm) /* term being searched for */
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
int imid, ilow, ihigh;
|
|
|
|
long num;
|
|
|
|
int i;
|
|
|
|
unsigned long *intptr, *intptr2;
|
|
|
|
ENTRY *entryptr;
|
|
|
|
|
|
|
|
/* make sure it is initialized via invready */
|
|
|
|
if (invcntl->invfile == 0)
|
|
|
|
return(-1L);
|
|
|
|
|
|
|
|
/* now search for the appropriate finger block */
|
|
|
|
intptr = (unsigned long *)invcntl->iindex;
|
|
|
|
|
|
|
|
ilow = 0;
|
|
|
|
ihigh = *intptr++ - 1;
|
|
|
|
while (ilow <= ihigh) {
|
|
|
|
imid = (ilow + ihigh) / 2;
|
|
|
|
intptr2 = intptr + imid;
|
|
|
|
i = strcmp(searchterm, (invcntl->iindex + *intptr2));
|
|
|
|
if (i < 0)
|
|
|
|
ihigh = imid - 1;
|
|
|
|
else if (i > 0)
|
|
|
|
ilow = ++imid;
|
|
|
|
else {
|
|
|
|
ilow = imid + 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* be careful about case where searchterm is after last in this block */
|
|
|
|
imid = (ilow) ? ilow - 1 : 0;
|
|
|
|
|
|
|
|
/* fetch the appropriate logical block if not in core */
|
|
|
|
/* note always fetch it if the file is busy */
|
|
|
|
if ((imid != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) {
|
|
|
|
fseek(invcntl->invfile,
|
|
|
|
(imid*invcntl->param.sizeblk) + invcntl->param.cntlsize,
|
|
|
|
SEEK_SET);
|
|
|
|
invcntl->numblk = imid;
|
|
|
|
fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1,
|
|
|
|
invcntl->invfile);
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
|
|
|
|
srch_ext:
|
2023-08-04 13:49:03 -04:00
|
|
|
/* now find the term in this block. tricky this */
|
|
|
|
intptr = (unsigned long *) invcntl->logblk->invblk;
|
|
|
|
|
|
|
|
ilow = 0;
|
|
|
|
ihigh = *intptr - 1;
|
|
|
|
intptr += 3;
|
|
|
|
num = 0;
|
|
|
|
while (ilow <= ihigh) {
|
|
|
|
imid = (ilow + ihigh) / 2;
|
|
|
|
entryptr = (ENTRY *)intptr + imid;
|
|
|
|
i = strncmp(searchterm, invcntl->logblk->chrblk + entryptr->offset,
|
|
|
|
(int) entryptr->size );
|
|
|
|
if (i == 0)
|
|
|
|
i = strlen(searchterm) - entryptr->size;
|
|
|
|
if (i < 0)
|
|
|
|
ihigh = imid - 1;
|
|
|
|
else if (i > 0)
|
|
|
|
ilow = ++imid;
|
|
|
|
else {
|
|
|
|
num = entryptr->post;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* be careful about case where searchterm is after last in this block */
|
|
|
|
if (imid >= invcntl->logblk->invblk[0]) {
|
|
|
|
invcntl->keypnt = invcntl->logblk->invblk[0];
|
|
|
|
invstep(invcntl);
|
|
|
|
/* note if this happens the term could be in extended block */
|
|
|
|
if (invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk)
|
|
|
|
goto srch_ext;
|
|
|
|
} else
|
|
|
|
invcntl->keypnt = imid;
|
|
|
|
return(num);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
#if DEBUG

/** invdump prints a debugging dump to stdout.
 *
 *  The form of `term` selects what is dumped:
 *    "-N"   the in-core index, starting at slot N, and nothing else;
 *    "#N"   logical block N, which is read from the inverted file first;
 *    other  the block that invfind() positions on for that term.
 *
 *  Dump loops stop early once the global `invbreak` becomes non-zero.
 **/
void
invdump(INVCONTROL *invcntl, char *term)
{
	/* i is only assigned on the invfind() path; start it at 0 so the
	 * "#N" path does not print an indeterminate posting count */
	long	i = 0, j, n, *longptr;
	ENTRY	*entryptr;
	char	temp[512], *ptr;

	/* dump superindex if term is "-" */
	if (*term == '-') {
		j = atoi(term + 1);
		longptr = (long *) invcntl->iindex;
		n = *longptr++;
		printf("Superindex dump, num blocks=%ld\n", n);
		longptr += j;
		while ((longptr <= ((long *) invcntl->iindex) + n) && invbreak == 0) {
			printf("%2ld  %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr);
			longptr++;
		}
		return;
	} else if (*term == '#') {
		j = atoi(term + 1);
		/* fetch the appropriate logical block */
		invcntl->numblk = j;
		fseek(invcntl->invfile,
		      (j * invcntl->param.sizeblk) + invcntl->param.cntlsize,
		      SEEK_SET);
		fread(invcntl->logblk, (int) invcntl->param.sizeblk, 1,
		      invcntl->invfile);
	} else
		i = abs((int) invfind(invcntl, term));

	/* block header: word 0 = entry count, words 1 and 2 are printed below
	 * as forward and back pointers */
	longptr = invcntl->logblk->invblk;
	n = *longptr++;
	printf("Entry term to invdump=%s, postings=%ld, forwrd ptr=%ld, back ptr=%ld\n"
	       , term, i, *(longptr), *(longptr + 1));
	/* FIXME HBB: magic number alert! (3) */
	entryptr = (ENTRY *) (invcntl->logblk->invblk + 3);
	printf("%ld terms in this block, block=%ld\n", n, (long) invcntl->numblk);
	printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
	for (j = 0; j < n && invbreak == 0; j++) {
		ptr = invcntl->logblk->chrblk + entryptr->offset;
		strncpy(temp, ptr, (int) entryptr->size);
		temp[entryptr->size] = '\0';
		/* skip the term text, rounded up to a whole number of longs */
		ptr += (sizeof(long) * (long)((entryptr->size + (sizeof(long) - 1)) / sizeof(long)));
		/* The ENTRY field types are declared elsewhere; the casts make
		 * every argument match its conversion specifier regardless. */
		printf("%2ld  %-24s\t%5ld\t%3d\t%ld\t%d\t%ld\n", j, temp,
		       (long) entryptr->post,
		       (int) entryptr->size, (long) entryptr->offset,
		       (int) entryptr->space,
		       *(long *) ptr);
		entryptr++;
	}
}
#endif
|
|
|
|
|
|
|
|
static int
|
|
|
|
boolready(void)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
numitems = 0;
|
2023-08-04 14:34:51 -04:00
|
|
|
if (item1 != NULL)
|
2023-08-04 13:49:03 -04:00
|
|
|
free(item1);
|
|
|
|
setsize1 = SETINC;
|
|
|
|
if ((item1 = malloc(SETINC * sizeof(*item1))) == NULL) {
|
|
|
|
invcannotalloc(SETINC);
|
|
|
|
return(-1);
|
|
|
|
}
|
2023-08-04 14:34:51 -04:00
|
|
|
if (item2 != NULL)
|
2023-08-04 13:49:03 -04:00
|
|
|
free(item2);
|
|
|
|
setsize2 = SETINC;
|
|
|
|
if ((item2 = malloc(SETINC * sizeof(*item2))) == NULL) {
|
|
|
|
invcannotalloc(SETINC);
|
|
|
|
return(-1);
|
|
|
|
}
|
|
|
|
item = item1;
|
|
|
|
enditem = item;
|
|
|
|
return(0);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
boolclear(void)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
numitems = 0;
|
|
|
|
item = item1;
|
|
|
|
enditem = item;
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
POSTING *
|
|
|
|
boolfile(INVCONTROL *invcntl, long *num, int boolarg)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
ENTRY *entryptr;
|
|
|
|
FILE *file;
|
|
|
|
void *ptr;
|
|
|
|
unsigned long *ptr2;
|
|
|
|
POSTING *newitem = NULL; /* initialize, to avoid warning */
|
|
|
|
POSTING posting;
|
|
|
|
unsigned u;
|
|
|
|
POSTING *newsetp = NULL, *set1p;
|
|
|
|
long newsetc, set1c, set2c;
|
|
|
|
|
|
|
|
/* FIXME HBB: magic number alert! (3) */
|
|
|
|
entryptr = (ENTRY *) (invcntl->logblk->invblk + 3) + invcntl->keypnt;
|
|
|
|
ptr = invcntl->logblk->chrblk + entryptr->offset;
|
|
|
|
ptr2 = ((unsigned long *) ptr) + (entryptr->size + (sizeof(long) - 1)) / sizeof(long);
|
|
|
|
*num = entryptr->post;
|
|
|
|
switch (boolarg) {
|
2023-08-04 15:09:58 -04:00
|
|
|
case bool_OR:
|
|
|
|
case falseT:
|
2023-08-04 13:49:03 -04:00
|
|
|
if (*num == 0) {
|
|
|
|
*num = numitems;
|
|
|
|
return(item);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* make room for the new set */
|
|
|
|
u = 0;
|
|
|
|
switch (boolarg) {
|
|
|
|
case AND:
|
2023-08-04 15:09:58 -04:00
|
|
|
case falseT:
|
2023-08-04 13:49:03 -04:00
|
|
|
newsetp = item;
|
|
|
|
break;
|
|
|
|
|
2023-08-04 15:09:58 -04:00
|
|
|
case bool_OR:
|
2023-08-04 13:49:03 -04:00
|
|
|
u = enditem - item;
|
|
|
|
/* FALLTHROUGH */
|
2023-08-04 15:09:58 -04:00
|
|
|
case REVERSEfalseT:
|
2023-08-04 13:49:03 -04:00
|
|
|
u += *num;
|
|
|
|
if (item == item2) {
|
|
|
|
if (u > setsize1) {
|
|
|
|
u += SETINC;
|
|
|
|
if ((item1 = realloc(item1, u * sizeof(*item1))) == NULL) {
|
|
|
|
invcannotalloc(u * sizeof(*item1));
|
|
|
|
boolready();
|
|
|
|
*num = -1;
|
|
|
|
return(NULL);
|
|
|
|
}
|
|
|
|
setsize1 = u;
|
|
|
|
}
|
|
|
|
newitem = item1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
if (u > setsize2) {
|
|
|
|
u += SETINC;
|
|
|
|
if ((item2 = realloc(item2, u * sizeof(*item2))) == NULL) {
|
|
|
|
invcannotalloc(u * sizeof(*item2));
|
|
|
|
boolready();
|
|
|
|
*num = -1;
|
|
|
|
return(NULL);
|
|
|
|
}
|
|
|
|
setsize2 = u;
|
|
|
|
}
|
|
|
|
newitem = item2;
|
|
|
|
}
|
2023-07-27 14:04:50 -04:00
|
|
|
#if 0 /* this write is only need by commented-out code later */
|
2023-08-04 13:49:03 -04:00
|
|
|
set1p = item;
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
newsetp = newitem;
|
|
|
|
}
|
|
|
|
file = invcntl->postfile;
|
|
|
|
fseek(file, *ptr2, SEEK_SET);
|
|
|
|
fread(&posting, sizeof(posting), 1, file);
|
|
|
|
newsetc = 0;
|
|
|
|
switch (boolarg) {
|
2023-08-04 15:09:58 -04:00
|
|
|
case bool_OR:
|
2023-08-04 13:49:03 -04:00
|
|
|
/* while something in both sets */
|
|
|
|
set1p = item;
|
|
|
|
newsetp = newitem;
|
|
|
|
for (set1c = 0, set2c = 0;
|
|
|
|
set1c < numitems && set2c < *num; newsetc++) {
|
|
|
|
if (set1p->lineoffset < posting.lineoffset) {
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->lineoffset > posting.lineoffset) {
|
|
|
|
*newsetp++ = posting;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type < posting.type) {
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type > posting.type) {
|
|
|
|
*newsetp++ = posting;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else { /* identical postings */
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
set1c++;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* find out what ran out and move the rest in */
|
|
|
|
if (set1c < numitems) {
|
|
|
|
newsetc += numitems - set1c;
|
|
|
|
while (set1c++ < numitems) {
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
while (set2c++ < *num) {
|
|
|
|
*newsetp++ = posting;
|
|
|
|
newsetc++;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
item = newitem;
|
2023-08-04 15:09:58 -04:00
|
|
|
break; /* end of bool_OR */
|
2023-07-27 14:04:50 -04:00
|
|
|
#if 0
|
2023-08-04 13:49:03 -04:00
|
|
|
case AND:
|
|
|
|
for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
|
|
|
|
if (set1p->lineoffset < posting.lineoffset) {
|
|
|
|
set1p++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->lineoffset > posting.lineoffset) {
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type < posting.type) {
|
|
|
|
*set1p++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type > posting.type) {
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else { /* identical postings */
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
newsetc++;
|
|
|
|
set1c++;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break; /* end of AND */
|
|
|
|
|
2023-08-04 15:09:58 -04:00
|
|
|
case falseT:
|
2023-08-04 13:49:03 -04:00
|
|
|
for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
|
|
|
|
if (set1p->lineoffset < posting.lineoffset) {
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
newsetc++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->lineoffset > posting.lineoffset) {
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type < posting.type) {
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
newsetc++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type > posting.type) {
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else { /* identical postings */
|
|
|
|
set1c++;
|
|
|
|
set1p++;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
newsetc += numitems - set1c;
|
|
|
|
while (set1c++ < numitems) {
|
|
|
|
*newsetp++ = *set1p++;
|
|
|
|
}
|
2023-08-04 15:09:58 -04:00
|
|
|
break; /* end of falseT */
|
2023-08-04 13:49:03 -04:00
|
|
|
|
2023-08-04 15:09:58 -04:00
|
|
|
case REVERSEfalseT: /* core falseT incoming set */
|
2023-08-04 13:49:03 -04:00
|
|
|
for (set1c = 0, set2c = 0; set1c < numitems && set2c < *num; ) {
|
|
|
|
if (set1p->lineoffset < posting.lineoffset) {
|
|
|
|
set1p++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->lineoffset > posting.lineoffset) {
|
|
|
|
*newsetp++ = posting;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type < posting.type) {
|
|
|
|
set1p++;
|
|
|
|
set1c++;
|
|
|
|
}
|
|
|
|
else if (set1p->type > posting.type) {
|
|
|
|
*newsetp++ = posting;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
else { /* identical postings */
|
|
|
|
set1c++;
|
|
|
|
set1p++;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
set2c++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
while (set2c++ < *num) {
|
|
|
|
*newsetp++ = posting;
|
|
|
|
newsetc++;
|
|
|
|
fread(&posting, (int) sizeof(posting), 1, file);
|
|
|
|
}
|
|
|
|
item = newitem;
|
2023-08-04 15:09:58 -04:00
|
|
|
break; /* end of REVERSEfalseT */
|
2023-07-27 14:04:50 -04:00
|
|
|
#endif
|
2023-08-04 13:49:03 -04:00
|
|
|
}
|
|
|
|
numitems = newsetc;
|
|
|
|
*num = newsetc;
|
|
|
|
enditem = (POSTING *) newsetp;
|
|
|
|
return((POSTING *) item);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
#if 0
/*
 * boolsave hands the current working set to the caller (disabled code).
 *
 * clear  non-zero: give away the buffer that holds the set and let
 *        boolready() allocate replacements;
 *        zero: return a freshly malloc'ed copy and keep the set.
 *
 * Returns NULL when the set is empty or the copy cannot be allocated.
 */
POSTING *
boolsave(int clear)		/* flag about whether to clear core */
{
	int	nbytes;
	POSTING	*result;
	POSTING	*src, *dst;

	if (numitems == 0) {
		if (clear)
			boolclear();
		return(NULL);
	}

	/* if clear then give them what we have and use boolready to realloc */
	if (clear) {
		result = item;
		/* forget the buffer being handed over so boolready() does
		 * not free it */
		if (item == item1)
			item1 = NULL;
		else
			item2 = NULL;
		boolready();
		return(result);
	}

	nbytes = (enditem - item) * sizeof(*result) + 100;
	result = malloc(nbytes);
	if (result == NULL) {
		invcannotalloc(nbytes);
		return(result);
	}

	/* move present set into place */
	for (src = item, dst = result; src < enditem; src++, dst++)
		*dst = *src;
	return(result);
}
#endif
|
|
|
|
|
|
|
|
static void
|
|
|
|
invcannotalloc(unsigned n)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
fprintf(stderr, "%s: cannot allocate %u bytes\n", argv0, n);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
invcannotopen(char *file)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
fprintf(stderr, "%s: cannot open file %s\n", argv0, file);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
invcannotwrite(char *file)
|
|
|
|
{
|
2023-08-04 13:49:03 -04:00
|
|
|
perror(argv0); /* must be first to preserve errno */
|
|
|
|
fprintf(stderr, "%s: write to file %s failed\n", argv0, file);
|
2023-07-27 14:04:50 -04:00
|
|
|
}
|