You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1013 lines
29KB

  1. /*===========================================================================
  2. Copyright (c) 1998-2000, The Santa Cruz Operation
  3. All rights reserved.
  4. Redistribution and use in source and binary forms, with or without
  5. modification, are permitted provided that the following conditions are met:
  6. *Redistributions of source code must retain the above copyright notice,
  7. this list of conditions and the following disclaimer.
  8. *Redistributions in binary form must reproduce the above copyright notice,
  9. this list of conditions and the following disclaimer in the documentation
  10. and/or other materials provided with the distribution.
  11. *Neither name of The Santa Cruz Operation nor the names of its contributors
  12. may be used to endorse or promote products derived from this software
  13. without specific prior written permission.
  14. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
  15. IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  16. THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  17. PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
  18. LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  19. CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  20. SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  21. INTERRUPTION)
  22. HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  23. LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  24. OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
  25. DAMAGE.
  26. =========================================================================*/
  27. #include <ctype.h>
  28. #include <stdio.h>
  29. #include <stdlib.h>
  30. #if SHARE
  31. # include <sys/types.h>
  32. # include <sys/ipc.h>
  33. # include <sys/shm.h>
  34. # define ERR -1
  35. #endif
  36. #include "invlib.h"
  37. #include "global.h"
  38. #include <assert.h>
  39. #define DEBUG 0 /* debugging code and realloc messages */
  40. #define BLOCKSIZE 2 * BUFSIZ /* logical block size */
  41. #define POSTINC 10000 /* posting buffer size increment */
  42. #define SEP ' ' /* sorted posting field separator */
  43. #define SETINC 100 /* posting set size increment */
  44. #define STATS 0 /* print statistics */
  45. #define SUPERINC 10000 /* super index size increment */
  46. #define TERMMAX 512 /* term max size */
  47. #define FMTVERSION 1 /* inverted index format version */
  48. #define ZIPFSIZE 200 /* zipf curve size */
  49. #if DEBUG
  50. /* FIXME HBB 20010705: nowhere in the source is `invbreak' ever set to
  51. * a value other than the (silent) initialization to zero. Pretty
  52. * useless, that looks */
  53. int invbreak;
  54. #endif
  55. static int boolready(void);
  56. static int invnewterm(void);
  57. static void invstep(INVCONTROL *invcntl);
  58. static void invcannotalloc(unsigned n);
  59. static void invcannotopen(char *file);
  60. static void invcannotwrite(char *file);
  61. #if STATS
  62. int showzipf; /* show postings per term distribution */
  63. #endif
  64. static POSTING *item, *enditem, *item1 = NULL, *item2 = NULL;
  65. static unsigned int setsize1, setsize2;
  66. static long numitems, totterm, zerolong;
  67. static char *indexfile, *postingfile;
  68. static FILE *outfile, *fpost;
  69. static size_t supersize = SUPERINC, supintsize;
  70. static unsigned int numpost, numlogblk, amtused, nextpost;
  71. static unsigned int lastinblk, numinvitems;
  72. static POSTING *POST, *postptr;
  73. static unsigned long *SUPINT, *supint, nextsupfing;
  74. static char *SUPFING, *supfing;
  75. static char thisterm[TERMMAX];
  76. typedef union logicalblk {
  77. long invblk[BLOCKSIZE / sizeof(long)];
  78. char chrblk[BLOCKSIZE];
  79. } t_logicalblk;
  80. static t_logicalblk logicalblk;
  81. #if DEBUG || STATS
  82. static long totpost;
  83. #endif
  84. #if STATS
  85. static int zipf[ZIPFSIZE + 1];
  86. #endif
  87. long invmake(char *invname, char *invpost, FILE *infile) {
  88. unsigned char *s;
  89. long num;
  90. int i;
  91. long fileindex = 0; /* initialze, to avoid warning */
  92. unsigned postsize = POSTINC * sizeof(*POST);
  93. unsigned long *intptr;
  94. char line[TERMMAX];
  95. long tlong;
  96. PARAM param;
  97. POSTING posting;
  98. char temp[BLOCKSIZE];
  99. #if STATS
  100. int j;
  101. unsigned maxtermlen = 0;
  102. #endif
  103. /* output file */
  104. if((outfile = vpfopen(invname, "w+b")) == NULL) {
  105. invcannotopen(invname);
  106. return (0);
  107. }
  108. indexfile = invname;
  109. fseek(outfile, BUFSIZ, SEEK_SET);
  110. /* posting file */
  111. if((fpost = vpfopen(invpost, "wb")) == NULL) {
  112. invcannotopen(invpost);
  113. return (0);
  114. }
  115. postingfile = invpost;
  116. nextpost = 0;
  117. /* get space for the postings list */
  118. if((POST = malloc(postsize)) == NULL) {
  119. invcannotalloc(postsize);
  120. return (0);
  121. }
  122. postptr = POST;
  123. /* get space for the superfinger (superindex) */
  124. if((SUPFING = malloc(supersize)) == NULL) {
  125. invcannotalloc(supersize);
  126. return (0);
  127. }
  128. supfing = SUPFING;
  129. /* FIXME HBB: magic number alert (40) */
  130. supintsize = supersize / 40u;
  131. /* also for the superfinger index */
  132. if((SUPINT = malloc(supintsize * sizeof(*SUPINT))) == NULL) {
  133. invcannotalloc(supintsize * sizeof(*SUPINT));
  134. return (0);
  135. }
  136. supint = SUPINT;
  137. supint++; /* leave first term open for a count */
  138. /* initialize using an empty term */
  139. strcpy(thisterm, "");
  140. *supint++ = 0;
  141. *supfing++ = ' ';
  142. *supfing++ = '\0';
  143. nextsupfing = 2;
  144. #if DEBUG || STATS
  145. totpost = 0L;
  146. #endif
  147. totterm = 0L;
  148. numpost = 1;
  149. /* set up as though a block had come and gone, i.e., set up for new block */
  150. /* 3 longs needed for: numinvitems, next block, and previous block */
  151. amtused = 3 * sizeof(long);
  152. numinvitems = 0;
  153. numlogblk = 0;
  154. lastinblk = sizeof(t_logicalblk);
  155. /* now loop as long as more to read (till eof) */
  156. while(fgets(line, TERMMAX, infile) != NULL) {
  157. #if DEBUG || STATS
  158. ++totpost;
  159. #endif
  160. s = strchr(line, SEP);
  161. if(s != NULL) {
  162. *s = '\0';
  163. } else {
  164. continue;
  165. }
  166. #if STATS
  167. if((i = strlen(line)) > maxtermlen) { maxtermlen = i; }
  168. #endif
  169. #if DEBUG
  170. printf("%ld: %s ", totpost, line);
  171. fflush(stdout);
  172. #endif
  173. if(strcmp(thisterm, line) == 0) {
  174. if((postptr + 10) > (POST + (postsize / sizeof(*POST)))) {
  175. i = postptr - POST;
  176. postsize += POSTINC * sizeof(*POST);
  177. if((POST = realloc(POST, postsize)) == NULL) {
  178. invcannotalloc(postsize);
  179. return (0);
  180. }
  181. postptr = i + POST;
  182. #if DEBUG
  183. printf("reallocated post space to %u, totpost=%ld\n", postsize, totpost);
  184. #endif
  185. }
  186. numpost++;
  187. } else {
  188. /* have a new term */
  189. if(!invnewterm()) { return (0); }
  190. strcpy(thisterm, line);
  191. numpost = 1;
  192. postptr = POST;
  193. fileindex = 0;
  194. }
  195. /* get the new posting */
  196. num = *++s - '!';
  197. i = 1;
  198. do {
  199. num = BASE * num + *++s - '!';
  200. } while(++i < PRECISION);
  201. posting.lineoffset = num;
  202. while(++fileindex < nsrcfiles && num > srcoffset[fileindex]) {
  203. ;
  204. }
  205. posting.fileindex = --fileindex;
  206. posting.type = *++s;
  207. ++s;
  208. if(*s != '\n') {
  209. num = *++s - '!';
  210. while(*++s != '\n') {
  211. num = BASE * num + *s - '!';
  212. }
  213. posting.fcnoffset = num;
  214. } else {
  215. posting.fcnoffset = 0;
  216. }
  217. *postptr++ = posting;
  218. #if DEBUG
  219. printf("%ld %ld %ld %ld\n",
  220. posting.fileindex,
  221. posting.fcnoffset,
  222. posting.lineoffset,
  223. posting.type);
  224. fflush(stdout);
  225. #endif
  226. }
  227. if(!invnewterm()) { return (0); }
  228. /* now clean up final block */
  229. logicalblk.invblk[0] = numinvitems;
  230. /* loops pointer around to start */
  231. logicalblk.invblk[1] = 0;
  232. logicalblk.invblk[2] = numlogblk - 1;
  233. if(fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) { goto cannotwrite; }
  234. numlogblk++;
  235. /* write out block to save space. what in it doesn't matter */
  236. if(fwrite(&logicalblk, sizeof(t_logicalblk), 1, outfile) == 0) { goto cannotwrite; }
  237. /* finish up the super finger */
  238. *SUPINT = numlogblk;
  239. /* add to the offsets the size of the offset pointers */
  240. intptr = (SUPINT + 1);
  241. i = (char *)supint - (char *)SUPINT;
  242. while(intptr < supint)
  243. *intptr++ += i;
  244. /* write out the offsets (1 for the N at start) and the super finger */
  245. if(fwrite(SUPINT, sizeof(*SUPINT), numlogblk + 1, outfile) == 0 ||
  246. fwrite(SUPFING, 1, supfing - SUPFING, outfile) == 0) {
  247. goto cannotwrite;
  248. }
  249. /* save the size for reference later */
  250. nextsupfing = sizeof(long) + sizeof(long) * numlogblk + (supfing - SUPFING);
  251. /* make sure the file ends at a logical block boundary. This is
  252. necessary for invinsert to correctly create extended blocks
  253. */
  254. i = nextsupfing % sizeof(t_logicalblk);
  255. /* write out junk to fill log blk */
  256. if(fwrite(temp, sizeof(t_logicalblk) - i, 1, outfile) == 0 ||
  257. fflush(outfile) == EOF) { /* rewind doesn't check for write failure */
  258. goto cannotwrite;
  259. }
  260. /* write the control area */
  261. rewind(outfile);
  262. param.version = FMTVERSION;
  263. param.filestat = 0;
  264. param.sizeblk = sizeof(t_logicalblk);
  265. param.startbyte = (numlogblk + 1) * sizeof(t_logicalblk) + BUFSIZ;
  266. ;
  267. param.supsize = nextsupfing;
  268. param.cntlsize = BUFSIZ;
  269. param.share = 0;
  270. if(fwrite(&param, sizeof(param), 1, outfile) == 0) { goto cannotwrite; }
  271. for(i = 0; i < 10; i++) /* for future use */
  272. if(fwrite(&zerolong, sizeof(zerolong), 1, outfile) == 0) { goto cannotwrite; }
  273. /* make first block loop backwards to last block */
  274. if(fflush(outfile) == EOF) { /* fseek doesn't check for write failure */
  275. goto cannotwrite;
  276. }
  277. /* get to second word first block */
  278. fseek(outfile, BUFSIZ + 2 * sizeof(long), SEEK_SET);
  279. tlong = numlogblk - 1;
  280. if(fwrite(&tlong, sizeof(tlong), 1, outfile) == 0 || fclose(outfile) == EOF) {
  281. cannotwrite:
  282. invcannotwrite(invname);
  283. return (0);
  284. }
  285. if(fclose(fpost) == EOF) {
  286. invcannotwrite(postingfile);
  287. return (0);
  288. }
  289. --totterm; /* don't count null term */
  290. #if STATS
  291. printf("logical blocks = %d, postings = %ld, terms = %ld, max term length = %d\n",
  292. numlogblk,
  293. totpost,
  294. totterm,
  295. maxtermlen);
  296. if(showzipf) {
  297. printf("\n************* ZIPF curve ****************\n");
  298. for(j = ZIPFSIZE; j > 1; j--)
  299. if(zipf[j]) break;
  300. for(i = 1; i < j; ++i) {
  301. printf("%3d -%6d ", i, zipf[i]);
  302. if(i % 6 == 0) putchar('\n');
  303. }
  304. printf(">%d-%6d\n", ZIPFSIZE, zipf[0]);
  305. }
  306. #endif
  307. /* free all malloc'd memory */
  308. free(POST);
  309. free(SUPFING);
  310. free(SUPINT);
  311. return (totterm);
  312. }
  313. /* add a term to the data base */
  314. static int invnewterm(void) {
  315. int backupflag, i, j, holditems, gooditems, howfar;
  316. unsigned int maxback, len, numwilluse, wdlen;
  317. char *tptr, *tptr3;
  318. union {
  319. unsigned long packword[2];
  320. ENTRY e;
  321. } iteminfo;
  322. gooditems = 0; /* initialize, to avoid warning */
  323. totterm++;
  324. #if STATS
  325. /* keep zipfian info on the distribution */
  326. if(numpost <= ZIPFSIZE)
  327. zipf[numpost]++;
  328. else
  329. zipf[0]++;
  330. #endif
  331. len = strlen(thisterm);
  332. /* length of term rounded up to long boundary */
  333. wdlen = (len + (sizeof(long) - 1)) / sizeof(long);
  334. /* each term needs 2 longs for its iteminfo and
  335. * 1 long for its offset */
  336. numwilluse = (wdlen + 3) * sizeof(long);
  337. /* new block if at least 1 item in block */
  338. if(numinvitems && numwilluse + amtused > sizeof(t_logicalblk)) {
  339. /* set up new block */
  340. if(supfing + 500u > SUPFING + supersize) {
  341. i = supfing - SUPFING;
  342. supersize += 20000u;
  343. if((SUPFING = realloc(SUPFING, supersize)) == NULL) {
  344. invcannotalloc(supersize);
  345. return (0);
  346. }
  347. supfing = i + SUPFING;
  348. #if DEBUG
  349. printf("reallocated superfinger space to %d, totpost=%ld\n",
  350. supersize,
  351. totpost);
  352. #endif
  353. }
  354. /* check that room for the offset as well */
  355. /* FIXME HBB: magic number alert (10) */
  356. if((numlogblk + 10) > supintsize) {
  357. i = supint - SUPINT;
  358. supintsize += SUPERINC;
  359. if((SUPINT = realloc(SUPINT, supintsize * sizeof(*SUPINT))) == NULL) {
  360. invcannotalloc(supintsize * sizeof(*SUPINT));
  361. return (0);
  362. }
  363. supint = i + SUPINT;
  364. #if DEBUG
  365. printf("reallocated superfinger offset to %d, totpost = %ld\n",
  366. supintsize * sizeof(*SUPINT),
  367. totpost);
  368. #endif
  369. }
  370. /* See if backup is efficatious */
  371. backupflag = 0;
  372. maxback = (int)strlen(thisterm) / 10;
  373. holditems = numinvitems;
  374. if(maxback > numinvitems) maxback = numinvitems - 2;
  375. howfar = 0;
  376. while(maxback-- > 1) {
  377. howfar++;
  378. iteminfo.packword[0] =
  379. logicalblk.invblk[--holditems * 2 + (sizeof(long) - 1)];
  380. if((i = iteminfo.e.size / 10) < maxback) {
  381. maxback = i;
  382. backupflag = howfar;
  383. gooditems = holditems;
  384. }
  385. }
  386. /* see if backup will occur */
  387. if(backupflag) { numinvitems = gooditems; }
  388. logicalblk.invblk[0] = numinvitems;
  389. /* set forward pointer pointing to next */
  390. logicalblk.invblk[1] = numlogblk + 1;
  391. /* set back pointer to last block */
  392. logicalblk.invblk[2] = numlogblk - 1;
  393. if(fwrite(logicalblk.chrblk, 1, sizeof(t_logicalblk), outfile) == 0) {
  394. invcannotwrite(indexfile);
  395. return (0);
  396. }
  397. /* 3 longs needed for: numinvitems, next block, and previous block */
  398. amtused = 3 * sizeof(long);
  399. numlogblk++;
  400. /* check if had to back up, if so do it */
  401. if(backupflag) {
  402. char *tptr2;
  403. /* find out where the end of the new block is */
  404. iteminfo.packword[0] = logicalblk.invblk[numinvitems * 2 + 1];
  405. tptr3 = logicalblk.chrblk + iteminfo.e.offset;
  406. /* move the index for this block */
  407. for(i = 3; i <= (backupflag * 2 + 2); i++)
  408. logicalblk.invblk[i] = logicalblk.invblk[numinvitems * 2 + i];
  409. /* move the word into the super index */
  410. iteminfo.packword[0] = logicalblk.invblk[3];
  411. iteminfo.packword[1] = logicalblk.invblk[4];
  412. tptr2 = logicalblk.chrblk + iteminfo.e.offset;
  413. strncpy(supfing, tptr2, (int)iteminfo.e.size);
  414. *(supfing + iteminfo.e.size) = '\0';
  415. #if DEBUG
  416. printf("backup %d at term=%s to term=%s\n", backupflag, thisterm, supfing);
  417. #endif
  418. *supint++ = nextsupfing;
  419. nextsupfing += strlen(supfing) + 1;
  420. supfing += strlen(supfing) + 1;
  421. /* now fix up the logical block */
  422. tptr = logicalblk.chrblk + lastinblk;
  423. lastinblk = sizeof(t_logicalblk);
  424. tptr2 = logicalblk.chrblk + lastinblk;
  425. j = tptr3 - tptr;
  426. while(tptr3 > tptr)
  427. *--tptr2 = *--tptr3;
  428. lastinblk -= j;
  429. amtused += ((2 * sizeof(long)) * backupflag + j);
  430. for(i = 3; i < (backupflag * 2 + 2); i += 2) {
  431. iteminfo.packword[0] = logicalblk.invblk[i];
  432. iteminfo.e.offset += (tptr2 - tptr3);
  433. logicalblk.invblk[i] = iteminfo.packword[0];
  434. }
  435. numinvitems = backupflag;
  436. } else { /* no backup needed */
  437. numinvitems = 0;
  438. lastinblk = sizeof(t_logicalblk);
  439. /* add new term to superindex */
  440. strcpy(supfing, thisterm);
  441. supfing += strlen(thisterm) + 1;
  442. *supint++ = nextsupfing;
  443. nextsupfing += strlen(thisterm) + 1;
  444. }
  445. }
  446. /* HBB 20010501: Fixed bug by replacing magic number '8' by
  447. * what it actually represents. */
  448. lastinblk -= (numwilluse - 2 * sizeof(long));
  449. iteminfo.e.offset = lastinblk;
  450. iteminfo.e.size = len;
  451. iteminfo.e.space = 0;
  452. iteminfo.e.post = numpost;
  453. strncpy(logicalblk.chrblk + lastinblk, thisterm, len);
  454. amtused += numwilluse;
  455. logicalblk.invblk[(lastinblk / sizeof(long)) + wdlen] = nextpost;
  456. if((i = postptr - POST) > 0) {
  457. if(fwrite(POST, sizeof(*POST), i, fpost) == 0) {
  458. invcannotwrite(postingfile);
  459. return (0);
  460. }
  461. nextpost += i * sizeof(*POST);
  462. }
  463. logicalblk.invblk[3 + 2 * numinvitems++] = iteminfo.packword[0];
  464. logicalblk.invblk[2 + 2 * numinvitems] = iteminfo.packword[1];
  465. return (1);
  466. }
  467. /*
  468. * If 'invname' ends with the 'from' substring, it is replaced inline with the
  469. * 'to' substring (which must be of the exact same length), and the function
  470. * returns 0. Otherwise, returns -1.
  471. */
  472. static int invflipname(char *invname, const char *from, const char *to) {
  473. char *temp, *i = NULL;
  474. assert(strlen(from) == strlen(to));
  475. temp = invname - 1;
  476. while((temp = strstr(temp + 1, from)))
  477. i = temp;
  478. if(!i || i[strlen(from)] != '\0') return -1;
  479. while(*to)
  480. *i++ = *to++;
  481. return 0;
  482. }
  483. /* small helper function to centralize handling of binary opening
  484. * for reading, and use of the 'stat" flag */
  485. static FILE *open_for_reading(char *name, int stat) {
  486. return vpfopen(name, ((stat == 0) ? "rb" : "r+b"));
  487. }
  488. /* handle opening of a file under a possibly "flipped" name */
  489. /* If db created without '-f', but now invoked with '-f cscope.out',
  490. * we need to check for 'cscope.in.out', rather than 'cscope.out.in':
  491. * I.e, hack around our own violation of the inverse db naming convention */
  492. /* more silliness: if you create the db with '-f cscope', then try to open
  493. * it without '-f cscope', you'll fail unless we check for 'cscope.out.in'
  494. * here. */
  495. static FILE *open_file_with_flipped_name(char *name, const char *flip_in,
  496. const char *flip_out, int stat) {
  497. if(!invflipname(name, flip_in, flip_out)) {
  498. FILE *fptr = open_for_reading(name, stat);
  499. if(!fptr) /* flip back for error message */
  500. invflipname(name, flip_out, flip_in);
  501. return fptr;
  502. };
  503. return 0;
  504. }
  505. static FILE *open_file_with_possibly_flipped_name(char *name, const char *flip1,
  506. const char *flip2, int stat) {
  507. FILE *fptr = open_for_reading(name, stat);
  508. if(!fptr) fptr = open_file_with_flipped_name(name, flip2, flip1, stat);
  509. if(!fptr) fptr = open_file_with_flipped_name(name, flip1, flip2, stat);
  510. return fptr;
  511. }
  512. int invopen(INVCONTROL *invcntl, char *invname, char *invpost, int stat) {
  513. int read_index;
  514. invcntl->invfile =
  515. open_file_with_possibly_flipped_name(invname, INVNAME, INVNAME2, stat);
  516. if(!invcntl->invfile) {
  517. invcannotopen(invname);
  518. return (-1);
  519. }
  520. if(fread(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile) == 0) {
  521. fprintf(stderr, PROGRAM_NAME ": empty inverted file\n");
  522. fclose(invcntl->invfile);
  523. return (-1);
  524. }
  525. if(invcntl->param.version != FMTVERSION) {
  526. fprintf(stderr,
  527. PROGRAM_NAME
  528. ": cannot read old index format; use -U option to force database to rebuild\n");
  529. fclose(invcntl->invfile);
  530. return (-1);
  531. }
  532. assert(invcntl->param.sizeblk == sizeof(t_logicalblk));
  533. if(stat == 0 && invcntl->param.filestat == INVALONE) {
  534. fprintf(stderr, PROGRAM_NAME ": inverted file is locked\n");
  535. fclose(invcntl->invfile);
  536. return (-1);
  537. }
  538. invcntl->postfile =
  539. open_file_with_possibly_flipped_name(invpost, INVPOST, INVPOST2, stat);
  540. if(!invcntl->postfile) {
  541. invcannotopen(invpost);
  542. fclose(invcntl->invfile);
  543. return (-1);
  544. }
  545. /* allocate core for a logical block */
  546. if((invcntl->logblk = malloc((size_t)invcntl->param.sizeblk)) == NULL) {
  547. invcannotalloc((size_t)invcntl->param.sizeblk);
  548. fclose(invcntl->postfile);
  549. fclose(invcntl->invfile);
  550. return (-1);
  551. }
  552. /* allocate for and read in superfinger */
  553. read_index = 1;
  554. invcntl->iindex = NULL;
  555. #if SHARE
  556. if(invcntl->param.share == 1) {
  557. key_t shm_key;
  558. struct shmid_ds shm_buf;
  559. int shm_id;
  560. /* see if the shared segment exists */
  561. shm_key = ftok(invname, 2);
  562. shm_id = shmget(shm_key, 0, 0);
  563. /* Failure simply means (hopefully) that segment doesn't exists */
  564. if(shm_id == -1) {
  565. /* Have to give general write permission due to AMdahl not having protected
  566. * segments */
  567. shm_id =
  568. shmget(shm_key, invcntl->param.supsize + sizeof(long), IPC_CREAT | 0666);
  569. if(shm_id == -1) perror("Could not create shared memory segment");
  570. } else
  571. read_index = 0;
  572. if(shm_id != -1) {
  573. invcntl->iindex = shmat(shm_id, 0, ((read_index) ? 0 : SHM_RDONLY));
  574. if(invcntl->iindex == (char *)ERR) {
  575. fprintf(stderr, PROGRAM_NAME ": shared memory link failed\n");
  576. invcntl->iindex = NULL;
  577. read_index = 1;
  578. }
  579. }
  580. }
  581. #endif
  582. if(invcntl->iindex == NULL) /* FIXME HBB: magic number alert (4, sizeof(long)) */
  583. invcntl->iindex = malloc((size_t)invcntl->param.supsize + 4 * sizeof(long));
  584. if(invcntl->iindex == NULL) {
  585. invcannotalloc((size_t)invcntl->param.supsize);
  586. free(invcntl->logblk);
  587. fclose(invcntl->postfile);
  588. fclose(invcntl->invfile);
  589. return (-1);
  590. }
  591. if(read_index) {
  592. fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
  593. fread(invcntl->iindex, (int)invcntl->param.supsize, 1, invcntl->invfile);
  594. }
  595. invcntl->numblk = -1;
  596. if(boolready() == -1) {
  597. fclose(invcntl->postfile);
  598. fclose(invcntl->invfile);
  599. return (-1);
  600. }
  601. /* write back out the control block if anything changed */
  602. invcntl->param.filestat = stat;
  603. if(stat > invcntl->param.filestat) {
  604. rewind(invcntl->invfile);
  605. fwrite(&invcntl->param, sizeof(invcntl->param), 1, invcntl->invfile);
  606. }
  607. return (1);
  608. }
  609. /** invclose must be called to wrap things up and deallocate core **/
  610. void invclose(INVCONTROL *invcntl) {
  611. /* write out the control block in case anything changed */
  612. if(invcntl->param.filestat > 0) {
  613. invcntl->param.filestat = 0;
  614. rewind(invcntl->invfile);
  615. fwrite(&invcntl->param, 1, sizeof(invcntl->param), invcntl->invfile);
  616. }
  617. if(invcntl->param.filestat == INVALONE) {
  618. /* write out the super finger */
  619. fseek(invcntl->invfile, invcntl->param.startbyte, SEEK_SET);
  620. fwrite(invcntl->iindex, 1, (int)invcntl->param.supsize, invcntl->invfile);
  621. }
  622. fclose(invcntl->invfile);
  623. fclose(invcntl->postfile);
  624. #if SHARE
  625. if(invcntl->param.share > 0) {
  626. shmdt(invcntl->iindex);
  627. invcntl->iindex = NULL;
  628. }
  629. #endif
  630. if(invcntl->iindex != NULL) free(invcntl->iindex);
  631. free(invcntl->logblk);
  632. }
  633. /** invstep steps the inverted file forward one item **/
  634. static void invstep(INVCONTROL *invcntl) {
  635. if(invcntl->keypnt < (invcntl->logblk->invblk[0] - 1)) {
  636. invcntl->keypnt++;
  637. return;
  638. }
  639. /* move forward a block else wrap */
  640. invcntl->numblk =
  641. invcntl->logblk->invblk[1]; /* was: *(int *)(invcntl->logblk + sizeof(long))*/
  642. /* now read in the block */
  643. fseek(invcntl->invfile,
  644. invcntl->numblk * invcntl->param.sizeblk + invcntl->param.cntlsize,
  645. SEEK_SET);
  646. fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile);
  647. invcntl->keypnt = 0;
  648. }
  649. /** invforward moves forward one term in the inverted file **/
  650. int invforward(INVCONTROL *invcntl) {
  651. invstep(invcntl);
  652. /* skip things with 0 postings */
  653. /* FIXME HBB: magic number alert! (3) */
  654. while(((ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt)->post == 0) {
  655. invstep(invcntl);
  656. }
  657. /* Check for having wrapped - reached start of inverted file! */
  658. if((invcntl->numblk == 0) && (invcntl->keypnt == 0)) return (0);
  659. return (1);
  660. }
  661. /** invterm gets the present term from the present logical block **/
  662. long invterm(INVCONTROL *invcntl, char *term) {
  663. ENTRY *entryptr;
  664. /* FIXME HBB: magic number alert! (3) */
  665. entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
  666. strncpy(term, invcntl->logblk->chrblk + entryptr->offset, (int)entryptr->size);
  667. *(term + entryptr->size) = '\0';
  668. return (entryptr->post);
  669. }
  670. /** invfind searches for an individual item in the inverted file **/
  671. long invfind(INVCONTROL *invcntl, char *searchterm) /* term being searched for */
  672. {
  673. int imid, ilow, ihigh;
  674. long num;
  675. int i;
  676. unsigned long *intptr, *intptr2;
  677. ENTRY *entryptr;
  678. /* make sure it is initialized via invready */
  679. if(invcntl->invfile == 0) return (-1L);
  680. /* now search for the appropriate finger block */
  681. intptr = (unsigned long *)invcntl->iindex;
  682. ilow = 0;
  683. ihigh = *intptr++ - 1;
  684. while(ilow <= ihigh) {
  685. imid = (ilow + ihigh) / 2;
  686. intptr2 = intptr + imid;
  687. i = strcmp(searchterm, (invcntl->iindex + *intptr2));
  688. if(i < 0)
  689. ihigh = imid - 1;
  690. else if(i > 0)
  691. ilow = ++imid;
  692. else {
  693. ilow = imid + 1;
  694. break;
  695. }
  696. }
  697. /* be careful about case where searchterm is after last in this block */
  698. imid = (ilow) ? ilow - 1 : 0;
  699. /* fetch the appropriate logical block if not in core */
  700. /* note always fetch it if the file is busy */
  701. if((imid != invcntl->numblk) || (invcntl->param.filestat >= INVBUSY)) {
  702. fseek(invcntl->invfile,
  703. (imid * invcntl->param.sizeblk) + invcntl->param.cntlsize,
  704. SEEK_SET);
  705. invcntl->numblk = imid;
  706. fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile);
  707. }
  708. srch_ext:
  709. /* now find the term in this block. tricky this */
  710. intptr = (unsigned long *)invcntl->logblk->invblk;
  711. ilow = 0;
  712. ihigh = *intptr - 1;
  713. intptr += 3;
  714. num = 0;
  715. while(ilow <= ihigh) {
  716. imid = (ilow + ihigh) / 2;
  717. entryptr = (ENTRY *)intptr + imid;
  718. i = strncmp(searchterm,
  719. invcntl->logblk->chrblk + entryptr->offset,
  720. (int)entryptr->size);
  721. if(i == 0) i = strlen(searchterm) - entryptr->size;
  722. if(i < 0)
  723. ihigh = imid - 1;
  724. else if(i > 0)
  725. ilow = ++imid;
  726. else {
  727. num = entryptr->post;
  728. break;
  729. }
  730. }
  731. /* be careful about case where searchterm is after last in this block */
  732. if(imid >= invcntl->logblk->invblk[0]) {
  733. invcntl->keypnt = invcntl->logblk->invblk[0];
  734. invstep(invcntl);
  735. /* note if this happens the term could be in extended block */
  736. if(invcntl->param.startbyte < invcntl->numblk * invcntl->param.sizeblk)
  737. goto srch_ext;
  738. } else
  739. invcntl->keypnt = imid;
  740. return (num);
  741. }
  742. #if DEBUG
  743. /** invdump dumps the block the term parameter is in **/
  744. void invdump(INVCONTROL *invcntl, char *term) {
  745. long i, j, n, *longptr;
  746. ENTRY *entryptr;
  747. char temp[512], *ptr;
  748. /* dump superindex if term is "-" */
  749. if(*term == '-') {
  750. j = atoi(term + 1);
  751. longptr = (long *)invcntl->iindex;
  752. n = *longptr++;
  753. printf("Superindex dump, num blocks=%ld\n", n);
  754. longptr += j;
  755. while((longptr <= ((long *)invcntl->iindex) + n) && invbreak == 0) {
  756. printf("%2ld %6ld %s\n", j++, *longptr, invcntl->iindex + *longptr);
  757. longptr++;
  758. }
  759. return;
  760. } else if(*term == '#') {
  761. j = atoi(term + 1);
  762. /* fetch the appropriate logical block */
  763. invcntl->numblk = j;
  764. fseek(invcntl->invfile,
  765. (j * invcntl->param.sizeblk) + invcntl->param.cntlsize,
  766. SEEK_SET);
  767. fread(invcntl->logblk, (int)invcntl->param.sizeblk, 1, invcntl->invfile);
  768. } else
  769. i = abs((int)invfind(invcntl, term));
  770. longptr = invcntl->logblk->invblk;
  771. n = *longptr++;
  772. printf("Entry term to invdump=%s, postings=%ld, forwrd ptr=%ld, back ptr=%ld\n",
  773. term,
  774. i,
  775. *(longptr),
  776. *(longptr + 1));
  777. /* FIXME HBB: magic number alert! (3) */
  778. entryptr = (ENTRY *)(invcntl->logblk->invblk + 3);
  779. printf("%ld terms in this block, block=%ld\n", n, invcntl->numblk);
  780. printf("\tterm\t\t\tposts\tsize\toffset\tspace\t1st word\n");
  781. for(j = 0; j < n && invbreak == 0; j++) {
  782. ptr = invcntl->logblk->chrblk + entryptr->offset;
  783. strncpy(temp, ptr, (int)entryptr->size);
  784. temp[entryptr->size] = '\0';
  785. ptr +=
  786. (sizeof(long) * (long)((entryptr->size + (sizeof(long) - 1)) / sizeof(long)));
  787. printf("%2ld %-24s\t%5ld\t%3d\t%d\t%d\t%ld\n",
  788. j,
  789. temp,
  790. entryptr->post,
  791. entryptr->size,
  792. entryptr->offset,
  793. entryptr->space,
  794. *(long *)ptr);
  795. entryptr++;
  796. }
  797. }
  798. #endif
  799. static int boolready(void) {
  800. numitems = 0;
  801. if(item1 != NULL) free(item1);
  802. setsize1 = SETINC;
  803. if((item1 = malloc(SETINC * sizeof(*item1))) == NULL) {
  804. invcannotalloc(SETINC);
  805. return (-1);
  806. }
  807. if(item2 != NULL) free(item2);
  808. setsize2 = SETINC;
  809. if((item2 = malloc(SETINC * sizeof(*item2))) == NULL) {
  810. invcannotalloc(SETINC);
  811. return (-1);
  812. }
  813. item = item1;
  814. enditem = item;
  815. return (0);
  816. }
  817. void boolclear(void) {
  818. numitems = 0;
  819. item = item1;
  820. enditem = item;
  821. }
  822. POSTING *boolfile(INVCONTROL *invcntl, long *num, int boolarg) {
  823. ENTRY *entryptr;
  824. FILE *file;
  825. void *ptr;
  826. unsigned long *ptr2;
  827. POSTING *newitem = NULL; /* initialize, to avoid warning */
  828. POSTING posting;
  829. unsigned u;
  830. POSTING *newsetp = NULL, *set1p;
  831. long newsetc, set1c, set2c;
  832. /* FIXME HBB: magic number alert! (3) */
  833. entryptr = (ENTRY *)(invcntl->logblk->invblk + 3) + invcntl->keypnt;
  834. ptr = invcntl->logblk->chrblk + entryptr->offset;
  835. ptr2 = ((unsigned long *)ptr) + (entryptr->size + (sizeof(long) - 1)) / sizeof(long);
  836. *num = entryptr->post;
  837. switch(boolarg) {
  838. case bool_OR:
  839. case falseT:
  840. if(*num == 0) {
  841. *num = numitems;
  842. return (item);
  843. }
  844. }
  845. /* make room for the new set */
  846. u = 0;
  847. switch(boolarg) {
  848. case AND:
  849. case falseT:
  850. newsetp = item;
  851. break;
  852. case bool_OR:
  853. u = enditem - item;
  854. /* FALLTHROUGH */
  855. case REVERSEfalseT:
  856. u += *num;
  857. if(item == item2) {
  858. if(u > setsize1) {
  859. u += SETINC;
  860. if((item1 = realloc(item1, u * sizeof(*item1))) == NULL) {
  861. invcannotalloc(u * sizeof(*item1));
  862. boolready();
  863. *num = -1;
  864. return (NULL);
  865. }
  866. setsize1 = u;
  867. }
  868. newitem = item1;
  869. } else {
  870. if(u > setsize2) {
  871. u += SETINC;
  872. if((item2 = realloc(item2, u * sizeof(*item2))) == NULL) {
  873. invcannotalloc(u * sizeof(*item2));
  874. boolready();
  875. *num = -1;
  876. return (NULL);
  877. }
  878. setsize2 = u;
  879. }
  880. newitem = item2;
  881. }
  882. newsetp = newitem;
  883. }
  884. file = invcntl->postfile;
  885. fseek(file, *ptr2, SEEK_SET);
  886. fread(&posting, sizeof(posting), 1, file);
  887. newsetc = 0;
  888. switch(boolarg) {
  889. case bool_OR:
  890. /* while something in both sets */
  891. set1p = item;
  892. newsetp = newitem;
  893. for(set1c = 0, set2c = 0; set1c < numitems && set2c < *num; newsetc++) {
  894. if(set1p->lineoffset < posting.lineoffset) {
  895. *newsetp++ = *set1p++;
  896. set1c++;
  897. } else if(set1p->lineoffset > posting.lineoffset) {
  898. *newsetp++ = posting;
  899. fread(&posting, (int)sizeof(posting), 1, file);
  900. set2c++;
  901. } else if(set1p->type < posting.type) {
  902. *newsetp++ = *set1p++;
  903. set1c++;
  904. } else if(set1p->type > posting.type) {
  905. *newsetp++ = posting;
  906. fread(&posting, (int)sizeof(posting), 1, file);
  907. set2c++;
  908. } else { /* identical postings */
  909. *newsetp++ = *set1p++;
  910. set1c++;
  911. fread(&posting, (int)sizeof(posting), 1, file);
  912. set2c++;
  913. }
  914. }
  915. /* find out what ran out and move the rest in */
  916. if(set1c < numitems) {
  917. newsetc += numitems - set1c;
  918. while(set1c++ < numitems) {
  919. *newsetp++ = *set1p++;
  920. }
  921. } else {
  922. while(set2c++ < *num) {
  923. *newsetp++ = posting;
  924. newsetc++;
  925. fread(&posting, (int)sizeof(posting), 1, file);
  926. }
  927. }
  928. item = newitem;
  929. break; /* end of bool_OR */
  930. }
  931. numitems = newsetc;
  932. *num = newsetc;
  933. enditem = (POSTING *)newsetp;
  934. return ((POSTING *)item);
  935. }
  936. static void invcannotalloc(unsigned n) {
  937. fprintf(stderr, PROGRAM_NAME ": cannot allocate %u bytes\n", n);
  938. }
  939. static void invcannotopen(char *file) {
  940. fprintf(stderr, PROGRAM_NAME ": cannot open file %s\n", file);
  941. }
  942. static void invcannotwrite(char *file) {
  943. perror(PROGRAM_NAME); /* must be first to preserve errno */
  944. fprintf(stderr, PROGRAM_NAME ": write to file %s failed\n", file);
  945. }