3 * Traakan, Inc., Los Altos, CA
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice unmodified, this list of conditions, and the following
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * Binary Search Text File (BSTF)
36 * Use conventional binary search method on a sorted text
37 * file. The file MUST be sorted in ascending, lexicographic
38 * order. This is the default order of sort(1).
47 int n_seek, n_align, n_line, n_getc;
51 #define MIN_DELTA 2048
57 * ndmbstf_first_with_bounds()
62 * -2 Malformed line/file
63 * -3 fseek() to determine file size failed
64 * -4 fseek() to hop failed
65 * =0 First line found in buf[], does not match key
66 * >0 Length of first matching line in buf[]
71 FILE *fp, /* the file to search */
72 char *key, /* what we're looking for */
73 char *buf, /* returned line */
74 unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
76 return ndmbstf_first_with_bounds (fp, key, buf, max_buf, 0, 0);
80 ndmbstf_first_with_bounds (
81 FILE *fp, /* the file to search */
82 char *key, /* what we're looking for */
83 char *buf, /* returned line */
84 unsigned max_buf, /* maximum lenght of buf (sizeof (buf)) */
85 off_t lower_bound, /* offset, to skip headers, usually 0 */
86 off_t upper_bound) /* 0->don't know, >0 limit */
89 off_t lower, upper; /* bounds */
93 if (upper_bound == 0) {
97 * Determine the file size using fseek()/ftell()
100 fseeko (fp, 0, SEEK_END);
101 end_off = ftello (fp);
104 upper_bound = end_off;
108 * Set lower and upper bounds of the binary search
114 * Determine the delta (distance) between the current
115 * lower and upper bounds. If delta is small, it is more
116 * efficient to do a linear search than to continue
117 * seeking. This is because stdio will pre-read
118 * portions no matter what and fseek()ing will discard
119 * the pre-read portions. MIN_DELTA is the approximation
120 * of the stdio pre-read size. Better to just
121 * linearly process the pre-read portion in the
122 * hopes that our answer is already sitting in the
125 delta = upper - lower;
126 if (delta <= MIN_DELTA)
130 * Seek to the first line after the midpoint
131 * between the lower and upper bounds.
134 off = lower + delta / 2;
135 rc = ndmbstf_seek_and_align (fp, &off);
139 * Alignment found a very long line without
140 * a \n at the end of the file. All we
141 * can do now is try a linear search
142 * from the current lower bound.
145 return -4; /* fseek() for hop failed */
149 * Read the next line up into buf[].
152 buf_len = ndmbstf_getline (fp, buf, max_buf);
155 * EOF, or malformed line. All we can do now
156 * is try a linear search from the current
163 * This is the crucial point.
165 * buf[] contains a line just read.
166 * off points the the line we just read.
167 * key[] is what we're looking for.
169 * Is the objective line (what we're trying to find)
170 * somewhere between lower..off or between off..upper.
172 rc = ndmbstf_compare (key, buf);
174 /* key>buf. Objective somewhere in off..upper */
178 * key<=buf. Objective somewhere in lower..off
179 * Notice that if key==buf, there still might
180 * be a line ==key before the one we just
181 * read. There might be hundreds of such
182 * lines. The objective is the FIRST line
183 * greater than or equal to the key.
184 * This might be it. It might not.
185 * So, keep searching.
192 * Do an unbounded linear search begining at the
197 rc = ndmbstf_seek_and_align (fp, &off);
201 * Alignment found a very long line without
202 * a \n at the end of the file. All we
207 return -4; /* fseek() for hop failed */
211 * Read the next line up into buf[].
214 buf_len = ndmbstf_getline (fp, buf, max_buf);
218 break; /* at end of file */
222 rc = ndmbstf_compare (key, buf);
228 return 0; /* have line but it doesn't match */
236 FILE *fp, /* the file to search */
237 char *key, /* what we're looking for */
238 char *buf, /* returned line */
239 unsigned max_buf) /* maximum lenght of buf (sizeof (buf)) */
244 * Read the next line up into buf[].
246 buf_len = ndmbstf_getline (fp, buf, max_buf);
250 return EOF; /* at end of file */
251 return -2; /* malformed line */
254 rc = ndmbstf_compare (key, buf);
259 return 0; /* have line but it doesn't match */
275 ndmbstf_getline (FILE *fp, char *buf, unsigned max_buf)
278 char * p_end = buf + max_buf - 2;
281 while ((c = getc(fp)) != EOF) {
294 return -2; /* malformed line */
305 ndmbstf_seek_and_align (FILE *fp, off_t *off)
309 if (fseeko (fp, *off, SEEK_SET) == -1) {
317 * There is a slim chance that we're at the
318 * true begining of a line. Too slim.
319 * Scan forward discarding the trailing
320 * portion of the line we just fseek()ed
321 * to, and leave the stdio stream positioned
322 * for the subsequent line. Notice
323 * we keep off upated so that it reflects
324 * the seek position of the stdio stream.
327 while ((c = getc(fp)) != EOF) {
348 * Given a key[] and a buf[], return whether or not they match.
349 * This effectively is strncmp (key, buf, strlen(key)).
350 * Because of the cost of the call to strlen(), we implement
351 * it ndmbstf_compare() here with the exact semantics.
354 * <0 No match, key[] less than buf[]
355 * =0 Match, the key[] is a prefix for buf[]
356 * >0 No match, key[] greater than buf[]
360 ndmbstf_compare (char *key, char *buf)
365 while (*p != 0 && *p == *q) {
371 return 0; /* entire key matched */
380 * A simple wrapper around ndmbstf_compare() with an
381 * easy-to-use return value..
389 ndmbstf_match (char *key, char *buf)
391 return ndmbstf_compare (key, buf) == 0;
399 main (int ac, char *av[])
404 int total_n_match = 0;
405 int total_n_no_match = 0;
406 int total_n_error = 0;
409 if (i < ac && strcmp (av[i], "-q") == 0) {
415 printf ("usage: %s [-q] FILE KEY ...\n", av[0]);
419 fp = fopen (av[i], "r");
425 for (i++; i < ac; i++) {
431 rc = ndmbstf_first (fp, key, buf, sizeof buf);
433 printf ("Key '%s' err=%d\n", key, rc);
439 printf ("Key '%s' no matches\n", key);
445 printf ("Key '%s'\n", key);
446 for (; rc > 0; rc = ndmbstf_next (fp, key, buf, sizeof buf)) {
449 printf (" %2d: '%s'\n", n_match, buf);
451 total_n_match += n_match;
456 printf ("n_match=%d n_miss=%d n_error=%d\n",
457 total_n_match, total_n_no_match, total_n_error);
458 printf ("n_seek=%d n_align=%d n_line=%d\n", n_seek, n_align, n_line);
462 #endif /* SELF_TEST */