Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Related Pages

cdgdb.c

00001 /* 00002 * Copyright (C) 1997-2004 The CDG Team <cdg@nats.inwordatik.uni-hamburg.de> 00003 * 00004 * This file is free software; as a special exception the author gives 00005 * unlimited permission to copy and/or distribute it, with or without 00006 * modifications, as long as this notice is preserved. 00007 * 00008 * This program is distributed in the hope that it will be useful, but 00009 * WITHOUT ANY WARRANTY, to the extent permitted by law; without even the 00010 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 00011 * 00012 * $Id: cdgdb.c,v 1.21 2004/09/27 17:07:02 micha Exp $ 00013 */ 00014 00015 /* ------------------------------------------------------------------------- 00016 * @addtogroup Cdgdb Cdgdb - Get lexical entries out of a berkeley database 00017 * @author Othello Maurer 00018 * @date unknown 00019 * 00020 * This file provides an interface for retrieving lexical entries out of a 00021 * berkeley database. The database file contains only references to an 00022 * underlying cdg file which actually holds the entries. For loading the 00023 * requested lexical entries, a temp file will be created out of the cdg 00024 * file, containing only these entries. This file will then be parsed with 00025 * \b inpuLoad() into the memory. 00026 * @{ 00027 */ 00028 00029 /* -- INCLUDES ------------------------------------------------------------- */ 00030 #include <config.h> 00031 #include <unistd.h> 00032 #include <time.h> 00033 #include <libgen.h> 00034 #include <stdio.h> 00035 #include <string.h> 00036 #include <errno.h> 00037 #include <sys/resource.h> 00038 #include <blah.h> 00039 #include <sys/types.h> 00040 #include <sys/stat.h> 00041 #include <unistd.h> 00042 #include <db.h> 00043 #include "cdgdb.h" 00044 #include "cdg.h" 00045 #include "hook.h" 00046 #include "input.h" 00047 #include "parsing.h" 00048 00049 /* -- VARIABLES ------------------------------------------------------------ */ 00050 00051 /** database handle. The handle to the database struct */ 00052 static DB *database; 00053 00054 /** the underlying cdg file. This is actually the file, where the lexical 00055 * entries are stored in. */ 00056 static FILE *cdgstream; 00057 00058 /** file name of the file with CDG input in it */ 00059 static String dbFileName; 00060 00061 /** file name of the index */ 00062 static String dbIndexName; 00063 00064 /** age of that file at the time of the last opening */ 00065 static time_t dbAge; 00066 00067 /** the filename of the temporary file. This temporary file will contain 00068 * a set of lexical entries that can then be parsed into the memory. */ 00069 static String tmpFilename = NULL; 00070 00071 /** table of words already loaded. Holds the word which are yet retrieved 00072 * from the database, so that the database won't be questioned for them */ 00073 static Hashtable done = NULL; 00074 00075 /** a database entry. 00076 * This is an entry for the database which holds a key and the byte position 00077 * of its lexical entry in the underlying cdg file. */ 00078 typedef struct { 00079 String key; /**< the key of the lexical entry. */ 00080 int oBegin; /**< offset of its Beginning in the file */ 00081 int oEnd; /**< offset of its End in the file */ 00082 } entryStruct; 00083 00084 /** a pointer to the entryStruct */ 00085 typedef entryStruct *dbEntry; 00086 00087 00088 /* -- FUNCTIONS ------------------------------------------------------------ */ 00089 00090 /* -- IMPLEMENTATION ------------------------------------------------------- */ 00091 00092 /* ------------------------------------------------------------------------- 00093 * opens a database. 00094 * this function opens a database, specified by the parameter 'filename' 00095 * and stores the handle in the static variable 'database'. If there is no 00096 * database file with this filename, a new database is created. 00097 * @param filename the filename of the database file 00098 * @returns TRUE if success, FALSE otherwise 00099 */ 00100 Boolean dbOpenIndexFile(String filename) 00101 { 00102 int ret = 0; // return code for database operations 00103 00104 // create database handle 00105 if ((ret = db_create(&database, NULL, 0)) != 0) { 00106 cdgPrintf(CDG_ERROR, "ERROR (db_create): %s\n", db_strerror(ret)); 00107 return FALSE; 00108 } 00109 // setting database to accept more than one data item for one key 00110 // sort order is lexical (this is the default) 00111 database->set_flags(database, DB_DUP); 00112 00113 /* open database 'filename' */ 00114 #if (DB_VERSION_MAJOR < 4 || \ 00115 DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR == 0) 00116 if ((ret = 00117 database->open(database, filename, NULL, DB_BTREE, DB_RDONLY, 0664)) != 0) { 00118 #else 00119 if ((ret = 00120 database->open(database, NULL, filename, NULL, DB_BTREE, DB_RDONLY, 0664)) != 0) { 00121 #endif 00122 cdgPrintf(CDG_ERROR, "ERROR (database->open): %s\n", db_strerror(ret)); 00123 database = NULL; 00124 return FALSE; 00125 } 00126 dbIndexName = strRegister(filename); 00127 return TRUE; 00128 } 00129 00130 00131 /* ------------------------------------------------------------------------- 00132 * open the cdg file. 00133 * opens the cdg file, which the database is indexing 00134 * you have to ensure that it is the right one, otherwise you will get crap 00135 * @param filename the name of the cdg file 00136 * @returns TRUE if success, else FALSE 00137 */ 00138 00139 Boolean dbOpenCdgFile(String filename) { 00140 struct stat buf; 00141 00142 cdgstream = fopen(filename, "r"); 00143 if(!cdgstream) { 00144 database = NULL; 00145 } 00146 00147 stat(filename, &buf); 00148 dbAge = buf.st_mtime; 00149 dbFileName = strRegister(filename); 00150 00151 return (cdgstream != NULL); 00152 } 00153 00154 /* ------------------------------------------------------------------------- 00155 * close the database. 00156 * writes database to disk and closes it. 00157 * @returns TRUE if success, FALSE otherwise 00158 */ 00159 00160 Boolean dbClose(void) 00161 { 00162 int ret = 0; // return code for database operations 00163 00164 if (database == NULL) { 00165 cdgPrintf(CDG_ERROR, "ERROR: No open database found for closing\n"); 00166 return FALSE; 00167 } 00168 00169 if ((ret = database->close(database, 0)) != 0) { 00170 cdgPrintf(CDG_ERROR, "ERROR (database->close): %s\n", db_strerror(ret)); 00171 return FALSE; 00172 } 00173 database = NULL; 00174 return TRUE; 00175 } 00176 00177 /* ---------------------------------------------------------------------- 00178 Open the data base. 00179 00180 NAME is the base name of the cdg and the index file. 00181 */ 00182 Boolean dbOpen(String name) { 00183 struct stat buffer; 00184 00185 dbFileName = strAppend(name, ".cdg", NULL); 00186 dbIndexName = strAppend(name, ".db", NULL); 00187 00188 /* Try very hard to find the files. If they aren't in the current 00189 directory, maybe they are in another directory we loaded files 00190 from. */ 00191 if(stat(dbFileName, &buffer)) { 00192 /* remove directory component */ 00193 String filebase = strCopy(basename(dbFileName)); 00194 String indexbase = strCopy(basename(dbIndexName)); 00195 String separator = strRegister("/"); 00196 List l; 00197 for(l = inputCurrentGrammar->files; l != NULL; l = listNext(l)) { 00198 String file = strCopy(listElement(l)); 00199 String dir = dirname(file); 00200 String filealt = strAppend(dir,separator,filebase, NULL); 00201 String indexalt = strAppend(dir,separator,indexbase, NULL); 00202 if(!stat(filealt, &buffer) && !stat(indexalt, &buffer)) { 00203 dbFileName = filealt; 00204 dbIndexName = indexalt; 00205 break; 00206 } 00207 } 00208 } 00209 00210 /* open the cdg file */ 00211 if (!dbOpenCdgFile(dbFileName)) { 00212 cdgPrintf(CDG_ERROR, "ERROR: cannot open %s\n", dbFileName); 00213 return FALSE; 00214 } 00215 /* open the database file */ 00216 if (!dbOpenIndexFile(dbIndexName)) { 00217 cdgPrintf(CDG_ERROR, "ERROR: cannot open %s\n", dbIndexName); 00218 return FALSE; 00219 } 00220 return TRUE; 00221 } 00222 00223 00224 /* ------------------------------------------------------------------------- 00225 * checks if there is a database handle available. 00226 * @returns TRUE if there is a database opened, else FALSE 00227 */ 00228 Boolean dbAvailable(void) { 00229 struct stat buf; 00230 time_t age; 00231 00232 if(!database) { 00233 return FALSE; 00234 } 00235 00236 /* If the file that we read data from is newer than it was 00237 when we last opened it, this means it has been recreated 00238 while we ran. To be safe we re-open the data base. */ 00239 stat(dbFileName, &buf); 00240 age = buf.st_mtime; 00241 if(age > dbAge) { 00242 cdgPrintf(CDG_WARNING, "WARNING: data base has changed!\n"); 00243 cdgPrintf(CDG_WARNING, "WARNING: re-opening data base.\n"); 00244 dbClose(); 00245 dbOpenIndexFile(dbIndexName); 00246 dbOpenCdgFile(dbFileName); 00247 } 00248 00249 return TRUE; 00250 00251 } 00252 00253 /* ------------------------------------------------------------------------- 00254 * constructor for dbEntry objects. 00255 * @returns a new dbEntry 00256 */ 00257 dbEntry newDbEntry() { 00258 dbEntry entry = memMalloc(sizeof(entryStruct)); 00259 entry->key = NULL; 00260 entry->oBegin = 0; 00261 entry->oEnd = 0; 00262 return entry; 00263 } 00264 00265 00266 /* ------------------------------------------------------------------------- 00267 * gets lexical entries from the database. 00268 * gets a result list of LexiconItem matching the given word (the key) from 00269 * the database. This function succeeds only once for a given word form. 00270 * @param word the word which is the key 00271 * @returns a list of dbEntry struct, all of them having the same key and 00272 * NULL if there was an Error 00273 */ 00274 List dbGetEntries(String word) 00275 { 00276 int ret = 0; // return code for database operations 00277 DBT key, value; // DBT structs for key and value 00278 DBC *cursorp; // cursor for getting duplicates 00279 dbEntry ent; 00280 List entries = NULL; // the return list 00281 00282 if (!database) { 00283 cdgPrintf(CDG_ERROR, "ERROR: No database opened\n"); 00284 return entries; 00285 } 00286 00287 // create cursor for getting duplicates 00288 database->cursor(database, NULL, &cursorp, 0); 00289 00290 memset(&key, 0, sizeof (key)); 00291 memset(&value, 0, sizeof (value)); 00292 key.data = word; 00293 key.size = strlen(word); 00294 00295 // position the cursor at the right key and get the first item 00296 if ((ret = cursorp->c_get(cursorp, &key, &value, DB_SET)) != 0) { 00297 if (ret == DB_NOTFOUND) { 00298 return NULL; 00299 } else { 00300 cdgPrintf(CDG_ERROR, "ERROR (cursorp->c_get): %s\n", db_strerror(ret)); 00301 return NULL; 00302 } 00303 } 00304 ent = newDbEntry(); 00305 ent->key = strCopy(key.data); 00306 ent->oBegin = *((int *)value.data); 00307 ent->oEnd = *((int *)value.data + 1); 00308 entries = listAppendElement(entries, ent); 00309 // now get all duplicates 00310 while ((ret = 00311 cursorp->c_get(cursorp, &key, &value, DB_NEXT_DUP)) != DB_NOTFOUND) { 00312 if (ret != 0) { 00313 cdgPrintf(CDG_ERROR, "ERROR (cursorp->c_get): %s\n", db_strerror(ret)); 00314 break; 00315 } 00316 00317 ent = newDbEntry(); 00318 ent->key = strCopy(word); 00319 ent->oBegin = *((int *)value.data); 00320 ent->oEnd = *((int *)value.data + 1); 00321 entries = listAppendElement(entries, ent); 00322 } 00323 return entries; 00324 } 00325 00326 /* ------------------------------------------------------------------------- 00327 * loads lexical entries into the memory. 00328 * generates a temp file which holds lexical entries and parses them into 00329 * the memory 00330 * @param entries a list of dbEntry structs you want to parse 00331 * @returns True if success, else FALSE 00332 */ 00333 Boolean dbLoadEntries(List entries) 00334 { 00335 dbEntry ent; 00336 List l; 00337 FILE *tmpstream; 00338 unsigned long mode; 00339 tmpFilename = strPrintf("/tmp/dbload.%d.cdg", getpid()); 00340 00341 /* if there are no entries for loading*/ 00342 if(!entries) { 00343 return FALSE; 00344 } 00345 00346 /* open a temporary file */ 00347 if((tmpstream = fopen(tmpFilename, "w")) == NULL) { 00348 cdgPrintf(CDG_ERROR, "ERROR (dbLoadEntries): cannot generate tempfile\n"); 00349 return FALSE; 00350 } 00351 for (l = entries; l != NULL; l = listNext(l)) { 00352 ent = (dbEntry)listElement(l); 00353 fseek(cdgstream, (ent->oBegin -1), SEEK_SET); // seek to begin of entry 00354 while(ftell(cdgstream) != ent->oEnd) { 00355 putc(getc(cdgstream), tmpstream); 00356 } 00357 putc((int)'\n', tmpstream); 00358 } 00359 listForEachDelete(entries, free); 00360 fclose(tmpstream); 00361 00362 /* suppress uninteresting `loading 58749857.cdg' message */ 00363 mode = hkVerbosity; 00364 hkVerbosity = hkVerbosity & (~CDG_INFO); 00365 inputLoad(tmpFilename); 00366 hkVerbosity = mode; 00367 00368 /* promote new data to current data */ 00369 mergeInput(inputCurrentGrammar, parseResult); 00370 inputCacheInput(inputCurrentGrammar); 00371 memFree(parseResult); 00372 parseResult = NULL; 00373 00374 return TRUE; 00375 } 00376 00377 /* ------------------------------------------------------------------------- 00378 * wrapper for dbLoadEntries(). 00379 * generates a temp file which holds lexical entries and parses them into 00380 * the memory 00381 * @param key the key of the lex entries which have to be parsed 00382 * @returns TRUE if success, else FALSE 00383 */ 00384 00385 Boolean dbLoad(String key) { 00386 List records = NULL; 00387 if(!dbAvailable()) { 00388 return FALSE; 00389 } 00390 00391 /* detect duplicate query */ 00392 if(hashGet(done, strRegister(key))) { 00393 return TRUE; 00394 } 00395 records = dbGetEntries(key); 00396 00397 /* provide interesting `loading form "Schnabeltier"' message */ 00398 if(records) { 00399 cdgPrintf(CDG_INFO, "INFO: loading form `%s'\n", key); 00400 00401 /* remember never to read this key again */ 00402 hashSet(done, strRegister(key), (Pointer)TRUE); 00403 return dbLoadEntries(records); 00404 } 00405 return TRUE; 00406 } 00407 00408 /* ------------------------------------------------------------------------- 00409 Another wrapper for dbLoadEntries(). 00410 00411 It loads all known lexicon items for words in the list FORMS. This has 00412 exactly the same effect as calling dbLoad() repeatedly, but is faster 00413 because there will be only one file system operation instead of n. 00414 */ 00415 Boolean dbLoadAll(List forms) { 00416 List records = NULL; 00417 List l; 00418 int i = 0; 00419 00420 if(!dbAvailable()) { 00421 return FALSE; 00422 } 00423 for(l = forms; l != NULL; l = listNext(l)) { 00424 String form = listElement(l); 00425 00426 /* detect duplicate query */ 00427 if(hashGet(done, form)) { 00428 continue; 00429 } 00430 00431 records = listAppendList(records, dbGetEntries(form)); 00432 hashSet(done, strRegister(form), (Pointer)TRUE); 00433 if(0 == (++i % 1000)) { 00434 cdgPrintf(CDG_PROGRESS, "%d forms...\n", i); 00435 } 00436 } 00437 cdgPrintf(CDG_INFO, "INFO: loading %d items\n", listSize(records)); 00438 00439 return dbLoadEntries(records); 00440 } 00441 00442 00443 /* ------------------------------------------------------------------------- 00444 * Initialize 00445 */ 00446 00447 void dbInitialize(void) { 00448 done = hashNew( 500, 0.8, 00449 hashStringHashFunction, 00450 hashStringEqualFunction ); 00451 } 00452 00453 /* ------------------------------------------------------------------------- 00454 * finalize 00455 * close the database, delete \b done hash, remove temporary file 00456 */ 00457 void dbFinalize(void) 00458 { 00459 if (dbAvailable()) 00460 dbClose(); 00461 if (tmpFilename) { 00462 remove(tmpFilename); 00463 } 00464 hashDelete(done); 00465 } 00466 00467 00468 /* ------------------------------------------------------------------------- */ 00469 /* -- ENDOFFILE ------------------------------------------------------------ */ 00470 /** @} */

CDG 0.95 (20 Oct 2004)