
lexemgraph.c

/*
 * Copyright (C) 1997-2004 The CDG Team <cdg@nats.informatik.uni-hamburg.de>
 *
 * This file is free software; as a special exception the author gives
 * unlimited permission to copy and/or distribute it, with or without
 * modifications, as long as this notice is preserved.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY, to the extent permitted by law; without even the
 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 *
 */

/* ---------------------------------------------------------------------------
 * @addtogroup Lexemgraph Lexemgraph - maintenance of lexeme graphs
 * @author Ingo Schroeder (see also AUTHORS and THANKS for more)
 * @date 1997-03-04
 *
 * $Id: lexemgraph.c,v 1.140 2004/09/27 17:07:05 micha Exp $
 * @{
 */

/* ------------------------------------------------------------------------- */
#include <config.h>

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include "hook.h"
#include "input.h"
#include "lexemgraph.h"
#include "eval.h"
#include "increment.h"
#include "set.h"
#include "tagger.h"
#include "parse.h"
#include "chunker.h"

/* -- VARIABLES ------------------------------------------------------------ */

/**
   This variable controls whether the levelvalues should be deflated if
   they are equivalent. Usually we want this switched on; only for
   testing a value of FALSE might be appropriate.
*/
Boolean lgCompactLVs=TRUE;

/* ----------------------------------------------------------------------------
   @brief (re-)computes the distance matrix LexemGraph::distance

   This function computes the distance between any two lexeme nodes in @a lg
   and stores the result in @a lg->distance.
*/
void lgComputeDistances(LexemGraph lg)
{
  GraphemNode gn, gnNext;
  int noOfNodes = vectorSize(lg->graphemnodes);
  int i, j, k, ji, ik, jk, ij;

  /* arrayNew initializes the cells with NULL */
  if (!lg->distance) {
    lg->distance = arrayNew(noOfNodes, noOfNodes, 0);
  } else {
    arraySetAllElements(lg->distance, (Pointer) 0);
  }

  for (i = 0; i < noOfNodes; i++) {
    gn = (GraphemNode) vectorElement(lg->graphemnodes, i);

    /* If this gn has been deleted from the problem, leave its distances all
     * at 0. This is necessary because the distance info does double duty as
     * compatibility info. */
    if (lgAreDeletedNodes(lg, gn->lexemes)) {
      continue;
    }
    for (j = 0; j < noOfNodes; j++) {
      gnNext = (GraphemNode) vectorElement(lg->graphemnodes, j);
      if (lgAreDeletedNodes(lg, gnNext->lexemes)) {
        continue;
      }
      if (gn->arc->to == gnNext->arc->from) {
        arraySetElement(lg->distance, (Pointer) 1, i, j);
      }
    }
  }
  /* Warshall's algorithm for transitive closure */
  for (i = 0; i < noOfNodes; i++) {
    for (j = 0; j < noOfNodes; j++) {
      ji = (int)arrayElement(lg->distance, j, i);

      if (ji > 0) {
        for (k = 0; k < noOfNodes; k++) {
          ik = (int)arrayElement(lg->distance, i, k);
          jk = (int)arrayElement(lg->distance, j, k);

          if (ik > 0 && (jk == 0 || ji + ik < jk)) {
            arraySetElement(lg->distance, (Pointer) (ji + ik), j, k);
          }
        }
      }
    }
  }

  for (i = 0; i < noOfNodes; i++) {
    for (j = 0; j < noOfNodes; j++) {
      ij = (int)arrayElement(lg->distance, i, j);
      ji = (int)arrayElement(lg->distance, j, i);

      if (ij == 0 && ji > 0) {
        arraySetElement(lg->distance, (Pointer) (-ji), i, j);
      }
    }
  }
}


/* ----------------------------------------------------------------------------
   @brief computes LexemGraph::noOfPathsFromStart

   This function computes the number of paths leading to @a gn from the start
   of @a lg. If @a gn corresponds to a start node, this is simply the number
   of lexeme nodes sprung from @a gn. Otherwise it is that number multiplied
   by the sum of the numbers of paths leading from the start to immediately
   preceding grapheme nodes. If @a gn is deleted, the number is always zero.
*/
long long computeNoOfPathsFromStart(LexemGraph lg, GraphemNode gn,
                                    long long sofar, long long maximal)
{
  long long i, sum;
  GraphemNode prevNode;
  long long noOfPathsFromStart;

  noOfPathsFromStart = lg->noOfPathsFromStart[gn->no];

  if (noOfPathsFromStart >= 0) {
    return noOfPathsFromStart;
  }

  if (!gn->live) {
    return lg->noOfPathsFromStart[gn->no] = 0;
  }

  if (sofar > maximal) {
    cdgPrintf(CDG_ERROR, "ERROR: lexem lattice contains cycles\n");
    abort();
  }

  if (gn->arc->from == lg->min) {
    return lg->noOfPathsFromStart[gn->no] = 1;
  }
  sum = 0;
  for (i = 0; i < maximal; i++) {
    prevNode = (GraphemNode) vectorElement(lg->graphemnodes, i);

    if (!prevNode->live || prevNode->arc->to != gn->arc->from)
      continue;

    sum += computeNoOfPathsFromStart(lg, prevNode, sofar + 1, maximal);

    if (sum < 0) {
      cdgPrintf(CDG_ERROR,
                "PANIC: integer overflow in computeNoOfPathsFromStart()!\n");
      abort();
    }
  }
  return lg->noOfPathsFromStart[gn->no] = sum;
}
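/* A small worked example of the two path counters defined above and below
 * (purely illustrative; not used by the code): in the lattice
 *
 *    0 --der--> 1 --alte--> 2 --Mann--> 3
 *               1 --Alte--> 2
 *
 * the grapheme node for `der' is a start node, so its noOfPathsFromStart is
 * 1; `alte' and `Alte' each inherit that 1; `Mann' sums its two predecessors
 * to 2. Counting from the other end, noOfPathsToEnd is 1 for `Mann', 1 for
 * `alte' and `Alte', and 2 for `der'. lgComputeNoOfPaths() then sums
 * noOfPathsToEnd over all live start nodes, giving lg->noOfPaths == 2. */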
/* ----------------------------------------------------------------------------
   @brief computes LexemGraph::noOfPathsToEnd

   This function computes the number of paths leading from @a gn to the end of
   @a lg. If @a gn corresponds to an end node, this is simply the number of
   lexeme nodes sprung from @a gn. Otherwise it is that number multiplied by
   the sum of the numbers of paths leading to the end from immediately
   following grapheme nodes. If @a gn is deleted, the number is always zero.
*/
long long computeNoOfPathsToEnd(LexemGraph lg, GraphemNode gn,
                                long long sofar, long long maximal)
{
  long long i, sum;
  GraphemNode nextNode;
  long long noOfPathsToEnd;

  noOfPathsToEnd = lg->noOfPathsToEnd[gn->no];

  if (noOfPathsToEnd >= 0)
    return noOfPathsToEnd;

  if (!gn->live) {
    return lg->noOfPathsToEnd[gn->no] = 0;
  }

  if (sofar > maximal) {
    cdgPrintf(CDG_ERROR, "ERROR: lexem lattice contains cycles\n");
    abort();
  }

  if (gn->arc->to == lg->max) {
    return lg->noOfPathsToEnd[gn->no] = 1;
  }

  sum = 0;
  for (i = 0; i < maximal; i++) {
    nextNode = (GraphemNode) vectorElement(lg->graphemnodes, i);

    if (!nextNode->live || nextNode->arc->from != gn->arc->to)
      continue;

    sum += computeNoOfPathsToEnd(lg, nextNode, sofar + 1, maximal);
    if (sum < 0) {
      cdgPrintf(CDG_ERROR,
                "PANIC: integer overflow in computeNoOfPathsToEnd()!\n");
      abort();
    }
  }
  return lg->noOfPathsToEnd[gn->no] = sum;
}

/* ----------------------------------------------------------------------------
   @brief computes # of paths possible in the graph, given the current state
   of deletions.

   This function computes the number of paths possible in @a lg, according to
   the state of its Vector LexemGraph::isDeletedNode. It calls
   computeNoOfPathsToEnd() and computeNoOfPathsFromStart() for each grapheme
   node. The total number of all paths is the sum of all numbers of paths
   leading to grapheme nodes that are end nodes.
*/
void lgComputeNoOfPaths(LexemGraph lg)
{
  int i;
  LexemNode ln;
  GraphemNode gn;
  List l;
  int noOfNodes = vectorSize(lg->graphemnodes);

  /* append cells needed by iSearch */
  for (i = bvSize(lg->isDeletedNode); i < vectorSize(lg->nodes); i++) {
    bvSetElement(lg->isDeletedNode, FALSE, i);
  }

  /* recompute ambiguities of each word */
  for (i = 0; i < vectorSize(lg->graphemnodes); i++) {
    gn = vectorElement(lg->graphemnodes, i);
    gn->live = FALSE;
    for (l = gn->lexemes; l != NULL; l = listNext(l)) {
      ln = listElement(l);
      if (!bvElement(lg->isDeletedNode, ln->no)) {
        gn->live = TRUE;
        break;
      }
    }
  }

  /* maybe allocate the arrays */
  if (!lg->noOfPathsFromStart) {
    lg->noOfPathsFromStart = memMalloc(sizeof (long long) * noOfNodes);
  }
  if (!lg->noOfPathsToEnd) {
    lg->noOfPathsToEnd = memMalloc(sizeof (long long) * noOfNodes);
  }

  /* no paths from start/to end yet */
  for (i = 0; i < vectorSize(lg->graphemnodes); i++) {
    lg->noOfPathsFromStart[i] = -1;
    lg->noOfPathsToEnd[i] = -1;
  }

  /* compute # of paths for all nodes */
  for (i = 0; i < vectorSize(lg->graphemnodes); i++) {
    gn = (GraphemNode) vectorElement(lg->graphemnodes, i);

    if (lg->noOfPathsFromStart[gn->no] < 0) {
      computeNoOfPathsFromStart(lg, gn, 0, vectorSize(lg->graphemnodes));
    }
    if (lg->noOfPathsToEnd[gn->no] < 0) {
      computeNoOfPathsToEnd(lg, gn, 0, vectorSize(lg->graphemnodes));
    }
  }

  /* count # of paths in lg */
  lg->noOfPaths = 0;
  for (i = 0; i < vectorSize(lg->graphemnodes); i++) {
    gn = (GraphemNode) vectorElement(lg->graphemnodes, i);

    if (!gn->live || gn->arc->from
!= lg->min) 00290 continue; 00291 00292 lg->noOfPaths += lg->noOfPathsToEnd[gn->no]; 00293 } 00294 00295 } 00296 00297 /* ---------------------------------------------------------------------------- 00298 @brief partitions a set of lexeme nodes into equivalence classes 00299 00300 This function partitions the set of lexeme nodes of @a gn into equivalence 00301 classes. The equivalence relation used is the function inputCompareLeByAtts() 00302 with the argument @a features. The function returns a new List of new lists 00303 of lexemes. (The latter are re-used in ConstraintNode structures, the former 00304 are deallocated by cnBuildNodes().) 00305 */ 00306 List lgPartitions(GraphemNode gn, BitString features) 00307 { 00308 00309 /** @code 00310 00311 result = []; 00312 00313 FOR each lexeme l: 00314 IF l fits into one of the known classes, 00315 insert l there; 00316 ELSE 00317 create new class [lexem]; 00318 insert the new class into result; 00319 FI 00320 ROF 00321 00322 return result. 00323 00324 @endcode 00325 */ 00326 00327 List result, p; 00328 List q; 00329 List class; 00330 LexemNode ln; 00331 LexiconItem lexeme1 = NULL, lexeme2 = NULL; 00332 00333 result = NULL; 00334 00335 /* partitioning the empty set 00336 yields the set with one class: { {} } */ 00337 if(!gn) { 00338 return listPrependElement(NULL, NULL); 00339 } else if (gn == NONSPEC) { 00340 return listPrependElement(NULL, NONSPEC); 00341 } 00342 #ifdef DEBUGPARTITION 00343 cdgPrintf(CDG_DEBUG, 00344 "DEBUG: about to classify %d lexemes\n", 00345 listSize(gn->lexemes)); 00346 00347 for (p = gn->lexemes; p != NULL; p = listNext(p)) { 00348 cdgPrintf(CDG_DEBUG, "%s ", ((LexemNode) listElement(p))->lexem->description); 00349 } 00350 cdgPrintf(CDG_DEBUG, "\n"); 00351 #endif 00352 00353 /* partitioning a normal set yields sets of equivalent lexemes: 00354 { { das_ART_nom,das_ART_acc }, { das_PREL_nom,das_PREL_acc} } */ 00355 for (p = gn->lexemes; p != NULL; p = listNext(p)) { 00356 ln = (LexemNode) listElement(p); 00357 lexeme1 = ln->lexem; 00358 00359 #ifdef DEBUGPARTITION 00360 cdgPrintf(CDG_DEBUG, "DEBUG: classifying lexeme %s\n", lexeme1->description); 00361 #endif 00362 00363 /* compare to the first item in each class */ 00364 00365 for (q = result; q != NULL; q = listNext(q)) { 00366 class = (List) listElement(q); 00367 lexeme2 = ((LexemNode) listElement(class))->lexem; 00368 00369 #ifdef DEBUGPARTITION 00370 cdgPrintf(CDG_DEBUG, "DEBUG: comparing to lexeme %s... 
", lexeme2->description); 00371 #endif 00372 00373 /* insert into correct class */ 00374 if (lgCompactLVs && 00375 !inputCompareLeByAtts(lexeme1, lexeme2, features)) { 00376 #ifdef DEBUGPARTITION 00377 cdgPrintf(CDG_DEBUG, "equivalent.\n"); 00378 cdgPrintf(CDG_DEBUG, "DEBUG: adding lexeme to this class.\n"); 00379 #endif 00380 class = listAppendElement(class, ln); 00381 break; 00382 } else { 00383 #ifdef DEBUGPARTITION 00384 cdgPrintf(CDG_DEBUG, "differs.\n"); 00385 #endif 00386 } 00387 } 00388 00389 /* open new equivalence class */ 00390 00391 if (q == NULL) { 00392 00393 #ifdef DEBUGPARTITION 00394 cdgPrintf(CDG_DEBUG, "DEBUG: creating new equivalence class.\n"); 00395 #endif 00396 00397 class = listPrependElement(NULL, ln); 00398 result = listAppendElement(result, class); 00399 00400 } 00401 #ifdef DEBUGPARTITION 00402 cdgPrintf(CDG_DEBUG, "DEBUG: partition now looks like this: "); 00403 for (q = result; q != NULL; q = listNext(q)) 00404 cdgPrintf(CDG_DEBUG, "%d ", listSize(listElement(q))); 00405 cdgPrintf(CDG_DEBUG, "\n"); 00406 #endif 00407 00408 } 00409 00410 #ifdef DEBUGPARTITION 00411 cdgPrintf(CDG_DEBUG, "DEBUG: %d lexemes partitioned into %d classes.\n\n", 00412 listSize(gn->lexemes), listSize(result)); 00413 #endif 00414 00415 00416 return (result); 00417 00418 } 00419 00420 00421 /* ---------------------------------------------------------------------------- 00422 @brief initializes the lexemgraph 00423 00424 This function returns a new LexemGraph structure with all fields initialized 00425 to meaningless values. In particular, it contains no nodes whatsoever. 00426 */ 00427 LexemGraph lgNewInit() 00428 { 00429 LexemGraph lg; 00430 00431 lg = (LexemGraph) memMalloc(sizeof(LexemGraphStruct)); 00432 lg->lattice = NULL; 00433 lg->graphemnodes = vectorNew(20); 00434 lg->max = 0; 00435 lg->min = INT_MAX; 00436 lg->distance = NULL; 00437 lg->isDeletedNode = bvNew(20); 00438 lg->noOfPathsFromStart = NULL; 00439 lg->noOfPathsToEnd = NULL; 00440 lg->nodes = vectorNew(20); 00441 lg->tags = NULL; 00442 lg->chunks = NULL; 00443 return lg; 00444 } 00445 00446 00447 /* ---------------------------------------------------------------------------- 00448 @brief Insert lexeme nodes into the LexemGraph that correspond the Arc. 00449 00450 This function builds all possible lexeme nodes for the specific 00451 @a arc and adds them to @a lg. It fails if there is no 00452 matching entry in the lexicon. 00453 */ 00454 Boolean lgNewIter(LexemGraph lg, Arc arc) 00455 { 00456 GraphemNode gn; 00457 LexemNode ln; 00458 LexiconItem le; 00459 List j, entries = NULL; 00460 00461 /** Maybe undo capitalisation introduced by orthographic convention. 00462 00463 If the written word is uppercase, but that uppercase-ness is suspect 00464 because it is at the start of a phrase and might be mere orthographic 00465 convention, we have to decide which version we use for lexicon lookup. 00466 00467 If our lexicon contains items for the lower-case version but none for 00468 the upper-case versions, we use only those; if it contains only 00469 items for the upper-case version, we use those; and if it contains 00470 neither, we allow both and hope that there is a lexical template which 00471 will catch this word. 
00472 00473 We do not use the obvious solution - look up both versions whenever a 00474 word is spurious - because it has the following defect: If a sentence 00475 starts with `Der', some naive lexical template could introduce a noun 00476 reading, and if POS tagging allows, it might actually survive even 00477 though it is exceedingly unlikely. Since we do not want this, we 00478 effectively force the reading to be `der'. 00479 00480 Moral: If you really need to have open-class items in your lexicon 00481 that are near-homonymous with closed-class items, you can bloody well 00482 write proper lexicon items for them and not templates. */ 00483 00484 if(lgSpuriousUppercase(lg, arc)) { 00485 String low = strCopy(arc->word); 00486 Boolean have_upper, have_lower; 00487 00488 low[0] = tolower(low[0]); 00489 have_upper = inputLexiconQuery(arc->word); 00490 have_lower = inputLexiconQuery(low); 00491 00492 if(have_upper && !have_lower) { 00493 entries = listAppendList(entries, inputLexiconGet(arc->word)); 00494 } 00495 else if(have_lower && !have_upper) { 00496 entries = listAppendList(entries, inputLexiconGet(low)); 00497 arc->word = strCopy(low); 00498 } else { 00499 entries = listAppendList(entries, inputLexiconGet(low)); 00500 entries = listAppendList(entries, inputLexiconGet(arc->word)); 00501 } 00502 memFree(low); 00503 } 00504 00505 /** Much the same goes for words in ALL UPPER CAPS, except that those can 00506 occur anywhere in a sentence, not only at the start, and we have to 00507 check three different spellings instead of two. */ 00508 else if(inputALLUPPER(arc->word)) { 00509 String low = strCopy(arc->word); 00510 String startUpper = strCopy(arc->word); 00511 Boolean have_upper, have_lower, have_startUpper; 00512 int i; 00513 for (i=0; i < strlen((unsigned char *)arc->word); i++){ 00514 low[i] = tolower(low[i]); 00515 } 00516 for (i=1; i < strlen((unsigned char *)arc->word); i++){ 00517 startUpper[i] = tolower(low[i]); 00518 } 00519 00520 have_lower = inputLexiconQuery(low); 00521 have_startUpper = inputLexiconQuery(startUpper); 00522 have_upper = inputLexiconQuery(arc->word); 00523 00524 /** In one-letter words, the intermediate version is 00525 indistinguishable from the third one, so we suppress it. 
*/ 00526 if(1 == strlen(arc->word)) { 00527 have_startUpper = FALSE; 00528 } 00529 00530 if( have_lower ){ 00531 entries = listAppendList(entries, inputLexiconGet(low)); 00532 } 00533 if( have_startUpper ){ 00534 entries = listAppendList(entries, inputLexiconGet(startUpper)); 00535 } 00536 if( have_upper ){ 00537 entries = listAppendList(entries, inputLexiconGet(arc->word)); 00538 } 00539 if(!have_lower && !have_startUpper && !have_upper){ 00540 entries = listAppendList(entries, inputLexiconGet(low)); 00541 entries = listAppendList(entries, inputLexiconGet(startUpper)); 00542 entries = listAppendList(entries, inputLexiconGet(arc->word)); 00543 } 00544 00545 if(have_startUpper && !have_upper && !have_lower) { 00546 arc->word = strCopy(startUpper); 00547 } 00548 else if(have_lower && !have_upper && !have_startUpper) { 00549 arc->word = strCopy(low); 00550 } 00551 else if(!have_upper) { 00552 arc->word = strCopy(startUpper); 00553 } 00554 00555 memFree(startUpper); 00556 memFree(low); 00557 } 00558 00559 else { 00560 entries = inputLexiconGet(arc->word); 00561 } 00562 00563 if (!entries) { 00564 cdgPrintf(CDG_WARNING, "WARNING: no lexical entry for `%s'.\n", arc->word); 00565 return FALSE; 00566 } 00567 00568 lg->min = min(lg->min, arc->from); 00569 lg->max = max(lg->max, arc->to); 00570 00571 gn = (GraphemNode) memMalloc(sizeof (GraphemNodeStruct)); 00572 gn->no = vectorAddElement(lg->graphemnodes, gn); 00573 gn->lexemgraph = lg; 00574 gn->arc = arc; 00575 gn->lexemes = NULL; 00576 gn->chunk = NULL; 00577 00578 /* force consistent order among homonyms */ 00579 j = listSort(entries, leCompareByName); 00580 listDelete(entries); 00581 entries = j; 00582 00583 for (j = entries; j; j = listNext(j)) { 00584 le = (LexiconItem) listElement(j); 00585 ln = (LexemNode) memMalloc(sizeof (LexemNodeStruct)); 00586 ln->no = vectorAddElement(lg->nodes, ln); 00587 ln->lexemgraph = lg; 00588 ln->arc = gn->arc; 00589 ln->lexem = le; 00590 ln->tagscore = 1.0; 00591 ln->limit = 1.0; 00592 bvSetElement(lg->isDeletedNode, FALSE, ln->no); 00593 00594 /* link the nodes to each other */ 00595 ln->grapheme = gn; 00596 gn->lexemes = listAppendElement(gn->lexemes, ln); 00597 } 00598 listDelete(entries); 00599 00600 return TRUE; 00601 } 00602 00603 /* ---------------------------------------------------------------------------- 00604 @brief does the final computations for the lexemgraph 00605 00606 This function sets those fields of @a lg that can only be computed after all 00607 lexeme nodes are present: 00608 - It initializes LexemGraph::isDeletedNode to all FALSE. 00609 - It uses lgComputeDistances() to compute the distance 00610 between any to nodes in the graph. 00611 - It applies lgComputeNoOfPaths() to compute the number of paths to and from 00612 all lexeme nodes in the graph. If no complete path exists, @a lg is 00613 deallocated with a warning. 00614 00615 The function can fail returning FALSE if there is no valid path through the 00616 lexeme graph. 00617 */ 00618 Boolean lgNewFinal(LexemGraph lg) 00619 { 00620 int noOfNodes; 00621 00622 /* compute noOfPathsFromStart and noOfPathsToEnd */ 00623 noOfNodes = vectorSize(lg->graphemnodes); 00624 lgComputeNoOfPaths(lg); 00625 00626 /* no path? 
     */
  if (lg->noOfPaths == 0) {
    cdgPrintf(CDG_WARNING, "WARNING: no path possible, invalid graph\n");
    return FALSE;
  }
  bvSetAllElements(lg->isDeletedNode, FALSE);
  lgComputeDistances(lg);

  /* build tags cheat sheet */
  {
    int i;
    AnnoEntry ae = findAnnoForLattice(lg->lattice, TRUE);
    List l, m;

    if (ae) {
      lg->tags = vectorNew(lg->max);
      for (i = 0; i < lg->max; i++) {
        vectorAddElement(lg->tags, strRegister(""));
      }

      for (l = ae->annos; l != NULL; l = listNext(l)) {
        Annotation a = listElement(l);

        for (m = a->specs; m != NULL; m = listNext(m)) {
          Specification s = listElement(m);

          if (s->type == STTag && strRegister("cat") == s->kind) {
            /* CAUTION: this does not work reliably on wordgraphs */
            vectorSetElement(lg->tags, s->name, a->from);
          }
        }
      }
    } else {
      lg->tags = NULL;
    }
  }

#ifdef DEBUGLGNEW
  lgPrint(CDG_INFO, lg);
#endif

  return TRUE;
}

/* ----------------------------------------------------------------------------
   This function creates a lexeme graph from a Lattice @a lat and a cdg
   lexicon. For each arc of the lattice a grapheme node is allocated and
   annotated with all possible lexical entries. (If there is no lexical entry
   for an arc, a warning is given, but processing continues.)

   For each grapheme node, as many lexeme nodes are created as there are
   lexical alternatives in the lexicon.

   Furthermore:
   - LexemGraph::isDeletedNode is initialized to FALSE
   - LexemGraph::noOfPathsFromStart and LexemGraph::noOfPathsToEnd are
     computed using lgComputeNoOfPaths().
   - LexemGraph::distance is computed using lgComputeDistances().
*/
LexemGraph lgNew(Lattice lat)
{
  LexemGraph lg = lgNewInit();
  List l;

  lg->lattice = lat;
  for (l = lat->arcs; l; l = listNext(l)) {
    lgNewIter(lg, (Arc) listElement(l));
  }

  if (!lgNewFinal(lg)) {
    lgDelete(lg);
    lg = NULL;
  }

  return lg;
}

/* ----------------------------------------------------------------------------
   @brief print lexeme graph

   This function displays a textual representation of the lexeme graph @a lg.
*/
void lgPrint(long unsigned int mode, LexemGraph lg)
{
  Lattice lat = lg->lattice;
  int i;
  LexemNode ln;
  Arc a;
  Boolean deleted;

  cdgPrintf(mode, "\n%s : \n", lat->id);
  for (i = 0; i < vectorSize(lg->nodes); i++) {
    ln = (LexemNode) vectorElement(lg->nodes, i);
    deleted = bvElement(lg->isDeletedNode, ln->no);
    a = ln->arc;

    cdgPrintf(mode, "%%%d:%s%s(%d-%d)%s %4.3e\n",
              ln->no,
              deleted ? "[" : " ",
              ln->lexem->description,
              a->from, a->to, deleted ? "]" : " ", ln->tagscore);
  }

  if (lg->chunks) {
    cdgPrintf(mode, "chunks:\n");
    chunkerPrintChunks(mode, lg->chunks);
  }
}
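/* For orientation, the listing produced by lgPrint() above looks roughly like
 * this for a small two-word lattice (identifiers invented for illustration;
 * `[...]' marks deleted lexeme nodes):
 *
 *   sent1 :
 *   %0: der_ART(0-1)  9.123e-01
 *   %1:[der_PREL(0-1)] 8.770e-02
 *   %2: Mann_NN(1-2)  1.000e+00
 */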
/* ----------------------------------------------------------------------------
   @brief returns TRUE if node is a start node

   This function checks whether @c n->arc->from is equal to the minimal time
   point in the lexeme graph.
*/
Boolean lgIsStartNode(GraphemNode n)
{
  if(n == NULL || n == NONSPEC)
    return (FALSE);
  else
    return (n->arc->from == n->lexemgraph->min);
}

/* ---------------------------------------------------------------------------
   @brief returns TRUE if node is an end node

   This function checks whether @c n->arc->to is equal to the maximal time
   point in the lexeme graph.

   @todo should nodes on segment boundaries of incrementally parsed input be
   considered to be end nodes?
*/
Boolean lgIsEndNode(GraphemNode n)
{
  if(n == NULL || n == NONSPEC)
    return (FALSE);
  else
    return (n->arc->to == n->lexemgraph->max);
}

/* ----------------------------------------------------------------------------
   @brief returns a distance measure for two lexeme nodes

   This function computes the logical distance between @a a and @a b, measured
   in words. Usually this is just the corresponding element of
   LexemGraph::distance. If either of the nodes is underspecified it is
   treated as if it followed the latest specified lexeme node directly. Hence,
   the return value may be greater than the value in LexemGraph::distance. Two
   underspecified lexeme nodes are considered to have distance zero.
*/
int lgDistanceOfNodes(LexemGraph lg, LexemNode a, LexemNode b)
{
  int maxDistance, i, maxDim, distance;

  if (a == NULL || b == NULL || lg == NULL) {
    cdgPrintf(CDG_ERROR, "ERROR: lgDistanceOfNodes: argument is NULL\n");
    abort();
  }
  if (a == NONSPEC) {
    if (b == NONSPEC)
      return 0;

    maxDim = arrayDimension(lg->distance, 1);
    maxDistance = 0;
    for (i = 0; i < maxDim; i++) {
      distance = (int)arrayElement(lg->distance, b->grapheme->no, i);
      if (maxDistance < distance) {
        maxDistance = distance;
      }
    }
    return -maxDistance - 1;
  }

  if (b == NONSPEC) {
    maxDim = arrayDimension(lg->distance, 1);
    maxDistance = 0;
    for (i = 0; i < maxDim; i++) {
      distance = (int)arrayElement(lg->distance, a->grapheme->no, i);
      if (maxDistance < distance) {
        maxDistance = distance;
      }
    }
    return maxDistance + 1;
  }

  return ((int)arrayElement(lg->distance, a->grapheme->no, b->grapheme->no));
}
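/* Illustration of the distance measure above (assuming a strictly linear
 * three-word graph w1 w2 w3): lgDistanceOfNodes(lg, w1, w2) == 1,
 * (w1, w3) == 2 and (w3, w1) == -2, while (w1, w1) == 0. With NONSPEC as one
 * argument the result is one more than the largest forward distance of the
 * other node, so (w1, NONSPEC) == 3 and (NONSPEC, w1) == -3 in this graph. */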
/* ----------------------------------------------------------------------------
   @brief may these words modify each other?

   \pre @a lexemes must be a List of lexeme nodes with identical time spans.
   If this is not the case, the behaviour is undefined.

   This function checks whether a LevelValue can exist with the modifiers @a
   lexemes and a modifiee from @a gn. This is the case iff both can coexist on
   one path and do not overlap.
*/
Boolean lgMayModify(LexemGraph lg, GraphemNode down, GraphemNode up)
{
  LexemNode lna, lnb;
  if(!spec(up)) {
    return TRUE;
  }
  lna = listElement(down->lexemes);
  lnb = listElement(up->lexemes);

  return (lgDistanceOfNodes(lg, lna, lnb) != 0);
}

/* ---------------------------------------------------------------------------
   @brief returns TRUE if lexeme nodes a and b exist on one path.

   This function checks whether, in principle, a complete path can exist
   through @a lg that includes both @a a and @a b. This is independent of the
   current state of deletions. In fact, the function merely checks whether the
   nodes are identical or their distance is not 0, using lgDistanceOfNodes().
   Note that two nodes are not automatically compatible merely because they do
   not overlap in time. Also, a lexeme node is always compatible with itself
   by this definition.
*/
Boolean lgCompatibleNodes(LexemGraph lg, LexemNode a, LexemNode b)
{
  if (lg == NULL || a == NULL || b == NULL) {
    cdgPrintf(CDG_ERROR, "ERROR: lgCompatibleNodes: argument is NULL\n");
    abort();
  }
  return (a == b || lgDistanceOfNodes(lg, a, b) != 0);
}

/* ----------------------------------------------------------------------------
   @brief does the existence of these lexemes exclude that lexeme?

   \pre @a lexemes must be a List of lexeme nodes with identical time spans.
   If this is not the case, the behaviour is undefined.

   This function checks whether the List @a lexemes and the lexeme node @a ln
   can be selected in a solution. This is the case if either of the following
   holds:
   - @a lexemes is empty
   - @a ln is compatible with the first element of @a lexemes
   - @a lexemes contains @a ln

   In these cases FALSE is returned (@a ln is not forbidden). Otherwise TRUE
   is returned.
*/
Boolean lgForbiddenBy(LexemGraph lg, LexemNode ln, List lexemes)
{
  if (!spec(lexemes))
    return FALSE;

  if (!lgCompatibleNodes(lg, ln, (LexemNode) listElement(lexemes))
      && !listContains(lexemes, ln)) {
    return (TRUE);
  } else {
    return (FALSE);
  }
}


/* ----------------------------------------------------------------------------
   @brief checks if these sets of lexemes are compatible, i.e. either
   unrelated or intersecting

   \pre Both @a a and @a b must be Lists of lexeme nodes spanning the same
   respective time interval. If this is not the case, the behaviour is
   undefined.

   This function checks whether both sets of lexeme nodes may be selected in a
   solution. This is defined as follows:
   - If either set is empty, the result is TRUE
   - If the first elements of @a a and @a b are compatible, the result is TRUE
   - If the sets intersect, the result is TRUE
   - Otherwise the result is FALSE
*/
Boolean lgCompatibleSets(LexemGraph lg, List a, List b)
{

  LexemNode lna, lnb;
  List p;

  /* empty sets are compatible */

  if (a == NULL || b == NULL)
    return (TRUE);

  lna = (LexemNode) listElement(a);
  lnb = (LexemNode) listElement(b);

  /* unrelated sets are compatible */

  if (lgCompatibleNodes(lg, lna, lnb))
    return (TRUE);

  /* if we get here, the only chance for compatibility
     is for the lists to intersect. This is only possible
     if their elements are simultaneous. */

  if(lna->arc->from != lnb->arc->from ||
     lna->arc->to != lnb->arc->to)
    return FALSE;

  /* OK, the sets are simultaneous. Do they actually intersect?
*/ 00931 00932 for (p = a; p != NULL; p = listNext(p)) { 00933 if (listContains(b, listElement(p))) 00934 return (TRUE); 00935 } 00936 00937 /* mutually exclusive sets are not compatible */ 00938 00939 return (FALSE); 00940 00941 } 00942 00943 /* ---------------------------------------------------------------------------- 00944 * This checks if a lexem node has been deleted TRUE is returned, if not 00945 * FALSE. 00946 */ 00947 Boolean lgIsDeletedNode(LexemGraph lg, LexemNode ln) 00948 { 00949 return (bvElement(lg->isDeletedNode, ln->no)); 00950 } 00951 00952 00953 /* ---------------------------------------------------------------------------- 00954 * This checks if the @a lexemes have been deleted TRUE is returned, if not 00955 * FALSE 00956 */ 00957 Boolean lgAreDeletedNodes(LexemGraph lg, List lexemes) 00958 { 00959 List l; 00960 00961 for (l = lexemes; l != NULL; l = listNext(l)) { 00962 if (!lgIsDeletedNode(lg, (LexemNode) listElement(l))) 00963 return FALSE; 00964 } 00965 return TRUE; 00966 } 00967 00968 /* ---------------------------------------------------------------------------- 00969 This function checks whether all lexeme nodes passed in @a lexemes can 00970 be deleted at the same time. This is the case if doing so will leave at 00971 least one complete path though the lexeme graph, according to the current 00972 state of deletions. For this end, the function checks whether the sum of the 00973 number of paths through each lexeme node is smaller than the total number of 00974 paths in @a lg. 00975 00976 @pre @a lexemes must be a List of lexeme nodes with identical time spans. If 00977 this is not the case, the behaviour is undefined. 00978 */ 00979 Boolean lgAreDeletableNodes(LexemGraph lg, List lexemes) 00980 { 00981 int affectedPaths = 0; 00982 List p; 00983 LexemNode ln; 00984 00985 /* The lexem graph can only tear if we are trying to delete all of 00986 the remaining nodes of one grapheme node. So we first check this 00987 condition. */ 00988 GraphemNode gn = ((LexemNode)listElement(lexemes))->grapheme; 00989 int noRemaining, noToDelete; 00990 00991 /* the number of undeleted lexeme nodes for the grapheme node */ 00992 noRemaining = 0; 00993 for (p = gn->lexemes; p != NULL; p = listNext(p)) { 00994 ln = listElement(p); 00995 if(!bvElement(lg->isDeletedNode, ln->no)) { 00996 noRemaining++; 00997 } 00998 } 00999 if(noRemaining == 0) { 01000 return TRUE; 01001 } 01002 01003 /* the number of lexemes that would be deleted */ 01004 noToDelete = 0; 01005 for(p = lexemes; p != NULL; p = listNext(p)) { 01006 ln = listElement(p); 01007 if(!bvElement(lg->isDeletedNode, ln->no)) { 01008 noToDelete++; 01009 } 01010 } 01011 01012 if(noToDelete < noRemaining) { 01013 return TRUE; 01014 } 01015 01016 /* Now we know that the user wants do delete an entire grapheme 01017 node. This is allowed iff not all paths in the _grapheme_ graph 01018 go through gn. */ 01019 01020 affectedPaths = 01021 01022 /* # of paths leading here */ 01023 lg->noOfPathsFromStart[gn->no] 01024 /* times # of paths leaving here */ 01025 * lg->noOfPathsToEnd[gn->no]; 01026 01027 /* this can't happen */ 01028 if(affectedPaths > lg->noOfPaths) { 01029 cdgPrintf(CDG_ERROR, "PANIC: node #%d supports more paths than exist!\n", gn->no); 01030 abort(); 01031 } 01032 01033 return (affectedPaths != lg->noOfPaths); 01034 01035 } 01036 01037 /* ---------------------------------------------------------------------------- 01038 @brief deletes a node from the lexeme graph itself. 

   This function marks a lexeme node as deleted. It does this by setting the
   cell @a ln->no in the Vector @a lg->isDeletedNode. If this destroys the
   last possible path through @a lg, a warning is displayed. This function
   always re-computes the number of remaining paths in @a lg.
*/
void lgDeleteNode(LexemGraph lg, LexemNode ln)
{

  int oldNo = lg->noOfPaths;

  /* already deleted, ok */
  if (bvElement(lg->isDeletedNode, ln->no))
    return;

  bvSetElement(lg->isDeletedNode, TRUE, ln->no);
  lgComputeNoOfPaths(lg);

  if (lg->noOfPaths == 0 && oldNo > 0) {
    cdgPrintf(CDG_WARNING,
              "WARNING: lgDeleteNode: deletion of node `%s' destroys last path\n",
              ln->lexem->description);
  }

  /* have to do this because two nodes may become more distant by the
   * deletion of the shortest connecting path */
  lgComputeDistances(lg);
}

/* ----------------------------------------------------------------------------
   @brief delete a list of lexeme nodes

   This function behaves as if lgDeleteNode() were called on each element of
   @a nodes, but it is more efficient since it only re-computes the number of
   remaining paths once.
*/
void lgDeleteNodes(LexemGraph lg, List nodes)
{
  List p;
  int oldNo = lg->noOfPaths;
  LexemNode ln = NULL;

  /* So we inline the relevant code into a loop here... */
  for (p = nodes; p != NULL; p = listNext(p)) {
    ln = listElement(p);
    bvSetElement(lg->isDeletedNode, TRUE, ln->no);
  }

  /* ...and then do one call to lgComputeNoOfPaths(). */
  lgComputeNoOfPaths(lg);
  if (oldNo > 0 && lg->noOfPaths == 0) {
    cdgPrintf(CDG_WARNING,
              "WARNING: lgDeleteNodes: deletion of node `%s' destroys last path\n",
              ln->lexem->description);
  }

  /* have to do this because two nodes may become more distant by the
   * deletion of the shortest connecting path */
  lgComputeDistances(lg);
}


/* ----------------------------------------------------------------------------
   @brief do the lexemes span the same time interval?

   This function checks whether @a a and @a b cover the same time span. An
   argument of NONSPEC always causes TRUE to be returned. However, the NULL
   node is not simultaneous to any lexeme node, not even to another root node.
*/
Boolean lgSimultaneous(LexemNode a, LexemNode b)
{

  if (a == NULL && b != NULL) {
    cdgPrintf(CDG_WARNING,
              "WARNING: lgSimultaneous: first argument must not be NULL\n");
    cdgPrintf(CDG_WARNING,
              "WARNING: the other argument was `%s'\n", b->lexem->word);
  }
  if (b == NULL && a != NULL) {
    cdgPrintf(CDG_WARNING,
              "WARNING: lgSimultaneous: second argument must not be NULL\n");
    cdgPrintf(CDG_WARNING,
              "WARNING: the other argument was `%s'\n", a->lexem->word);
  }
  if (a == NULL && b == NULL)
    cdgPrintf(CDG_WARNING,
              "WARNING: lgSimultaneous: both arguments must not be NULL\n");

  if(a == NONSPEC || b == NONSPEC)
    return(TRUE);
  else if(a == NULL || b == NULL)
    return (FALSE);
  else
    return (a->arc->from == b->arc->from
            && a->arc->to == b->arc->to);
}

/* ----------------------------------------------------------------------------
   @brief is this lexeme a member of this set?

   \pre @a lexemes must be a List of lexeme nodes with identical time spans.
   If this is not the case, the behaviour is undefined.

   This function checks whether @a ln is an element of @a lexemes.

   In the following cases @a ln is not a member (return FALSE):
   - a NIL binding (@a ln is NULL) is not an element of anything
   - an empty set (@a lexemes is NULL) has no members
   - @a ln belongs to another timespan
   - @a ln is not contained literally in the set

   Otherwise TRUE is returned.
*/
Boolean lgMember(LexemNode ln, List lexemes)
{
  if (ln == NULL)
    return (FALSE);

  if (lexemes == NULL)
    return (FALSE);

  if (!lgSimultaneous(((LexemNode) listElement(lexemes)), ln))
    return (FALSE);

  return (listContains(lexemes, ln));
}

/* ----------------------------------------------------------------------------
   This function checks whether @a a is a subset of @a b.

   \pre Both @a a and @a b must be Lists of lexeme nodes spanning the same
   respective time interval. If this is not the case, the behaviour is
   undefined.
*/
Boolean lgSubset(List a, List b)
{

  List l;
  LexemNode ln;

  /* empty set is always a subset */
  if (a == NULL)
    return (TRUE);

  /* empty set has no proper subset */
  if (b == NULL)
    return (FALSE);

  /* if a belongs to a different timespan, it's not a subset */
  if (!lgSimultaneous(((LexemNode) listElement(a)), (LexemNode) listElement(b)))
    return (FALSE);

  /* find it out the hard way */
  for (l = a; l != NULL; l = listNext(l)) {
    ln = (LexemNode) listElement(l);
    if (!lgMember(ln, b))
      return (FALSE);
  }

  return (TRUE);

}

/* ----------------------------------------------------------------------------
   @brief Do two lexeme lists intersect?

   \pre Both @a a and @a b must be Lists of lexeme nodes spanning the same
   respective time interval. If this is not the case, the behaviour is
   undefined.

   This function checks whether @a a and @a b intersect.
01210 */ 01211 Boolean lgIntersectingSets(List a, List b) 01212 { 01213 01214 List l; 01215 LexemNode ln; 01216 01217 if (a == NULL && b != NULL) { return (FALSE); } 01218 01219 if (a != NULL && b == NULL) { return (FALSE); } 01220 01221 if (a == NULL && b == NULL) { return (TRUE); } 01222 01223 /* if a belongs to a different timespan, no intersection */ 01224 if (!lgSimultaneous(((LexemNode) listElement(a)), (LexemNode) listElement(b))) { 01225 return (FALSE); 01226 } 01227 01228 /* find it out the hard way */ 01229 for (l = a; l != NULL; l = listNext(l)) { 01230 ln = (LexemNode) listElement(l); 01231 if (listContains(b, ln)) { 01232 return (TRUE); 01233 } 01234 } 01235 01236 return (FALSE); 01237 01238 } 01239 01240 /* ---------------------------------------------------------------------------- 01241 Takes a set of lexeme nodes, and extends it to a complete path through the 01242 graph, composed of undeleted LexemNodes. Returns NULL if this is impossible, 01243 It returns a List of lexeme nodes that 01244 -# is a superset of @a nodes 01245 -# corresponds to a complete path through the graph and 01246 -# contains only undeleted lexeme nodes. 01247 01248 If this is not possible, NULL is returned. 01249 */ 01250 List lgMakePath(LexemGraph lg, List nodes) { 01251 01252 LexemNode lna, lnb; 01253 List result = NULL; 01254 List l, m; 01255 int i, nrBoundPoints = 0; 01256 Boolean okay; 01257 01258 /** We do this by simply appending arbitrary non-contradictory nodes 01259 until we have bound all time points. Note that for this approach 01260 to be correct, there must not be any undeleted dangling nodes in 01261 the graph. This condition must have ensured by cnOptimizeNet(). */ 01262 01263 /* is the list of nodes self-contradictory? */ 01264 for(l = nodes; l != NULL; l = listNext(l)) { 01265 lna = listElement(l); 01266 nrBoundPoints += (lna->arc->to - lna->arc->from); 01267 for(m = listNext(l); m != NULL; m = listNext(m)) { 01268 lnb = listElement(m); 01269 if(!lgCompatibleNodes(lg, lna, lnb)) { 01270 return NULL; 01271 } 01272 } 01273 } 01274 01275 /* choose arbitrary nodes until all time points are bound */ 01276 result = listClone(nodes); 01277 for(i = 0; i < vectorSize(lg->nodes); i++) { 01278 okay = TRUE; 01279 lna = vectorElement(lg->nodes, i); 01280 if(lgIsDeletedNode(lg, lna)) { 01281 continue; 01282 } 01283 for(l = result; l != NULL; l = listNext(l)) { 01284 lnb = listElement(l); 01285 if(lna == lnb || 01286 !lgCompatibleNodes(lg, lna, lnb)) { 01287 okay = FALSE; 01288 break; 01289 } 01290 } 01291 if(okay) { 01292 result = listAppendElement(result, lna); 01293 nrBoundPoints += (lna->arc->to - lna->arc->from); 01294 } 01295 } 01296 01297 if(nrBoundPoints != lg->max - lg->min) { 01298 listDelete(result); 01299 result = NULL; 01300 } 01301 01302 return result; 01303 } 01304 01305 /* ---------------------------------------------------------------------------- 01306 @brief deletes LexemGraph 01307 01308 This function deallocates a lexeme graph. This deallocates all parts of the 01309 structure, even the lexeme nodes and lexical entries themselves. The lexicon 01310 remains unchanged as the LexicalEntry structures are merely clones of the 01311 structures in @ref inputCurrentGrammar. 
01312 */ 01313 void lgDelete(LexemGraph lg) 01314 { 01315 int i; 01316 GraphemNode gn; 01317 LexemNode ln; 01318 01319 if (!lg) { 01320 return; 01321 } 01322 01323 for (i = 0; i < vectorSize(lg->graphemnodes); i++) { 01324 gn = vectorElement(lg->graphemnodes, i); 01325 listDelete(gn->lexemes); 01326 memFree(gn); 01327 } 01328 vectorDelete(lg->graphemnodes); 01329 01330 for (i = 0; i < vectorSize(lg->nodes); i++) { 01331 ln = (LexemNode) vectorElement(lg->nodes, i); 01332 freeLexiconItem(ln->lexem); 01333 memFree(ln); 01334 } 01335 vectorDelete(lg->nodes); 01336 01337 if (lg->distance != NULL) { 01338 arrayDelete(lg->distance); 01339 } 01340 01341 memFree(lg->noOfPathsFromStart); 01342 memFree(lg->noOfPathsToEnd); 01343 bvDelete(lg->isDeletedNode); 01344 01345 if (lg->tags) { 01346 vectorDelete(lg->tags); 01347 } 01348 01349 listForEachDelete(lg->chunks, chunkerChunkDelete); 01350 01351 memFree(lg); 01352 } 01353 01354 01355 /* ---------------------------------------------------------------------------- 01356 @returns maximal ambiguity per time point 01357 01358 This function computes the maximal number of overlapping lexeme nodes for any 01359 time point in @a lg. Thus it gives an upper bound (not an estimate) of the 01360 average acoustical and lexical ambiguity of the graph. 01361 */ 01362 int lgWidth(LexemGraph lg) 01363 { 01364 01365 int result = 0; 01366 int i, j; 01367 int *ambiguity = memMalloc(sizeof (int) * lg->max); 01368 LexemNode ln; 01369 01370 for (i = 0; i < lg->max; i++) { 01371 ambiguity[i] = 0; 01372 } 01373 01374 for (i = 0; i < vectorSize(lg->nodes); i++) { 01375 ln = vectorElement(lg->nodes, i); 01376 for (j = ln->arc->from; j < ln->arc->to; j++) { 01377 ambiguity[j]++; 01378 if (ambiguity[j] > result) { 01379 result = ambiguity[j]; 01380 } 01381 } 01382 } 01383 01384 memFree(ambiguity); 01385 return result; 01386 01387 } 01388 01389 /* ---------------------------------------------------------------------------- 01390 @brief prints out a lexeme node 01391 01392 This function displays the identifier and the time span of @a ln in the 01393 format @c der_nom(0,1). 01394 */ 01395 void lgPrintNode(unsigned long mode, LexemNode ln) 01396 { 01397 Boolean dead = bvElement(ln->lexemgraph->isDeletedNode, ln->no); 01398 01399 cdgPrintf(mode, "%s`%s'(%d-%d)%s", 01400 dead ? "[" : "", 01401 ln->lexem->description, 01402 ln->arc->from, ln->arc->to, dead ? "]" : ""); 01403 } 01404 01405 01406 /* ---------------------------------------------------------------------------- 01407 @brief Do these lexeme nodes overlap? 01408 01409 Returns TRUE if the two lexeme nodes have at least one time point in common. 01410 01411 This is subtly different from the more common question, "Can the two nodes 01412 coexist on one path?": two nodes can be compatible although they overlap if 01413 they are identical. Conversely, a and b may be incompatible even if they do 01414 not overlap if there is no path between them. 01415 */ 01416 Boolean lgOverlap(LexemNode a, LexemNode b) 01417 { 01418 return((a->arc->to > b->arc->from && a->arc->from <= b->arc->from) || 01419 (b->arc->to > a->arc->from && b->arc->from <= a->arc->from)); 01420 } 01421 01422 /* ---------------------------------------------------------------------------- 01423 Takes a Vector of Boolean, and sets all cells that correspond to the numbers 01424 of nodes incompatible with ln. This function can be used in combination with 01425 lvVectorCompatible() to decide whether an LV is compatible with a set of 01426 other LVs. 
01427 */ 01428 void lgRequireLexeme(LexemGraph lg, ByteVector v, LexemNode ln) 01429 { 01430 int i; 01431 LexemNode ln2; 01432 01433 if (!spec(ln)) 01434 return; 01435 01436 for (i = 0; i < vectorSize(lg->nodes); i++) { 01437 ln2 = vectorElement(lg->nodes, i); 01438 if (!lgCompatibleNodes(lg, ln, ln2)) { 01439 bvSetElement(v, TRUE, ln2->no); 01440 } 01441 } 01442 } 01443 01444 /* ---------------------------------------------------------------------------- 01445 This function is similar to lgRequireLexeme(), but takes a List of lexeme 01446 nodes. It marks all those lexeme nodes that are incompatible with all 01447 lexemnodes of @a which. 01448 */ 01449 void lgRequireLexemes(LexemGraph lg, ByteVector v, List which) 01450 { 01451 int i; 01452 LexemNode ln1, ln2; 01453 01454 if (!spec(which)) 01455 return; 01456 01457 ln1 = listElement(which); 01458 01459 if (!spec(ln1)) 01460 return; 01461 01462 for (i = 0; i < vectorSize(lg->nodes); i++) { 01463 ln2 = vectorElement(lg->nodes, i); 01464 if (!lgCompatibleNodes(lg, ln1, ln2) && !listContains(which, ln2)) { 01465 bvSetElement(v, TRUE, ln2->no); 01466 } 01467 } 01468 } 01469 01470 /* ---------------------------------------------------------------------------- 01471 @brief updates the partial lexemgraph with the incoming arcs. 01472 01473 This function extends a lexeme graph by the Arc structures contained in @a 01474 listArcs. 01475 */ 01476 Boolean lgUpdateArcs(LexemGraph lg, Lattice lat, List listArcs) 01477 { 01478 List m; 01479 01480 for (m = listArcs; m; m = listNext(m)) { 01481 lgNewIter(lg, (Arc) listElement(m)); 01482 } 01483 if (lg->distance != NULL) { 01484 arrayDelete(lg->distance); 01485 lg->distance = NULL; 01486 } 01487 01488 lg->lattice = lat; 01489 return lgNewFinal(lg); 01490 } 01491 01492 /* ---------------------------------------------------------------------------- 01493 This function simply transfers the field LexemGraph::tagscore from each node 01494 in @a source to the corresponding node in @a destination. (This is only 01495 useful to save repeated invocation of taggerTag() for two graphs produced 01496 from the same lattice.) 
01497 */ 01498 void lgCopyTagScores(LexemGraph destination, LexemGraph source) 01499 { 01500 int i; 01501 List l; 01502 01503 void backlink(Chunk chunk) { 01504 List l; 01505 01506 for (l = chunk->nodes; l; l = listNext(l)) { 01507 GraphemNode gn = listElement(l); 01508 01509 gn->chunk = chunk; 01510 } 01511 for (l = chunk->subChunks; l; l = listNext(l)) { 01512 backlink(listElement(l)); 01513 } 01514 } 01515 01516 for (i = 0; i < vectorSize(destination->nodes) && 01517 i < vectorSize(source->nodes); i++) { 01518 LexemNode ln1 = vectorElement(destination->nodes, i); 01519 LexemNode ln2 = vectorElement(source->nodes, i); 01520 01521 ln1->tagscore = ln2->tagscore; 01522 } 01523 01524 /* Also copy the chunk information for much the same reason */ 01525 for (l = source->chunks; l; l = listNext(l)) { 01526 Chunk chunk = listElement(l); 01527 01528 chunk = chunkerCloneChunk(chunk); 01529 destination->chunks = listAppendElement(destination->chunks, chunk); 01530 chunkerReplaceGraphemes(chunk, destination); 01531 backlink(chunk); 01532 } 01533 01534 /* copy tags cheat sheet */ 01535 destination->tags = NULL; 01536 if (source->tags) { 01537 if (destination->tags) 01538 vectorDelete(destination->tags); 01539 destination->tags = vectorClone(source->tags); 01540 } 01541 01542 } 01543 01544 01545 /* ---------------------------------------------------------------------------- 01546 This function checks whether at least one of the LexemNode 01547 structures in @a list points to a lexicon element @a le 01548 */ 01549 Boolean lgLexemeInLexemNodeList(LexiconItem le, List list) 01550 { 01551 List m; 01552 for (m = list; m != NULL; m = listNext(m)) { 01553 LexemNode ln = (LexemNode) listElement(m); 01554 if (ln->lexem == le) 01555 return TRUE; 01556 } 01557 return FALSE; 01558 } 01559 01560 /* ---------------------------------------------------------------------------- 01561 @brief Select the path in DST whose parts most closely match SRC. 01562 01563 This function inspects the undeleted words in @a source and undeletes those 01564 words in @a destination that most closely correspond to them. (This is 01565 necessary because two lexeme graphs built from the same lattice may have 01566 their nodes in different order, so you cannot simply re-use an 01567 LexemGraph::isDeletedNode vector across lexeme graphs.) 01568 */ 01569 Boolean lgCopySelection(LexemGraph destination, LexemGraph source) 01570 { 01571 01572 int i, j, k, record; 01573 LexemNode ln1, ln2, best; 01574 01575 if (destination == source) { 01576 return TRUE; 01577 } 01578 01579 bvSetAllElements(destination->isDeletedNode, TRUE); 01580 for (i = 0; i < bvSize(source->isDeletedNode); i++) { 01581 if (bvElement(source->isDeletedNode, i)) { 01582 continue; 01583 } 01584 01585 /* OK, find the node in DST that corresponds to this one. 
*/ 01586 ln1 = vectorElement(source->nodes, i); 01587 best = NULL; 01588 record = INT_MAX; 01589 for (j = 0; j < vectorSize(destination->nodes); j++) { 01590 ln2 = vectorElement(destination->nodes, j); 01591 if (ln2->arc->from != ln1->arc->from || ln2->arc->to != ln1->arc->to) { 01592 continue; 01593 } 01594 k = leCompare(inputCurrentGrammar, ln1->lexem, ln2->lexem, FALSE); 01595 if (k < record) { 01596 record = k; 01597 best = ln2; 01598 } 01599 } 01600 if (!best) { 01601 return FALSE; 01602 } 01603 bvSetElement(destination->isDeletedNode, FALSE, best->no); 01604 } 01605 01606 return TRUE; 01607 } 01608 01609 /* ---------------------------------------------------------------------------- 01610 @brief Initialize the input module 01611 01612 This function initializes the module Lexemgraph and registers the variable 01613 @a compactlevelvalues. 01614 */ 01615 void lgInitialize() 01616 { 01617 setRegister("compactlevelvalues", SET_BOOL, &lgCompactLVs, NULL, NULL, NULL, 01618 NULL); 01619 } 01620 01621 /* --------------------------------------------------------------------------- 01622 @brief What categories can this node represent? (Needed while tagging.) 01623 01624 This function queries the lexicon about what syntactical categories @a gn can 01625 represent. (The syntactical category is that feature whose index is 01626 @ref taggerCategoryIndex.) This function is used to check whether an 01627 assignment by the tagger can be honored by the lexicon. 01628 */ 01629 List lgQueryCat(LexemGraph lg, GraphemNode gn) 01630 { 01631 01632 List l; 01633 List result = NULL; 01634 01635 for (l = gn->lexemes; l != NULL; l = listNext(l)) { 01636 LexemNode ln = listElement(l); 01637 LexiconItem li = ln->lexem; 01638 Value v = li->values[taggerCategoryIndex]; 01639 01640 if (v->type != VTString) { 01641 /* This should definitely not happen, but it will already generate 01642 * errors elsewhere, so we just ignore it here. */ 01643 continue; 01644 } 01645 result = listAddUniqueElement(result, strRegister(v->data.string)); 01646 } 01647 return result; 01648 } 01649 01650 /* ---------------------------------------------------------------------------- 01651 @brief Does a lexemgraph contain at least one instance of a given form? 01652 01653 This function checks whether @a lg contains at least one instance of the 01654 form @a form. Capitalized versions of @a form are permissible if they are 01655 spurious (cf. lgSpuriousUppercase()). 01656 */ 01657 Boolean lgContains(LexemGraph lg, String form) { 01658 int i; 01659 for (i = 0; i < vectorSize(lg->graphemnodes); i++) { 01660 GraphemNode gn = vectorElement(lg->graphemnodes, i); 01661 if(form == gn->arc->word) { 01662 return TRUE; 01663 } 01664 if(lgSpuriousUppercase(lg, gn->arc) && 01665 !strcasecmp(form, gn->arc->word)) { 01666 return TRUE; 01667 } 01668 01669 } 01670 return FALSE; 01671 } 01672 01673 /* ---------------------------------------------------------------------------- 01674 Returns the most probable path, as defined by tagging scores. 
*/
List lgMostProbablePath(LexemGraph lg)
{
  List result = NULL;
  int i;
  List l;

  if (latticeBranches(lg->lattice)) {
    cdgPrintf(CDG_WARNING,
              "ERROR: lgMostProbablePath() called on a branching lattice!\n");
    return NULL;
  }

  for (i = 0; i < vectorSize(lg->graphemnodes); i++) {
    GraphemNode gn = vectorElement(lg->graphemnodes, i);
    Number record = 0.0;
    LexemNode best = NULL;

    for (l = gn->lexemes; l != NULL; l = listNext(l)) {
      LexemNode ln = listElement(l);

      if (ln->tagscore > record) {
        record = ln->tagscore;
        best = ln;
      }
    }
    result = listPrependElement(result, best);
  }
  return result;
}

/* ----------------------------------------------------------------------------
   @brief Clone a grapheme node.

   The field GraphemNode::lexemes is not set; the caller has to do that. (The
   two-way links between grapheme nodes and lexeme nodes can be set more
   easily when all nodes are known.)
*/
GraphemNode gnClone(GraphemNode gn, Lattice lat)
{
  GraphemNode result = (GraphemNode) memMalloc(sizeof (GraphemNodeStruct));

  result->no = gn->no;
  result->lexemgraph = gn->lexemgraph;
  result->arc = inputFindArc(lat, gn->arc->no);
  result->lexemes = NULL;

  result->chunk = NULL;

  return result;
}


/* ----------------------------------------------------------------------------
   @brief Clone a lexeme graph.

   This performs a totally deep copy; even the underlying lattice, lexicon
   items etc. are cloned.
*/
LexemGraph lgClone(LexemGraph lg) {

  int i;
  LexemGraph result = memMalloc(sizeof(LexemGraphStruct));

  result->lattice = inputCloneLattice(lg->lattice);
  result->graphemnodes = vectorNew(20);
  for(i = 0; i < vectorSize(lg->graphemnodes); i++) {
    GraphemNode gn = vectorElement(lg->graphemnodes, i);
    GraphemNode new = gnClone(gn, result->lattice);
    vectorAddElement(result->graphemnodes, new);
  }

  result->nodes = vectorNew(20);
  for(i = 0; i < vectorSize(lg->nodes); i++) {
    LexemNode ln = vectorElement(lg->nodes, i);
    LexemNode new = memMalloc(sizeof (LexemNodeStruct));
    new->no = ln->no;
    new->lexemgraph = result;
    new->arc = inputFindArc(result->lattice, ln->arc->no);
    new->lexem = inputCloneLI(ln->lexem);
    new->limit = ln->limit;
    new->grapheme = vectorElement(result->graphemnodes, ln->grapheme->no);
    new->grapheme->lexemes = listAppendElement(new->grapheme->lexemes, new);
    vectorAddElement(result->nodes, new);
  }

  result->max = lg->max;
  result->min = lg->min;

  result->isDeletedNode = bvClone(lg->isDeletedNode);

  result->distance = NULL;
  lgComputeDistances(result);

  result->noOfPathsToEnd = NULL;
  result->noOfPathsFromStart = NULL;
  lgComputeNoOfPaths(result);
  result->chunks = NULL;
  lgCopyTagScores(result, lg);

  return result;

}


/* ----------------------------------------------------------------------------
   Might this be a lowercase word that is spelled in upper case
   because of orthographic convention?
*/
Boolean lgSpuriousUppercase(LexemGraph lg, Arc arc)
{

  /** Spurious uppercase must be an upper case letter... */
  if (!isupper(((unsigned char *)arc->word)[0])) {
    return FALSE;
  }

  /** ... followed by a lower case letter. */
  if (strlen(arc->word) > 1 &&
      !islower(((unsigned char *)arc->word)[1])) {
    return FALSE;
  }

  /** This is another instance of the "wordgraphs start at 0" assumption.

      Ordinarily, this would be wrong, since the lexeme graph might start at
      some other time point. However, at this time lg->min may not be
      initialized, so we can't check it. Since spurious upper case only
      occurs in written text, and weird time points occur mainly in
      recognizer output for spoken text, I'm letting it pass here. */
  if (arc->from == 0) {
    return TRUE;
  }

  return inputSentenceBoundary(inputLatticePrevArc(lg->lattice, arc));
}


/* -- ENDOFFILE --------------------------------------------------------- */
/* ---------------------------------------------------------------------- */
/** @} */
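The typical life cycle of a LexemGraph, as the functions in this file define it, can be summarized in a short sketch. The sketch below is illustrative only: it assumes that a Lattice `lat` has already been obtained from the input module and that a grammar and lexicon are loaded, both of which happen outside of lexemgraph.c; the function name is made up for the example.

#include "lexemgraph.h"

/* Illustrative sketch: build a lexeme graph from an existing lattice,
 * print it, and release it again. */
static void exampleLexemGraphRoundTrip(Lattice lat)
{
  LexemGraph lg = lgNew(lat);   /* one grapheme node per arc, plus lexemes  */

  if (lg == NULL) {             /* no complete path: lgNew() has already
                                 * deallocated the partial graph            */
    return;
  }

  cdgPrintf(CDG_INFO, "maximal ambiguity: %d\n", lgWidth(lg));
  lgPrint(CDG_INFO, lg);        /* textual dump of all lexeme nodes         */

  lgDelete(lg);                 /* frees nodes, distance matrix and chunks  */
}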
