Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Related Pages

chunker.c

00001 /* 00002 * Copyright (C) 1997-2004 The CDG Team <cdg@nats.informatik.uni-hamburg.de> 00003 * 00004 * This file is free software; as a special exception the author gives 00005 * unlimited permission to copy and/or distribute it, with or without 00006 * modifications, as long as this notice is preserved. 00007 * 00008 * This program is distributed in the hope that it will be useful, but 00009 * WITHOUT ANY WARRANTY, to the extent permitted by law; without even the 00010 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 00011 * 00012 */ 00013 00014 /* ------------------------------------------------------------------------- 00015 * @addtogroup Chunker Chunker - Interface to a Chunking Parser 00016 * @author Michael Daum (see also AUTHORS and THANKS for more) 00017 * @date 2002-09-09 00018 * 00019 * $Id: chunker.c,v 1.41 2004/09/01 13:42:31 micha Exp $ 00020 * 00021 * This module offers an interface for an external chunker. 00022 * 00023 * @{ 00024 */ 00025 00026 /* -- INCLUDES ------------------------------------------------------------- */ 00027 #include <ctype.h> 00028 #include <unistd.h> 00029 #include <signal.h> 00030 #include <stdio.h> 00031 #include <errno.h> 00032 #include <limits.h> 00033 #include <string.h> 00034 #include <sys/types.h> 00035 #include <sys/time.h> 00036 #include <sys/resource.h> 00037 #include <sys/wait.h> 00038 #include "cdg.h" 00039 #include "chunker.h" 00040 #include "parse.h" 00041 #include "hook.h" 00042 #include "tagger.h" 00043 #include "set.h" 00044 00045 /* -- MACROS --------------------------------------------------------------- */ 00046 // #define DEBUG_GETCHUNKSAT 00047 // #define DEBUG_GETCHUNKS 00048 // #define DEBUG_FINDCHUNK 00049 00050 /* -- TYPE DEFINITIONS ----------------------------------------------------- */ 00051 00052 /* ------------------------------------------------------------------------- 00053 * representation of the chunker. 00054 */ 00055 struct ChunkerStruct { 00056 ChunkerMode mode; /**< determines the mode of operation */ 00057 LexemGraph lg; /**< the current lexemgraph that we are chunking */ 00058 Parse parse; /**< parse generated from the annotation of the lattice */ 00059 Level mainlevel; /**< mainlevels index in the current parse */ 00060 int nrWords; /**< number of words in the current parse */ 00061 int nrLevels; /**< number of levels in the current parse */ 00062 List chunks; /**< found chunks */ 00063 char **args; /**< command to start this chunker @see chunkerArgs */ 00064 pid_t pid; /**< process id of a started chunker job */ 00065 int pipe1[2]; /**< pipe connected to the stdin of the chunker process */ 00066 int pipe2[2]; /**< pipe connected to the stdout of the chunker process */ 00067 }; 00068 00069 /* -- VARIABLES ------------------------------------------------------------ */ 00070 00071 /** indicates wether the chunker is used or not */ 00072 static Boolean chunkerUseChunker = FALSE; 00073 00074 /** set the default chunker mode, @see DefaultChunker */ 00075 static ChunkerMode chunkerMode = RealChunker; 00076 00077 /** string representation of the current command used for real chunking */ 00078 static String chunkerCommand = NULL; 00079 00080 /** NULL terminated array of command arguments used for real chunking */ 00081 static char **chunkerArgs = NULL; 00082 00083 /* -- FUNCTIONS ------------------------------------------------------------ */ 00084 00085 /* constructors, destructors & ko */ 00086 static Chunk newChunk(ChunkType type); 00087 static Boolean initChunker(Chunker chunker); 00088 static Boolean initFakeChunker(Chunker chunker); 00089 static Boolean initRealChunker(Chunker chunker); 00090 static void resetChunker(Chunker chunker); 00091 00092 /* chunker routines */ 00093 static List getChunks(Chunker chunker); 00094 static List getFakeChunks(Chunker chunker); 00095 static List getFakeChunksAt(Chunker chunker, Chunk parent, int index); 00096 static ChunkType getFakeChunkType(Chunker chunker, int index); 00097 static void postProcessChunks(Chunker chunker, List chunks); 00098 static Chunk findChunk(List chunks, int from, int to); 00099 static Chunk mergeChunk(Chunker chunker, Chunk target, Chunk source); 00100 static Chunk embedChunk(Chunker chunker, Chunk target, Chunk source); 00101 static int evalChunker(Chunker chunker, List annoChunks); 00102 static int countChunks(List chunks); 00103 static Boolean compareChunks(Chunk c1, Chunk c2); 00104 00105 /* TODO: these should go into the parse module as their names already say */ 00106 static int parseGetModifiee(Chunker chunker, int index); 00107 static List parseGetRoots(Chunker chunker); 00108 static String parseGetLabel(Chunker chunker, int index); 00109 static LevelValue parseGetLevelValue(Chunker chunker, int index); 00110 static String parseGetCategory(Chunker chunker, int index); 00111 static GraphemNode parseGetGrapheme(Chunker chunker, int index); 00112 00113 /* misc */ 00114 static Boolean cmpChunks(Chunk c1, Chunk c2, Chunker chunker); 00115 static Boolean cmpArcs(Arc arc1, Arc arc2); 00116 static Boolean cmpGraphemes(GraphemNode g1, GraphemNode g2); 00117 static void printChunk(unsigned long mode, Chunk chunk); 00118 static GraphemNode findGrapheme(LexemGraph lg, GraphemNode gn); 00119 static List getCategories(GraphemNode gn); 00120 static String getCategory(GraphemNode gn); 00121 00122 /* -- IMPLEMENTATION ------------------------------------------------------- */ 00123 00124 /* ------------------------------------------------------------------------- 00125 * terminateChild 00126 * 00127 * This function waits for child with the specified pid to terminate. 00128 * 00129 * Return values: 00130 * 00131 * -1 error 00132 * 0 child died already 00133 * 1 child died after SIGTERM 00134 * 2 child died after SIGKILL 00135 * 00136 */ 00137 int terminateChild(pid_t pid) 00138 { 00139 pid_t res; 00140 00141 res = wait4(pid, (int *)NULL, WNOHANG, NULL); 00142 if (res == pid) { 00143 return 0; 00144 } 00145 if (res == 0) { 00146 /* child hasn't terminated yet */ 00147 if (-1 == kill(pid, SIGTERM)) { 00148 cdgPrintf(CDG_WARNING, "WARNING: can't send SIGTERM to %d: %s\n", pid, 00149 strerror(errno)); 00150 } 00151 usleep(100000); 00152 res = wait4(pid, (int *)NULL, WNOHANG, NULL); 00153 if (res == pid) { 00154 return 1; 00155 } 00156 if (res == 0) { 00157 if (-1 == kill(pid, SIGKILL)) { 00158 cdgPrintf(CDG_WARNING, "WARNING: can't send SIGKILL to %d: %s\n", pid, 00159 strerror(errno)); 00160 } 00161 usleep(100000); 00162 res = wait4(pid, (int *)NULL, WNOHANG, NULL); 00163 if (res == pid) { 00164 return 2; 00165 } 00166 } 00167 } 00168 /* error condition */ 00169 cdgPrintf(CDG_ERROR, "ERROR: can't terminate %d: %s\n", pid, 00170 strerror(errno)); 00171 00172 return -1; 00173 } 00174 00175 00176 /* ------------------------------------------------------------------------- 00177 * construct a new chunker. 00178 * 00179 * @param mode one of the defined @ref ChunkerMode 00180 * @param lg data from which we're going to initialize 00181 * @returns a new chunker 00182 * 00183 * In case of a @ref RealChunker a child process is forked in the background 00184 * specified by the @ref chunkerCommand. This function might return NULL 00185 * when chunking is switched off, no @ref chunkerCommand is defined or the 00186 * initialization of the chunker object fails. 00187 * 00188 * @see initChunker, initRealChunker, initFakeChunker 00189 */ 00190 Chunker chunkerNew(ChunkerMode mode, LexemGraph lg) 00191 { 00192 Chunker chunker; 00193 int i; 00194 00195 if (!chunkerUseChunker) 00196 return NULL; 00197 00198 if (mode == DefaultChunker) 00199 mode = chunkerMode; 00200 00201 if (mode != FakeChunker) { 00202 if(!chunkerCommand || strlen(chunkerCommand) == 0) { 00203 cdgPrintf(CDG_ERROR, "ERROR: no chunker command defined ... switching off the chunker\n"); 00204 chunkerUseChunker = FALSE; 00205 return NULL; 00206 } 00207 } 00208 00209 chunker = (Chunker) memMalloc(sizeof (ChunkerStruct)); 00210 chunker->mode = mode; 00211 chunker->lg = lg; 00212 chunker->parse = NULL; 00213 chunker->nrWords = 0; 00214 chunker->nrLevels = 0; 00215 chunker->chunks = NULL; 00216 chunker->pid = -1; 00217 chunker->args = NULL; 00218 00219 /* copy the chunkerArgs to our private structure */ 00220 if(chunkerArgs) { 00221 for (i = 0; chunkerArgs[i]; i++) 00222 chunker->args = (char **)memMalloc(sizeof (char *) * (i + 1)); 00223 for (i = 0; chunkerArgs[i]; i++) { 00224 chunker->args[i] = strRegister(chunkerArgs[i]); 00225 } 00226 chunker->args[i] = NULL; 00227 } 00228 return (initChunker(chunker)?chunker:NULL); 00229 } 00230 00231 /* ------------------------------------------------------------------------- 00232 * initialize the chunker with the given data. 00233 * 00234 * @param chunker the current chunker 00235 * @returns true on success. 00236 * 00237 * This is called from @ref chunkerChunk and from chunkerNew. It calls 00238 * @ref initFakeChunker or @ref initRealChunker depending on the chunker mode. 00239 */ 00240 Boolean initChunker(Chunker chunker) 00241 { 00242 switch (chunker->mode) { 00243 case FakeChunker: 00244 return initFakeChunker(chunker); 00245 break; 00246 case RealChunker: 00247 return initRealChunker(chunker); 00248 case EvalChunker: 00249 return initFakeChunker(chunker) && initRealChunker(chunker); 00250 break; 00251 default: /* never reach */ 00252 cdgPrintf(CDG_ERROR, "ERROR: unknown chunker mode\n"); 00253 break; 00254 } 00255 00256 return FALSE; 00257 } 00258 00259 /* ------------------------------------------------------------------------- 00260 * initialize a fake chunker with the given data. 00261 * 00262 * @param chunker the current chunker 00263 * @returns true on success. 00264 * 00265 * This function is called by initChunker() whenever the given Chunker 00266 * is in mode FakeChunker. In order to work properly the contained 00267 * Lattice within the LexemGraph lg must have a corresponding 00268 * Annotation from which the chunks can be faked. 00269 * 00270 * @see also initChunker, initFakeChunker, initRealChunker, chunkerChunk. 00271 */ 00272 Boolean initFakeChunker(Chunker chunker) 00273 { 00274 Level mainlevel = inputGetMainlevel(inputCurrentGrammar); 00275 AnnoEntry anno = findAnnoForLattice(chunker->lg->lattice, TRUE); 00276 00277 /* check for available annotation */ 00278 if (!anno) { 00279 cdgPrintf(CDG_ERROR, 00280 "ERROR: no annotation for lattice %s found needed in fake mode\n", 00281 chunker->lg->lattice->id); 00282 return FALSE; 00283 } 00284 00285 /* check for a defined mainlevel */ 00286 if (!mainlevel) { 00287 cdgPrintf(CDG_ERROR, "ERROR: no mainlevel defined\n"); 00288 return FALSE; 00289 } 00290 00291 /* planed to work on linear lattices only right now */ 00292 if (latticeBranches(chunker->lg->lattice)) { 00293 cdgPrintf(CDG_ERROR, 00294 "ERROR: sorry, chunking only works for linear lattices right now\n"); 00295 return FALSE; 00296 } 00297 00298 /* get a parse from the annotation */ 00299 chunker->parse = parseFromAnno(anno); 00300 if (!chunker->parse) { 00301 resetChunker(chunker); 00302 return FALSE; 00303 } 00304 00305 /* decorate the parse */ 00306 if (!parseDecorate(chunker->parse, chunker->lg, anno)) { 00307 cdgPrintf(CDG_ERROR, 00308 "ERROR: could not decorate parse for annotation `%s'.\n", 00309 anno->id); 00310 resetChunker(chunker); 00311 return FALSE; 00312 } 00313 00314 chunker->nrWords = vectorSize(chunker->parse->words); 00315 chunker->nrLevels = listSize(chunker->parse->levels); 00316 chunker->mainlevel = mainlevel; 00317 00318 return TRUE; 00319 } 00320 00321 /* ------------------------------------------------------------------------- 00322 * initialize a real chunker with the given data. 00323 * 00324 * @param chunker the current chunker 00325 * @returns true on success. 00326 * 00327 * This function is called from @ref initChunker during the initialization 00328 * of the chunker, that is just before it starts chunking in @ref chunkerChunk. 00329 * 00330 * @see also initChunker, initFakeChunker, initRealChunker, chunkerChunk. 00331 */ 00332 Boolean initRealChunker(Chunker chunker) 00333 { 00334 /* check for a defined mainlevel */ 00335 chunker->mainlevel = inputGetMainlevel(inputCurrentGrammar); 00336 if (!chunker->mainlevel) { 00337 cdgPrintf(CDG_ERROR, "ERROR: no mainlevel defined\n"); 00338 return FALSE; 00339 } 00340 00341 /* create a fresh new chunker coprocess for each chunker object */ 00342 00343 /* create pipes */ 00344 if (pipe(chunker->pipe1) < 0 || pipe(chunker->pipe2) < 0) { 00345 cdgPrintf(CDG_ERROR, "ERROR: can't create pipes\n"); 00346 return FALSE; 00347 } 00348 00349 #if 0 00350 cdgPrintf(CDG_DEBUG, "DEBUG: forking %s ... \n", chunker->args[0]); 00351 #endif 00352 00353 /* fork starts a subprocess under pid 'pid' */ 00354 if ((chunker->pid = fork()) < 0) { 00355 cdgPrintf(CDG_ERROR, "ERROR: can't fork: %s\n", strerror(errno)); 00356 return FALSE; 00357 } 00358 00359 if (chunker->pid) { 00360 /* parent thread */ 00361 close(chunker->pipe1[0]); 00362 close(chunker->pipe2[1]); 00363 #if 0 00364 cdgPrintf(CDG_DEBUG, "DEBUG: process %d started.\n", chunker->pid); 00365 #endif 00366 } else { 00367 /* child thread */ 00368 close(chunker->pipe1[1]); 00369 close(chunker->pipe2[0]); 00370 signal(SIGINT, SIG_DFL); 00371 signal(SIGTERM, SIG_DFL); 00372 signal(SIGXCPU, SIG_DFL); 00373 if (chunker->pipe1[0] != STDIN_FILENO) { 00374 if (dup2(chunker->pipe1[0], STDIN_FILENO) != STDIN_FILENO) { 00375 fprintf(stderr, "WARNING: child can't dup2 stdin: %s\n", 00376 strerror(errno)); 00377 close(chunker->pipe1[0]); 00378 } 00379 } 00380 if (chunker->pipe2[1] != STDOUT_FILENO) { 00381 if (dup2(chunker->pipe2[1], STDOUT_FILENO) != STDOUT_FILENO) { 00382 fprintf(stderr, "WARNING: child can't dup2 stdout: %s\n", 00383 strerror(errno)); 00384 close(chunker->pipe2[1]); 00385 } 00386 } 00387 execvp(chunker->args[0], chunker->args); 00388 00389 /* never reach */ 00390 fprintf(stderr, "ERROR: exec(%s) failed: %s\n", 00391 chunker->args[0], strerror(errno)); 00392 _exit(1); 00393 } 00394 00395 return TRUE; 00396 } 00397 00398 /* ------------------------------------------------------------------------- 00399 * resetChunker: set the chunker in a state of innocence. 00400 * 00401 * parameters: 00402 * chunker = the object of desire 00403 */ 00404 void resetChunker(Chunker chunker) 00405 { 00406 if (chunker->parse) { /* not NULL aware */ 00407 parseDelete(chunker->parse); 00408 } 00409 listForEachDelete(chunker->chunks, chunkerChunkDelete); 00410 chunker->lg = NULL; 00411 chunker->parse = NULL; 00412 chunker->nrWords = 0; 00413 chunker->nrLevels = 0; 00414 chunker->chunks = NULL; 00415 if (chunker->pid > 0) { 00416 close(chunker->pipe1[0]); 00417 close(chunker->pipe1[1]); 00418 close(chunker->pipe2[0]); 00419 close(chunker->pipe2[1]); 00420 terminateChild(chunker->pid); /* in tagger.c */ 00421 } 00422 00423 if (chunker->args) { 00424 int i; 00425 for(i = 0; chunker->args[i]; i++) { 00426 cdgFreeString(chunker->args[i]); 00427 } 00428 memFree(chunker->args); 00429 chunker->args = NULL; 00430 } 00431 } 00432 00433 /* ------------------------------------------------------------------------- 00434 * chunkerDelete: destroy the chunker representation. 00435 * 00436 * parameters: 00437 * chunker = the object to be destructed 00438 */ 00439 void chunkerDelete(Chunker chunker) 00440 { 00441 if (!chunker) 00442 return; 00443 00444 resetChunker(chunker); 00445 memFree(chunker); 00446 } 00447 00448 /* ------------------------------------------------------------------------- 00449 * construct a new chunk and initialize it. 00450 * 00451 * @param type one of the ChunkTypes NChunk, PChunk, ... 00452 * @returns a new empty Chunk. 00453 */ 00454 Chunk newChunk(ChunkType type) 00455 { 00456 Chunk chunk; 00457 00458 chunk = (Chunk)memMalloc(sizeof(ChunkStruct)); 00459 chunk->type = type; 00460 chunk->nodes = NULL; 00461 chunk->from = NULL; 00462 chunk->to = NULL; 00463 chunk->subChunks = NULL; 00464 chunk->parent = NULL; 00465 chunk->head = NULL; 00466 00467 return chunk; 00468 } 00469 00470 /* ------------------------------------------------------------------------- 00471 * chunkerCloneChunk: construct a copy of a given chunk including clones of subChunks. 00472 * 00473 * parameters: 00474 * chunk = the original 00475 * returns: the copy. 00476 */ 00477 Chunk chunkerCloneChunk(Chunk chunk) 00478 { 00479 Chunk clone; 00480 List l; 00481 00482 if (!chunk) 00483 return NULL; 00484 00485 clone = newChunk(chunk->type); 00486 clone->parent = chunk->parent; 00487 clone->from = chunk->from; /* these are references into the chunker->lg */ 00488 clone->to = chunk->to; 00489 clone->head = chunk->head; 00490 clone->nodes = listClone(chunk->nodes); 00491 for (l = chunk->subChunks; l; l = listNext(l)) { 00492 clone->subChunks = listAppendElement(clone->subChunks, chunkerCloneChunk(listElement(l))); 00493 } 00494 00495 return clone; 00496 } 00497 00498 /* ------------------------------------------------------------------------- 00499 * chunkerChunkDelete: destruct a chunk and all its subchunks. 00500 * 00501 * parameters: 00502 * chunk = a chunk to be deallocated 00503 */ 00504 void chunkerChunkDelete(Chunk chunk) 00505 { 00506 if (!chunk) 00507 return; 00508 00509 listForEachDelete(chunk->subChunks, chunkerChunkDelete); 00510 listDelete(chunk->nodes); 00511 memFree(chunk); 00512 } 00513 00514 /* ------------------------------------------------------------------------- 00515 * cmpGraphemes: return true if g1 start before g2 00516 */ 00517 Boolean cmpGraphemes(GraphemNode g1, GraphemNode g2) 00518 { 00519 return (g1->arc->from < g2->arc->from); 00520 } 00521 00522 /* ------------------------------------------------------------------------- 00523 * cmpArcs: return true if arc1 starts before arc2 00524 */ 00525 Boolean cmpArcs(Arc arc1, Arc arc2) 00526 { 00527 return (arc1->from < arc2->from); 00528 } 00529 00530 /* ------------------------------------------------------------------------- 00531 * cmpChunks: return true if c1 starts before c2. 00532 */ 00533 Boolean cmpChunks(Chunk c1, Chunk c2, Chunker chunker) 00534 { 00535 return (c1->from->arc->from < c2->from->arc->from); 00536 } 00537 00538 /* ------------------------------------------------------------------------- 00539 * parseGetRoots: get all unbound words (on the main level) 00540 * 00541 * parameters: 00542 * chunker = the current chunker 00543 * returns: a list of word indices or NULL if there are no root bindings (?) 00544 * 00545 * Note: you become the owner of the returned 00546 * list container, so deallocate it after you've consumed the result. 00547 */ 00548 List parseGetRoots(Chunker chunker) 00549 { 00550 int i; 00551 List result = NULL; 00552 00553 for (i = 0; i < chunker->nrWords; i ++) { 00554 if (parseGetModifiee(chunker, i) == -1) {; 00555 result = listAppendElement(result, (Pointer)i); 00556 } 00557 } 00558 00559 return result; 00560 } 00561 00562 /* ------------------------------------------------------------------------- 00563 * parseGetModifiee: get the word this one is modifying (on the main level) 00564 * 00565 * parameters: 00566 * chunker = the current chunker 00567 * index = the modifier index in the word vector of the 00568 * current parse 00569 * returns: the modifiee index 00570 */ 00571 int parseGetModifiee(Chunker chunker, int index) 00572 { 00573 return (int)vectorElement(chunker->parse->verticesStructure, 00574 chunker->nrLevels * index + chunker->mainlevel->no); 00575 } 00576 00577 /* ------------------------------------------------------------------------- 00578 * parseGetLabel: get the label of the dependency of a word (on the main level) 00579 * 00580 * parameters: 00581 * chunker = the current chunker 00582 * index = index of a word in the parse 00583 * returns: the label of that dependency 00584 */ 00585 String parseGetLabel(Chunker chunker, int index) 00586 { 00587 return (String) vectorElement(chunker->parse->verticesLabels, 00588 chunker->nrLevels * index + chunker->mainlevel->no); 00589 } 00590 00591 /* ------------------------------------------------------------------------- 00592 * parseGetLevelValue: get the dependency arc of a word. 00593 * 00594 * parameters: 00595 * chunker = the current chunker 00596 * index = index of a word in the parse. 00597 * 00598 * returns: the level value of this word 00599 */ 00600 LevelValue parseGetLevelValue(Chunker chunker, int index) 00601 { 00602 if (index == -1 || index == INT_MAX) return NULL; 00603 return (LevelValue) vectorElement(chunker->parse->LVs, 00604 inputCurrentGrammar->noOfLevels * index + chunker->mainlevel->no); 00605 } 00606 00607 /* ------------------------------------------------------------------------- 00608 * getCategory: get one POS-tag, warn if there are more than one 00609 * 00610 * parameter: 00611 * gn = a grapheme 00612 * returns: the first POS-tag available. 00613 */ 00614 String getCategory(GraphemNode gn) 00615 { 00616 List cats; 00617 String cat; 00618 00619 if (!gn) 00620 return NULL; 00621 00622 cats = getCategories(gn); 00623 if (!cats) { 00624 return NULL; 00625 } 00626 00627 #if 0 00628 if (listSize(cats) > 1) { 00629 cdgPrintf(CDG_WARNING, 00630 "WARNING: got more than one POS-tag\n"); 00631 } 00632 #endif 00633 cat = listElement(cats); 00634 listDelete(cats); 00635 00636 return cat; 00637 } 00638 00639 /* ------------------------------------------------------------------------- 00640 * getCategories: get all POS-tags of undeleted lexem nodes 00641 * 00642 * parameter: 00643 * gn = a lexem node 00644 * returns: a list of POS tags. 00645 */ 00646 List getCategories(GraphemNode gn) 00647 { 00648 List cats = NULL; 00649 List l; 00650 Value value; 00651 List lns = NULL; 00652 00653 Boolean lexemSorter(LexemNode lna, LexemNode lnb) 00654 { 00655 return (lna->tagscore > lnb->tagscore); 00656 } 00657 00658 for (l = gn->lexemes; l; l = listNext(l)) { 00659 LexemNode ln = listElement(l); 00660 00661 if (bvElement(gn->lexemgraph->isDeletedNode, ln->no)) 00662 continue; 00663 00664 value = ln->lexem->values[taggerCategoryIndex]; 00665 if (value->type == VTString) { 00666 lns = listInsertSorted(lns, ln, lexemSorter); 00667 } 00668 } 00669 00670 for (l = lns; l; l = listNext(l)) { 00671 LexemNode ln = listElement(l); 00672 value = ln->lexem->values[taggerCategoryIndex]; 00673 cats = listAddUniqueElement(cats, value->data.string); 00674 #if 0 00675 cdgPrintf(CDG_DEBUG, "DEBUG: cat %s = score %g\n", 00676 value->data.string, ln->tagscore); 00677 #endif 00678 } 00679 listDelete(lns); 00680 00681 00682 return cats; 00683 } 00684 00685 /* ------------------------------------------------------------------------- 00686 * parseGetCategory: get the POS-tag of a given word index 00687 * 00688 * parameters: 00689 * chunker = the current chunker 00690 * index = index of a word in the parse. 00691 * 00692 * returns: the POS-tag string or NULL if not defined 00693 */ 00694 String parseGetCategory(Chunker chunker, int index) 00695 { 00696 if (index == -1 || index == INT_MAX) { 00697 return ""; 00698 } else { 00699 LevelValue lv = parseGetLevelValue(chunker, index); 00700 LexemNode ln = lv->modifier; 00701 Value value = ln->lexem->values[taggerCategoryIndex]; 00702 00703 if (value->type != VTString) { 00704 return NULL; 00705 } 00706 00707 return value->data.string; 00708 } 00709 } 00710 00711 /* ------------------------------------------------------------------------- 00712 * parseGetGrapheme: get the grapheme node of a given word index 00713 * 00714 * parameters: 00715 * chunker = the current chunker 00716 * index = index of a word in the parse. 00717 * 00718 * returns: the arc. 00719 */ 00720 GraphemNode parseGetGrapheme(Chunker chunker, int index) 00721 { 00722 LevelValue lv; 00723 00724 if (index == -1 || index == INT_MAX) { 00725 return NULL; 00726 } 00727 00728 lv = parseGetLevelValue(chunker, index); 00729 00730 if (!lv) { /* never reach */ 00731 cdgPrintf(CDG_ERROR, "ERROR: can't get grapheme at %d\n", index); 00732 abort(); 00733 } 00734 return lv->modifier->grapheme; 00735 } 00736 00737 00738 /* ------------------------------------------------------------------------- 00739 * getFakeChunkType 00740 */ 00741 ChunkType getFakeChunkType(Chunker chunker, int index) 00742 { 00743 String cat = parseGetCategory(chunker, index); 00744 00745 if (cat == strRegister("NN") || 00746 cat == strRegister("NE") || 00747 cat == strRegister("FM") || 00748 cat == strRegister("CARD") || 00749 cat == strRegister("PDS") || 00750 cat == strRegister("PPER") || 00751 cat == strRegister("PRF") || 00752 cat == strRegister("PRELS") || 00753 cat == strRegister("PPOSS") || 00754 cat == strRegister("PWS") || 00755 cat == strRegister("PIS") || 00756 cat == strRegister("ADJA") || 00757 cat == strRegister("KON") || 00758 cat == strRegister("TRUNC")) { 00759 return NChunk; 00760 } 00761 00762 if (cat == strRegister("VVINF") || 00763 cat == strRegister("VVIZU") || 00764 cat == strRegister("VVFIN") || 00765 cat == strRegister("VVPP") || 00766 cat == strRegister("VMINF") || 00767 cat == strRegister("VMFIN") || 00768 cat == strRegister("VMPP") || 00769 cat == strRegister("VAINF") || 00770 cat == strRegister("VAFIN") || 00771 cat == strRegister("VAPP") || 00772 cat == strRegister("PTKVZ") || 00773 cat == strRegister("PTKZU")) { 00774 return VChunk; 00775 } 00776 00777 if (cat == strRegister("APPR") || 00778 cat == strRegister("APPRART") || 00779 cat == strRegister("APPO")) { 00780 return PChunk; 00781 } 00782 00783 if (cat == strRegister("KOKOM") || 00784 cat == strRegister("PROAV") || 00785 cat == strRegister("ADV") || 00786 cat == strRegister("ADJD") || 00787 cat == strRegister("KON")) { 00788 return NoChunk; 00789 } 00790 00791 return UnknownChunk; 00792 } 00793 00794 /* ------------------------------------------------------------------------- 00795 * mergeChunk: add the source to the target chunk. the target chunk spans 00796 * the words of both chunks. 00797 * 00798 * parameters: 00799 * chunker = the current chunker 00800 * target = the resulting chunk 00801 * source = the chunk to be added to the target 00802 * 00803 * returns: the target chunk. 00804 */ 00805 Chunk mergeChunk(Chunker chunker, Chunk target, Chunk source) 00806 { 00807 List l; 00808 00809 #ifdef DEBUG_GETCHUNKSAT 00810 cdgPrintf(CDG_DEBUG, "DEBUG: merging chunk %s <%s,%s> to target chunk %s <%s,%s>\n", 00811 chunkerStringOfChunkType(source), source->from->arc->word, source->to->arc->word, 00812 chunkerStringOfChunkType(target), target->from->arc->word, target->to->arc->word); 00813 #endif 00814 00815 /* enlarge the span */ 00816 if (target->to->arc->to == source->from->arc->from) { 00817 target->to = source->to; 00818 } 00819 if (source->to->arc->to == target->from->arc->from) { 00820 target->from = source->from; 00821 } 00822 00823 for (l = source->nodes; l; l = listNext(l)) { 00824 Arc arc = listElement(l); 00825 target->nodes = listInsertSorted(target->nodes, arc, cmpGraphemes); 00826 } 00827 00828 /* add all sub chunks to the currently available subchunks */ 00829 for (l = source->subChunks; l; l = listNext(l)) { 00830 Chunk clone = chunkerCloneChunk(listElement(l)); 00831 target->subChunks = 00832 listInsertSortedWithData(target->subChunks, clone, cmpChunks, chunker); 00833 } 00834 00835 return target; 00836 } 00837 00838 /* ------------------------------------------------------------------------- 00839 * embedChunk: embed chunk as a subchunk into the target chunk 00840 * 00841 * parameters 00842 * target = the resulting chunk 00843 * source = the chunk to be embedded 00844 * returns: the target chunk. 00845 */ 00846 Chunk embedChunk(Chunker chunker, Chunk target, Chunk source) 00847 { 00848 List l; 00849 00850 #ifdef DEBUG_GETCHUNKSAT 00851 cdgPrintf(CDG_DEBUG, 00852 "DEBUG: embedding chunk %s <%s,%s> to target chunk %s <%s,%s>\n", 00853 chunkerStringOfChunkType(source), 00854 source->from->arc->word, source->to->arc->word, 00855 chunkerStringOfChunkType(target), 00856 target->from->arc->word, target->to->arc->word); 00857 #endif 00858 if (target->to->arc->to == source->from->arc->from) { 00859 target->to = source->to; 00860 } 00861 if (source->to->arc->to == target->from->arc->from) { 00862 target->from = source->from; 00863 } 00864 for (l = source->nodes; l; l = listNext(l)) { 00865 Arc arc = listElement(l); 00866 target->nodes = listInsertSorted(target->nodes, arc, cmpArcs); 00867 } 00868 00869 target->subChunks = 00870 listInsertSortedWithData(target->subChunks, source, cmpChunks, chunker); 00871 00872 return target; 00873 } 00874 00875 /* ------------------------------------------------------------------------- 00876 * get the chunks under the given root node. 00877 * 00878 * @param chunker the current chunker 00879 * @param parent dominating chunk 00880 * @param index index of the root node in the dependency tree 00881 */ 00882 List getFakeChunksAt(Chunker chunker, Chunk parent, int index) 00883 { 00884 List chunks = NULL; 00885 List children, leftChildren, rightChildren; 00886 List l; 00887 Chunk thisChunk, chunk; 00888 GraphemNode thisNode; 00889 00890 /* inline function ------------------------------------------------------- */ 00891 void doit(List children) { 00892 List l, m; 00893 00894 for (l = children; l; l = listNext(l)) { 00895 int childIndex = (int)listElement(l); 00896 String childLabel = parseGetLabel(chunker, childIndex); 00897 List childChunks = getFakeChunksAt(chunker, thisChunk, childIndex); 00898 00899 #ifdef DEBUG_GETCHUNKSAT 00900 cdgPrintf(CDG_DEBUG, "DEBUG: childChunks = "); 00901 for (m = childChunks; m; m = listNext(m)) { 00902 chunk = listElement(m); 00903 cdgPrintf(CDG_DEBUG, "%s <%s,%s> " , 00904 chunkerStringOfChunkType(chunk), 00905 chunk->from->arc->word, chunk->to->arc->word); 00906 } 00907 cdgPrintf(CDG_DEBUG, "\n"); 00908 #endif 00909 00910 /* integrate chunks to current chunk */ 00911 for (m = childChunks; m; m = listNext(m)) { 00912 chunk = listElement(m); 00913 00914 if ((thisChunk->to->arc->to == chunk->from->arc->from || /* check adjacency */ 00915 chunk->to->arc->to == thisChunk->from->arc->from) && 00916 chunk->parent == thisChunk && /* check directly domination */ 00917 childLabel != strRegister("GMOD") /* never integrate GMODs */ && 00918 childLabel != strRegister("ADV") /* never integrate ADVs */ 00919 ) { 00920 00921 /* NChunks */ 00922 if (thisChunk->type == NChunk && 00923 (chunk->type == UnknownChunk || 00924 chunk->type == NChunk)) { 00925 #ifdef DEBUG_GETCHUNKSAT 00926 cdgPrintf(CDG_DEBUG, "DEBUG: appyling NC rule\n"); 00927 #endif 00928 mergeChunk(chunker, thisChunk, chunk); 00929 chunkerChunkDelete(chunk); 00930 continue; 00931 } 00932 00933 /* PChunks */ 00934 if (thisChunk->type == PChunk) { 00935 /* adjectives */ 00936 if(chunk->type == UnknownChunk) { 00937 #ifdef DEBUG_GETCHUNKSAT 00938 cdgPrintf(CDG_DEBUG, "DEBUG: appyling PC rule for XCs\n"); 00939 #endif 00940 00941 mergeChunk(chunker, thisChunk, chunk); 00942 chunkerChunkDelete(chunk); 00943 continue; 00944 } 00945 00946 /* embedded NPs */ 00947 if (chunk->type == NChunk) { 00948 #ifdef DEBUG_GETCHUNKSAT 00949 cdgPrintf(CDG_DEBUG, "DEBUG: appyling PC rule for NCs\n"); 00950 #endif 00951 embedChunk(chunker, thisChunk, chunk); 00952 continue; 00953 } 00954 00955 /* embedded PPs, e.g. bis zum Frühjahr */ 00956 if (chunk->type == PChunk && thisChunk->from == thisChunk->to) { 00957 #ifdef DEBUG_GETCHUNKSAT 00958 cdgPrintf(CDG_DEBUG, "DEBUG: appyling PC rule for PCs\n"); 00959 #endif 00960 mergeChunk(chunker, thisChunk, chunk); 00961 chunkerChunkDelete(chunk); 00962 continue; 00963 } 00964 } 00965 00966 /* VChunks */ 00967 if (thisChunk->type == VChunk && chunk->type == VChunk) { 00968 #ifdef DEBUG_GETCHUNKSAT 00969 cdgPrintf(CDG_DEBUG, "DEBUG: appyling VC rule\n"); 00970 #endif 00971 mergeChunk(chunker, thisChunk, chunk); 00972 chunkerChunkDelete(chunk); 00973 continue; 00974 } 00975 00976 #ifdef DEBUG_GETCHUNKSAT 00977 cdgPrintf(CDG_DEBUG, "DEBUG: no rule applies\n"); 00978 #endif 00979 } 00980 #ifdef DEBUG_GETCHUNKSAT 00981 else { 00982 cdgPrintf(CDG_DEBUG, "DEBUG: not adjacent\n"); 00983 } 00984 #endif 00985 00986 /* keep separate */ 00987 #ifdef DEBUG_GETCHUNKSAT 00988 cdgPrintf(CDG_DEBUG, "DEBUG: cannot add chunk %s <(%d,%d)%s,(%d,%d)%s> to target chunk %s <(%d,%d)%s,(%d,%d)%s>\n", 00989 chunkerStringOfChunkType(chunk), 00990 chunk->from->arc->from, chunk->from->arc->to, 00991 chunk->from->arc->word, 00992 chunk->to->arc->from, chunk->to->arc->to, 00993 chunk->to->arc->word, 00994 chunkerStringOfChunkType(thisChunk), 00995 thisChunk->from->arc->from, thisChunk->from->arc->to, 00996 thisChunk->from->arc->word, 00997 thisChunk->to->arc->from, thisChunk->to->arc->to, 00998 thisChunk->to->arc->word); 00999 #endif 01000 chunks = listInsertSortedWithData(chunks, chunk, cmpChunks, chunker); 01001 } 01002 listDelete(childChunks); 01003 } 01004 } 01005 /* ------------------------------------------------------------------------ */ 01006 /* ------------------------------------------------------------------------ */ 01007 01008 /* security first */ 01009 if (index == -1 || index == INT_MAX) { 01010 return NULL; 01011 } 01012 01013 /* construct a chunk for the current arc */ 01014 thisNode = parseGetGrapheme(chunker, index); 01015 thisChunk = newChunk(getFakeChunkType(chunker, index)); 01016 thisChunk->parent = parent; 01017 thisChunk->from = thisNode; 01018 thisChunk->to = thisNode; 01019 thisChunk->head = thisNode; 01020 thisChunk->nodes = listAppendElement(NULL, thisNode); 01021 children = parseGetModifiers(chunker->parse, chunker->mainlevel, index); 01022 01023 #ifdef DEBUG_GETCHUNKSAT 01024 cdgPrintf(CDG_DEBUG, "DEBUG: decending at %s\n", 01025 thisChunk->from->arc->word); 01026 #endif 01027 01028 01029 /* end of recursion: each leaf node is its own chunk */ 01030 if (!children) { 01031 #ifdef DEBUG_GETCHUNKSAT 01032 cdgPrintf(CDG_DEBUG, "DEBUG: returning from %s (leaf reached)\n", 01033 thisChunk->from->arc->word); 01034 #endif 01035 return listAppendElement(NULL, thisChunk); 01036 } 01037 01038 /* separate modifiers left and right of the current index */ 01039 leftChildren = rightChildren = NULL; 01040 for (l = children; l; l = listNext(l)) { 01041 int childIndex = (int)listElement(l); 01042 GraphemNode gn = parseGetGrapheme(chunker, childIndex); 01043 01044 /* left */ 01045 if (thisChunk->from->arc->from >= gn->arc->to) { 01046 leftChildren = listAppendElement(leftChildren, (Pointer)childIndex); 01047 } 01048 01049 /* right */ 01050 else { 01051 rightChildren = listAppendElement(rightChildren, (Pointer)childIndex); 01052 } 01053 } 01054 listDelete(children); 01055 01056 /* process children */ 01057 01058 { /* descent at the left in reverse order */ 01059 List reverseList = NULL; 01060 /* TODO: rework on new ACL with listReverse */ 01061 #if 0 01062 reverseList = listReverse(leftChildren); 01063 #else 01064 for (l = leftChildren; l; l = listNext(l)) { 01065 reverseList = listPrependElement(reverseList, listElement(l)); 01066 } 01067 #endif 01068 listDelete(leftChildren); 01069 leftChildren = reverseList; 01070 } 01071 01072 doit(leftChildren); 01073 doit(rightChildren); 01074 01075 /* add this constructed chunk to the list if all chunks we have sofar */ 01076 chunks = listInsertSortedWithData(chunks, thisChunk, cmpChunks, chunker); 01077 01078 #ifdef DEBUG_GETCHUNKSAT 01079 cdgPrintf(CDG_DEBUG, "DEBUG: returning from %s\n", 01080 thisChunk->from->arc->word); 01081 #endif 01082 return chunks; 01083 } 01084 01085 /* ------------------------------------------------------------------------- 01086 * postProcessChunks: get rid of unwanted chunks. 01087 * 01088 * parameters: 01089 * inputList = items to be filtered 01090 */ 01091 void postProcessChunks(Chunker chunker, List inputList) 01092 { 01093 List l; 01094 Chunk chunk; 01095 String cat; 01096 01097 for (l = inputList; l; l = listNext(l)) { 01098 chunk = listElement(l); 01099 cat = getCategory(chunk->from); 01100 if ((chunk->from == chunk->to 01101 && (cat == strRegister("PTKZU") || 01102 cat == strRegister("KON"))) 01103 || chunk->type == UnknownChunk) { 01104 #if 0 01105 cdgPrintf(CDG_DEBUG, "DEBUG: filtered chunk %s <%s, %s>\n", 01106 chunkerStringOfChunkType(chunk), 01107 chunk->from->arc->word, chunk->to->word); 01108 #endif 01109 chunk->type = NoChunk; 01110 } 01111 } 01112 } 01113 01114 /* ------------------------------------------------------------------------- 01115 * this is the entry function to the fake chunker. 01116 * 01117 * @param chunker the current chunker to be used 01118 * @returns a list of chunks 01119 */ 01120 List getFakeChunks(Chunker chunker) 01121 { 01122 List chunks = NULL; 01123 List result = NULL; 01124 List l, m, roots; 01125 int index; 01126 01127 roots = parseGetRoots(chunker); 01128 for (l = roots; l; l = listNext(l)) { 01129 index = (int)listElement(l); 01130 chunks = getFakeChunksAt(chunker, NULL, index); 01131 postProcessChunks(chunker, chunks); 01132 for (m = chunks; m; m = listNext(m)) { 01133 result = 01134 listInsertSortedWithData(result, listElement(m), cmpChunks, chunker); 01135 } 01136 listDelete(chunks); 01137 } 01138 01139 listDelete(roots); 01140 return result; 01141 } 01142 01143 /* ------------------------------------------------------------------------- 01144 * this is the entry function to the real chunker. 01145 * 01146 * @param chunker the current chunker to be used 01147 * @returns a list of chunks 01148 * 01149 * Writes to the chunker, reads from the chunker and builds a list of 01150 * chunks. 01151 */ 01152 List getChunks(Chunker chunker) 01153 { 01154 int i, n; 01155 List l; 01156 List chunks = NULL; 01157 String str; 01158 char buffer[MAXBUFFER]; 01159 List outputs = NULL; 01160 String bufptr; 01161 String token; 01162 Vector gns; 01163 01164 /* write to chunker */ 01165 #ifdef DEBUG_GETCHUNKS 01166 cdgPrintf(CDG_DEBUG, "DEBUG: writing to chunker:\n"); 01167 #endif 01168 for (i = 0; i < vectorSize(chunker->lg->graphemnodes); i++) { 01169 GraphemNode gn = vectorElement(chunker->lg->graphemnodes, i); 01170 String cat = getCategory(gn); 01171 str = strPrintf("%s\t%s\n", gn->arc->word, cat); 01172 n = strlen(str); 01173 #ifdef DEBUG_GETCHUNKS 01174 cdgPrintf(CDG_DEBUG, str); 01175 #endif 01176 if(write(chunker->pipe1[1], str, n) != n) { 01177 cdgPrintf(CDG_ERROR, "ERROR: while writing %s(%d) to chunker: %s\n", 01178 str, n, strerror(errno)); 01179 } 01180 cdgFreeString(str); 01181 } 01182 #ifdef DEBUG_GETCHUNKS 01183 cdgPrintf(CDG_DEBUG, "\n"); 01184 #endif 01185 01186 /* finish with an empty line */ 01187 str = strRegister("\n"); 01188 if(write(chunker->pipe1[1], str, 1) != 1) { 01189 cdgPrintf(CDG_ERROR, "ERROR: while writing a newline to chunker: %s\n", 01190 strerror(errno)); 01191 } 01192 cdgFreeString(str); 01193 01194 close(chunker->pipe1[1]); /* is that a good idea ??? */ 01195 01196 /* read from chunker */ 01197 for (i = 0; i < vectorSize(chunker->lg->graphemnodes); i++) { 01198 n = read(chunker->pipe2[0], buffer, MAXBUFFER); 01199 if(n < 0) { 01200 cdgPrintf(CDG_ERROR, "ERROR: while reading from chunker: %s\n", 01201 strerror(errno)); 01202 break; 01203 } 01204 if (n == 0) 01205 break; 01206 buffer[n] = '\0'; 01207 outputs = listAppendElement(outputs, strRegister(buffer)); 01208 #if 0 01209 cdgPrintf(CDG_DEBUG, "DEBUG: reading\n%s\n", buffer); 01210 #endif 01211 } 01212 close(chunker->pipe2[0]); /* is this a good idea ??? */ 01213 01214 if(!outputs) { 01215 return NULL; 01216 } 01217 01218 /* concatenate the outputs */ 01219 str = strCopy(strFromList(outputs)); /* allocate memory */ 01220 listForEachDelete(outputs, cdgFreeString); 01221 01222 /* split into lines */ 01223 bufptr = str; 01224 outputs = NULL; 01225 token = strtok_r(str, "\n", &bufptr); 01226 while (token) { 01227 outputs = listAppendElement(outputs, strRegister(token)); 01228 token = strtok_r(NULL, "\n", &bufptr); 01229 } 01230 memFree(str); /* allocated by strCopy */ 01231 01232 #if 0 01233 for (l = outputs; l; l = listNext(l)) { 01234 cdgPrintf(CDG_DEBUG, "DEBUG: %s\n", listElement(l)); 01235 } 01236 #endif 01237 01238 /* collect graphemnodes */ 01239 gns = vectorNew(vectorSize(chunker->lg->graphemnodes)); 01240 for (i = 0; i < vectorSize(chunker->lg->graphemnodes); i++) { 01241 GraphemNode gn = vectorElement(chunker->lg->graphemnodes, i); 01242 vectorSetElement(gns, gn, gn->arc->from); 01243 } 01244 01245 /* 01246 * convert output to chunks 01247 * line format: <wordform>\t<chunk-type>\t<chunk-from>\t<chunk-to>\t<chunk-head> 01248 */ 01249 for (l = outputs; l; l = listNext(l)) { 01250 int from = -1, to = -1, head = -1; 01251 char tag[3]; 01252 Chunk chunk = NULL; 01253 01254 /* get data */ 01255 if (sscanf(listElement(l), "%s\t%2s\t%d\t%d\t%d", 01256 buffer, tag, &from, &to, &head) != 5) { 01257 cdgPrintf(CDG_WARNING, "WARNING: unexpected line format '%s'\n", 01258 listElement(l)); 01259 continue; 01260 } 01261 01262 from--; 01263 to--; 01264 head--; 01265 01266 #ifdef DEBUG_GETCHUNKS 01267 cdgPrintf(CDG_DEBUG, "DEBUG: %s\t%s\t%d\t%d\t%d\n", 01268 buffer, tag, from, to, head); 01269 #endif 01270 /* search matching chunk */ 01271 chunk = findChunk(chunks, from, to); 01272 if (!chunk) { 01273 /* new chunk */ 01274 chunk = newChunk(chunkerChunkTypeOfString(tag)); 01275 chunks = listAppendElement(chunks, chunk); 01276 chunk->from = vectorElement(gns, from); 01277 chunk->to = vectorElement(gns, to); 01278 chunk->head = vectorElement(gns, head); 01279 for (i = from; i <= to; i++) { 01280 chunk->nodes = listAppendElement(chunk->nodes, vectorElement(gns, i)); 01281 } 01282 continue; 01283 } 01284 01285 /* already contained */ 01286 if (chunk->from->arc->from == from && chunk->to->arc->from == to) 01287 continue; 01288 01289 /* create a sub chunk inside chunk */ 01290 if (chunk->from->arc->from <= from && 01291 chunk->to->arc->from >= to) { 01292 Chunk subChunk = newChunk(chunkerChunkTypeOfString(tag)); 01293 subChunk->from = vectorElement(gns, from); 01294 subChunk->to = vectorElement(gns, to); 01295 subChunk->head = vectorElement(gns, head); 01296 for (i = from; i <= to; i++) { 01297 subChunk->nodes = listAppendElement(subChunk->nodes, vectorElement(gns, i)); 01298 } 01299 #ifdef DEBUG_GETCHUNKS 01300 cdgPrintf(CDG_DEBUG, 01301 "DEBUG: creating a subchunk <%s:%d-%d> inside <%s:%d-%d>\n", 01302 tag, from, to, 01303 chunkerStringOfChunkType(chunk), 01304 chunk->from->arc->from, chunk->to->arc->from); 01305 #endif 01306 chunk->subChunks = 01307 listInsertSortedWithData(chunk->subChunks, subChunk, cmpChunks, chunker); 01308 continue; 01309 } 01310 01311 /* never reach */ 01312 cdgPrintf(CDG_ERROR, "ERROR: programming error\n"); 01313 abort(); 01314 } 01315 01316 /* memory tinji */ 01317 listForEachDelete(outputs, cdgFreeString); 01318 vectorDelete(gns); 01319 01320 return chunks; 01321 } 01322 01323 /* ------------------------------------------------------------------------- 01324 * count the number of chunks. 01325 * 01326 * @param chunks list of chunks to be counted 01327 * @returns amount of chunks 01328 * 01329 * This not only counts the list length but also all sub chunks. 01330 */ 01331 int countChunks(List chunks) 01332 { 01333 List l; 01334 int counter = 0; 01335 01336 for (l = chunks; l; l = listNext(l)) { 01337 Chunk chunk = listElement(l); 01338 if (chunk->type == NChunk || 01339 chunk->type == PChunk || 01340 chunk->type == VChunk) { 01341 counter++; 01342 } 01343 counter += countChunks(chunk->subChunks); 01344 } 01345 01346 return counter; 01347 } 01348 01349 /* ------------------------------------------------------------------------- 01350 * search the chunk that spans over the given indices. 01351 * 01352 * @param chunks a list of chunks 01353 * @param from the starting point of the span 01354 * @param to the ending point of the span 01355 * @returns the found Chunk or NULL if the specified span has not been 01356 * chunked so far 01357 * 01358 * This is a recursive function: 01359 * - returns NULL if chunks is empty 01360 * - returns NULL if <from-to> is not spanned by any chunk 01361 * - returns chunk X if it spans exactly <from-to> 01362 * - if a chunk X spans more than <from-to> 01363 * - return X if there's no exaclty spanning sub chunk Y 01364 * - or the found sub chunk Y inside X 01365 */ 01366 Chunk findChunk(List chunks, int from, int to) 01367 { 01368 List l; 01369 01370 for (l = chunks; l; l = listNext(l)) { 01371 Chunk chunk = listElement(l); 01372 01373 /* already spanned */ 01374 if (chunk->from->arc->from == from && chunk->to->arc->from == to) { 01375 #if DEBUG_FINDCHUNK 01376 cdgPrintf(CDG_DEBUG, "DEBUG: found in "); 01377 printChunk(CDG_DEBUG, chunk); 01378 cdgPrintf(CDG_DEBUG, "\n"); 01379 #endif 01380 return chunk; 01381 } 01382 01383 /* check sub chunk */ 01384 if (chunk->from->arc->from <= from && chunk->to->arc->from >= to) { 01385 Chunk subChunk; 01386 01387 #if DEBUG_FINDCHUNK 01388 cdgPrintf(CDG_DEBUG, "DEBUG: searching sub chunk\n"); 01389 #endif 01390 subChunk = findChunk(chunk->subChunks, from, to); 01391 if (subChunk) { 01392 #if DEBUG_FINDCHUNK 01393 cdgPrintf(CDG_DEBUG, "DEBUG: found in sub chunk "); 01394 printChunk(CDG_DEBUG, subChunk); 01395 cdgPrintf(CDG_DEBUG, "\n"); 01396 #endif 01397 return subChunk; 01398 } else { 01399 #if DEBUG_FINDCHUNK 01400 cdgPrintf(CDG_DEBUG, "DEBUG: found, no sub chunk, using "); 01401 printChunk(CDG_DEBUG, chunk); 01402 cdgPrintf(CDG_DEBUG, "\n"); 01403 #endif 01404 return chunk; 01405 } 01406 } 01407 } 01408 01409 #if DEBUG_FINDCHUNK 01410 cdgPrintf(CDG_DEBUG, "DEBUG: not found\n"); 01411 #endif 01412 01413 return NULL; 01414 } 01415 01416 /* ------------------------------------------------------------------------- 01417 * printChunk: print a single chunk and all its subchunks 01418 * 01419 * parameters: 01420 * mode = print mode, e.g. CDG_INFO 01421 * chunk = the chunk to be printed 01422 */ 01423 void printChunk(unsigned long mode, Chunk chunk) 01424 { 01425 GraphemNode from, to; 01426 List l,ll; 01427 List subChunks = NULL; 01428 if (!chunk) 01429 return; 01430 01431 if (chunk->type != NoChunk) 01432 cdgPrintf(mode, "[%s ", chunkerStringOfChunkType(chunk)); 01433 01434 /* get span of sub chunks */ 01435 if (chunk->subChunks) { 01436 subChunks = chunk->subChunks; 01437 from = ((Chunk)listElement(subChunks))->from; 01438 to = ((Chunk)listLastElement(subChunks))->to; 01439 } else { 01440 from = to = NULL; 01441 } 01442 01443 /* print all words spanning the current chunk */ 01444 l = chunk->nodes; 01445 while (l) { 01446 GraphemNode gn = listElement(l); 01447 01448 /* ... which arent part of a subchunk */ 01449 if (!subChunks || 01450 gn->arc->to <= from->arc->from || gn->arc->from >= to->arc->to) { 01451 List cats = 01452 #if 0 01453 getCategories(gn); 01454 #else 01455 listAppendElement(NULL, getCategory(gn)); 01456 #endif 01457 cdgPrintf(mode, "%s%s/", 01458 (gn == chunk->head && chunk->type != NoChunk)?"*":"", gn->arc->word); 01459 for (ll = cats; ll; ll = listNext(ll)) { 01460 String cat = listElement(ll); 01461 cdgPrintf(mode, "%s%s", cat, listNext(ll)?"/":""); 01462 } 01463 listDelete(cats); 01464 l = listNext(l); 01465 } else { 01466 Chunk subChunk = listElement(subChunks); 01467 subChunks = listNext(subChunks); 01468 if (subChunks) { 01469 from = ((Chunk)listElement(subChunks))->from; 01470 } else { 01471 from = to = NULL; 01472 } 01473 printChunk(mode, subChunk); 01474 for (; l; l = listNext(l)) { 01475 gn = listElement(l); 01476 if (gn->arc->from >= subChunk->to->arc->to) 01477 break; 01478 } 01479 } 01480 01481 if (l) 01482 cdgPrintf(mode, " "); 01483 } 01484 01485 if (chunk->type != NoChunk) 01486 cdgPrintf(mode, "]"); 01487 } 01488 01489 /* ------------------------------------------------------------------------- 01490 * print the chunks of a lattice. 01491 * 01492 * @param mode print mode, e.g. CDG_INFO 01493 * @param chunks the list of chunks to be printed 01494 */ 01495 void chunkerPrintChunks(unsigned long mode, List chunks) 01496 { 01497 List l; 01498 01499 if (!chunks) { 01500 return; 01501 } 01502 01503 for (l = chunks; l; l = listNext(l)) { 01504 printChunk(mode, listElement(l)); 01505 cdgPrintf(mode, " "); 01506 } 01507 cdgPrintf(mode, "\n"); 01508 } 01509 01510 /* ------------------------------------------------------------------------- 01511 * return the string representation of a chunk type. 01512 */ 01513 ChunkType chunkerChunkTypeOfString(String tag) 01514 { 01515 if (strcmp(tag, "NC") == 0) { 01516 return NChunk; 01517 } 01518 if (strcmp(tag, "VC") == 0) { 01519 return VChunk; 01520 } 01521 if (strcmp(tag, "PC") == 0) { 01522 return PChunk; 01523 } 01524 return NoChunk; 01525 } 01526 01527 /* ------------------------------------------------------------------------- 01528 * return the string representation of a chunk type. 01529 */ 01530 String chunkerStringOfChunkType(Chunk chunk) 01531 { 01532 if (!chunk) { 01533 return strRegister("(null)"); 01534 } 01535 switch(chunk->type) { 01536 case NChunk: return strRegister("NC"); 01537 case VChunk: return strRegister("VC"); 01538 case PChunk: return strRegister("PC"); 01539 default: return strRegister("XC"); 01540 } 01541 } 01542 01543 /* ------------------------------------------------------------------------- 01544 * compute the chunks. 01545 * 01546 * @param chunker the current chunker 01547 * @returns a list of chunks or NULL on failure. 01548 * 01549 * The returned list is owned by the chunker and thus needs no extra 01550 * deallocation besides chunker deletion. The list of computed chunks 01551 * is then used to assert the chunking information into the LexemGraph lg. 01552 */ 01553 List chunkerChunk(Chunker chunker) 01554 { 01555 List annoChunks = NULL; 01556 List l; 01557 01558 void backlink(Chunk chunk) { 01559 List l; 01560 for (l = chunk->nodes; l; l = listNext(l)) { 01561 GraphemNode gn = listElement(l); 01562 gn->chunk = chunk; 01563 } 01564 for (l = chunk->subChunks; l; l = listNext(l)) { 01565 backlink(listElement(l)); 01566 } 01567 } 01568 01569 if (!chunker) { 01570 return NULL; 01571 } 01572 01573 switch (chunker->mode) { 01574 case RealChunker: 01575 chunker->chunks = getChunks(chunker); 01576 break; 01577 case FakeChunker: 01578 chunker->chunks = getFakeChunks(chunker); 01579 break; 01580 case EvalChunker: 01581 chunker->chunks = getChunks(chunker); 01582 annoChunks = getFakeChunks(chunker); 01583 evalChunker(chunker, annoChunks); 01584 cdgPrintf(CDG_INFO, "\n"); 01585 listForEachDelete(annoChunks, chunkerChunkDelete); 01586 break; 01587 default: /* never reach */ 01588 cdgPrintf(CDG_ERROR, "ERROR: unknown chunker mode\n"); 01589 return NULL; 01590 } 01591 01592 if (!chunker->chunks) 01593 return NULL; 01594 01595 /* enrich the lexemgraph with chunking information. */ 01596 if (chunker->lg->chunks) { 01597 listForEachDelete(chunker->lg->chunks, chunkerChunkDelete); 01598 chunker->lg->chunks = NULL; 01599 } 01600 01601 /* attach chunk graphemes */ 01602 for (l = chunker->chunks; l; l = listNext(l)) { 01603 Chunk chunk = chunkerCloneChunk(listElement(l)); 01604 chunker->lg->chunks = listAppendElement(chunker->lg->chunks, chunk); 01605 chunkerReplaceGraphemes(chunk, chunker->lg); 01606 backlink(chunk); 01607 } 01608 01609 return chunker->chunks; 01610 } 01611 01612 /* ------------------------------------------------------------------------- 01613 * are two chunks isomorph. 01614 * 01615 * @param c1 the one chunk 01616 * @param c2 the other chunk 01617 * @returns TRUE if the two chunks are equal, else FALSE 01618 * 01619 * This is again a recursion where all sub chunks must match aswell. 01620 */ 01621 Boolean compareChunks(Chunk c1, Chunk c2) 01622 { 01623 List l, ll; 01624 01625 if ((c1 && !c2) || (!c1 && c2) || 01626 c1->from->arc->from != c2->from->arc->from || 01627 c1->from->arc->to != c2->from->arc->to || 01628 c1->from->arc->word != c2->from->arc->word || 01629 c1->to->arc->from != c2->to->arc->from || 01630 c1->to->arc->to != c2->to->arc->to || 01631 c1->to->arc->word != c2->to->arc->word || c1->type != c2->type) { 01632 return FALSE; 01633 } 01634 01635 for (l = c1->subChunks, ll = c2->subChunks; l && ll; 01636 l = listNext(l), ll = listNext(ll)) { 01637 if (!compareChunks(listElement(l), listElement(ll))) { 01638 return FALSE; 01639 } 01640 } 01641 01642 if (!l && !ll) 01643 return TRUE; 01644 else 01645 return FALSE; 01646 } 01647 01648 /* ------------------------------------------------------------------------- 01649 * evaluate computed agains annotated chunks. 01650 * 01651 * @param chunker the current chunker 01652 * @param annoChunks the list of chunks extracted from the annotation 01653 * @returns the number of errors 01654 */ 01655 int evalChunker(Chunker chunker, List annoChunks) 01656 { 01657 int noChunks = countChunks(chunker->chunks); 01658 int noAnnoChunks = countChunks(annoChunks); 01659 int noErrors = 0, noUnChunked = 0; 01660 List l; 01661 List errorChunks = NULL; 01662 List unChunked = NULL; 01663 01664 if (noChunks != noAnnoChunks) { 01665 cdgPrintf(CDG_WARNING, "\nWARNING: got %d chunk(s) but was expecting %d\n", 01666 noChunks, noAnnoChunks); 01667 } 01668 01669 /* check computed chunks */ 01670 for (l = chunker->chunks; l; l = listNext(l)) { 01671 Chunk chunk = listElement(l); 01672 Chunk annoChunk = findChunk(annoChunks, chunk->from->arc->from, 01673 chunk->to->arc->from); 01674 01675 if (chunk->type == NoChunk) 01676 continue; 01677 01678 if (!compareChunks(chunk, annoChunk)) 01679 errorChunks = listAppendElement(errorChunks, chunk); 01680 } 01681 01682 noErrors = listSize(errorChunks); 01683 if (noErrors) { 01684 cdgPrintf(CDG_WARNING, "\nWARNING: got %d erroneous chunk(s):\n", noErrors); 01685 for (l = errorChunks; l; l = listNext(l)) { 01686 Chunk chunk = listElement(l); 01687 01688 cdgPrintf(CDG_WARNING, " "); 01689 printChunk(CDG_WARNING, chunk); 01690 cdgPrintf(CDG_WARNING, "\n"); 01691 } 01692 } 01693 01694 /* check anno chunks */ 01695 for (l = annoChunks; l; l = listNext(l)) { 01696 Chunk annoChunk = listElement(l); 01697 Chunk chunk = findChunk(chunker->chunks, annoChunk->from->arc->from, 01698 annoChunk->to->arc->from); 01699 01700 if (annoChunk->type == NoChunk) 01701 continue; 01702 01703 if (annoChunk->type != NChunk && 01704 annoChunk->type != VChunk && annoChunk->type != PChunk) 01705 continue; 01706 01707 if (!compareChunks(chunk, annoChunk)) 01708 unChunked = listAppendElement(unChunked, annoChunk); 01709 } 01710 01711 noUnChunked = listSize(unChunked); 01712 if (noUnChunked) { 01713 cdgPrintf(CDG_WARNING, "\nWARNING: %d chunk(s) not found:\n", noUnChunked); 01714 for (l = unChunked; l; l = listNext(l)) { 01715 Chunk chunk = listElement(l); 01716 01717 cdgPrintf(CDG_WARNING, " "); 01718 printChunk(CDG_WARNING, chunk); 01719 cdgPrintf(CDG_WARNING, "\n"); 01720 } 01721 } 01722 01723 if (!noErrors && !noUnChunked) { 01724 cdgPrintf(CDG_INFO, "\nINFO: chunker agrees with annotations.\n", 01725 noChunks); 01726 } 01727 01728 cdgPrintf(CDG_INFO, "\nINFO: got %d answers of which %d where correct, having %d keys\n", 01729 noChunks, noChunks - noErrors, noAnnoChunks); 01730 01731 listDelete(errorChunks); 01732 listDelete(unChunked); 01733 01734 return noErrors + noUnChunked; 01735 } 01736 01737 /* ------------------------------------------------------------------------- 01738 * find an equivalent grapheme in a given lexemgraph. 01739 * 01740 * parameters: 01741 * lg = the lexemgraph 01742 * arc = the arc (possibly not used in the lexemgraph) 01743 * returns: an equivalent arc. 01744 */ 01745 GraphemNode findGrapheme(LexemGraph lg, GraphemNode old) 01746 { 01747 int i; 01748 01749 if (!old) 01750 return NULL; 01751 01752 for (i = 0; i < vectorSize(lg->graphemnodes); i++) { 01753 GraphemNode gn = vectorElement(lg->graphemnodes, i); 01754 if (gn->arc->from == old->arc->from && 01755 gn->arc->to == old->arc->to) { 01756 return gn; 01757 } 01758 } 01759 01760 return NULL; 01761 } 01762 01763 /* ------------------------------------------------------------------------- 01764 * chunkerReplaceGraphemes: replace all grapheme references in a chunk with those 01765 * given in a lexemgraph. 01766 * 01767 * parameters: 01768 * chunk = the structure using the arcs 01769 * lg = the lexemgraph using equivalent arcs 01770 */ 01771 void chunkerReplaceGraphemes(Chunk chunk, LexemGraph lg) 01772 { 01773 List l; 01774 01775 if (!chunk) 01776 return; 01777 01778 chunk->from = findGrapheme(lg, chunk->from); 01779 chunk->to = findGrapheme(lg, chunk->to); 01780 chunk->head = findGrapheme(lg, chunk->head); 01781 for (l = chunk->nodes; l; l = listNext(l)) { 01782 listSetElement(l, findGrapheme(lg, listElement(l))); 01783 } 01784 01785 for (l = chunk->subChunks; l; l = listNext(l)) { 01786 chunkerReplaceGraphemes(listElement(l), lg); 01787 } 01788 } 01789 01790 /* ------------------------------------------------------------------------- 01791 * validation command for @ref chunkerCommand. 01792 * 01793 * @param name name of the variable (chunkerCommand in our case) 01794 * @param value the value to be set 01795 * @param var the address of a possibly converted value 01796 * @returns TRUE on success 01797 * 01798 * Basically the chunkerCommand value is converted into @ref chunkerArgs 01799 * which are then used when actually forking the chunker command. 01800 * 01801 * @see chunkerInitialize, chunkerArgs 01802 */ 01803 Boolean chunkerCommandValidate(String name, String value, String *var) 01804 { 01805 int i; 01806 01807 /* check resposibility of this validator */ 01808 if (name != strRegister("chunkerCommand")) { 01809 cdgPrintf(CDG_ERROR, "ERROR: chunkerCommand callback does not handle %s\n", 01810 name); 01811 return FALSE; 01812 } 01813 01814 /* get rid of old stuff */ 01815 chunkerArgs = NULL; 01816 if (chunkerArgs) { 01817 for(i = 0; chunkerArgs[i]; i++) { 01818 cdgFreeString(chunkerArgs[i]); 01819 } 01820 memFree(chunkerArgs); 01821 chunkerArgs = NULL; 01822 } 01823 01824 /* set the chunker args */ 01825 { 01826 List args, l; 01827 String myvalue = strCopy(value); /* allocate memory */ 01828 String start, end; 01829 01830 /* loop over the chunkerCommand and list the contained arguments in args */ 01831 args = NULL; 01832 for (start = myvalue; *start; start = end) { 01833 01834 /* skip over whitespace */ 01835 for (;isspace((int)*start); start++); 01836 01837 /* get quoted argument */ 01838 if (*start == '"') { 01839 start++; 01840 01841 /* skip to next quote */ 01842 for (end = start; *end && *end != '"'; end++); 01843 01844 if (!*end) { 01845 cdgPrintf(CDG_ERROR, "ERROR: unbalanced quotes in taggerCommand\n"); 01846 return FALSE; 01847 } 01848 01849 /* terminate the current argument */ 01850 *end = '\0'; 01851 end++; 01852 01853 } 01854 01855 /* get unquoted argument */ 01856 else { 01857 01858 /* skip over whitespace */ 01859 for (end = start; *end && !isspace((int)*end) && *end != '"'; end++); 01860 01861 if (*end == '"') { 01862 cdgPrintf(CDG_ERROR, "ERROR: unbalanced quotes in taggerCommand\n"); 01863 return FALSE; 01864 } 01865 01866 01867 /* terminate the current argument */ 01868 if (*end) { 01869 *end = '\0'; 01870 end++; 01871 } 01872 } 01873 if (start != end) { 01874 args = listAppendElement(args, start); 01875 } 01876 } 01877 01878 /* put all args into the chunkerArgs */ 01879 chunkerArgs = (char **)memMalloc(sizeof(char *)*(listSize(args)+1)); 01880 for (l = args, i = 0; l; l = listNext(l), i++) { 01881 chunkerArgs[i] = strRegister(listElement(l)); 01882 } 01883 chunkerArgs[i] = NULL; 01884 listDelete(args); 01885 memFree(myvalue); /* allocated by strCopy */ 01886 } 01887 01888 #if 0 01889 fprintf(stderr, "chunkerArgs = \n"); 01890 for (i = 0; chunkerArgs[i]; i++) { 01891 fprintf(stderr, "%d: <%s>\n", i, chunkerArgs[i]); 01892 } 01893 fprintf(stderr, "\n"); 01894 #endif 01895 01896 01897 return TRUE; 01898 } 01899 01900 /* ------------------------------------------------------------------------- 01901 * initialize the chunker module. 01902 * 01903 * This is called only once by @ref cdgInitialize when the application starts 01904 * up. 01905 * 01906 * @see chunkerFinalize 01907 */ 01908 void chunkerInitialize(void) 01909 { 01910 setRegister("chunker", SET_BOOL, &chunkerUseChunker, NULL, NULL, NULL, NULL); 01911 01912 setRegister("chunkerMode", SET_ENUM, &chunkerMode, 01913 NULL, NULL, NULL, 01914 "fake", FakeChunker, 01915 "real", RealChunker, "eval", EvalChunker, NULL); 01916 01917 chunkerCommand = strRegister(""); 01918 setRegister("chunkerCommand", SET_STRING, &chunkerCommand, 01919 NULL, &chunkerCommandValidate, NULL, NULL); 01920 } 01921 01922 /* ------------------------------------------------------------------------- 01923 * finalize the chunker module. 01924 * 01925 * This is called by @ref cdgFinalize. (No good module without a finalizer and 01926 * a initializer.) 01927 * 01928 * @see chunkerInitialize 01929 */ 01930 void chunkerFinalize(void) 01931 { 01932 } 01933 01934 /* ------------------------------------------------------------------------- */ 01935 /* -- ENDOFFILE ------------------------------------------------------------ */ 01936 /** @} */

CDG 0.95 (20 Oct 2004)