00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
#include <ctype.h>
00028
#include <unistd.h>
00029
#include <signal.h>
00030
#include <stdio.h>
00031
#include <errno.h>
00032
#include <limits.h>
00033
#include <string.h>
00034
#include <sys/types.h>
00035
#include <sys/time.h>
00036
#include <sys/resource.h>
00037
#include <sys/wait.h>
00038
#include "cdg.h"
00039
#include "chunker.h"
00040
#include "parse.h"
00041
#include "hook.h"
00042
#include "tagger.h"
00043
#include "set.h"
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055 struct ChunkerStruct {
00056 ChunkerMode mode;
00057 LexemGraph lg;
00058 Parse
parse;
00059 Level
mainlevel;
00060 int nrWords;
00061 int nrLevels;
00062 List
chunks;
00063 char **
args;
00064 pid_t
pid;
00065 int pipe1[2];
00066 int pipe2[2];
00067 };
00068
00069
00070
00071
00072 static Boolean
chunkerUseChunker =
FALSE;
00073
00074
00075 static ChunkerMode chunkerMode =
RealChunker;
00076
00077
00078 static String
chunkerCommand =
NULL;
00079
00080
00081 static char **
chunkerArgs =
NULL;
00082
00083
00084
00085
00086
static Chunk newChunk(ChunkType type);
00087
static Boolean
initChunker(
Chunker chunker);
00088
static Boolean
initFakeChunker(
Chunker chunker);
00089
static Boolean
initRealChunker(
Chunker chunker);
00090
static void resetChunker(
Chunker chunker);
00091
00092
00093
static List
getChunks(
Chunker chunker);
00094
static List
getFakeChunks(
Chunker chunker);
00095
static List
getFakeChunksAt(
Chunker chunker,
Chunk parent,
int index);
00096
static ChunkType getFakeChunkType(
Chunker chunker,
int index);
00097
static void postProcessChunks(
Chunker chunker, List chunks);
00098
static Chunk findChunk(List chunks,
int from,
int to);
00099
static Chunk mergeChunk(
Chunker chunker,
Chunk target,
Chunk source);
00100
static Chunk embedChunk(
Chunker chunker,
Chunk target,
Chunk source);
00101
static int evalChunker(
Chunker chunker, List annoChunks);
00102
static int countChunks(List chunks);
00103
static Boolean
compareChunks(
Chunk c1,
Chunk c2);
00104
00105
00106
static int parseGetModifiee(
Chunker chunker,
int index);
00107
static List
parseGetRoots(
Chunker chunker);
00108
static String
parseGetLabel(
Chunker chunker,
int index);
00109
static LevelValue
parseGetLevelValue(
Chunker chunker,
int index);
00110
static String
parseGetCategory(
Chunker chunker,
int index);
00111
static GraphemNode parseGetGrapheme(
Chunker chunker,
int index);
00112
00113
00114
static Boolean
cmpChunks(
Chunk c1,
Chunk c2,
Chunker chunker);
00115
static Boolean
cmpArcs(Arc arc1, Arc arc2);
00116
static Boolean
cmpGraphemes(
GraphemNode g1,
GraphemNode g2);
00117
static void printChunk(
unsigned long mode,
Chunk chunk);
00118
static GraphemNode findGrapheme(
LexemGraph lg,
GraphemNode gn);
00119
static List
getCategories(
GraphemNode gn);
00120
static String
getCategory(
GraphemNode gn);
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137 int terminateChild(pid_t pid)
00138 {
00139 pid_t res;
00140
00141 res = wait4(pid, (
int *)
NULL, WNOHANG,
NULL);
00142
if (res == pid) {
00143
return 0;
00144 }
00145
if (res == 0) {
00146
00147
if (-1 == kill(pid, SIGTERM)) {
00148
cdgPrintf(
CDG_WARNING,
"WARNING: can't send SIGTERM to %d: %s\n", pid,
00149 strerror(errno));
00150 }
00151 usleep(100000);
00152 res = wait4(pid, (
int *)
NULL, WNOHANG,
NULL);
00153
if (res == pid) {
00154
return 1;
00155 }
00156
if (res == 0) {
00157
if (-1 == kill(pid, SIGKILL)) {
00158
cdgPrintf(
CDG_WARNING,
"WARNING: can't send SIGKILL to %d: %s\n", pid,
00159 strerror(errno));
00160 }
00161 usleep(100000);
00162 res = wait4(pid, (
int *)
NULL, WNOHANG,
NULL);
00163
if (res == pid) {
00164
return 2;
00165 }
00166 }
00167 }
00168
00169
cdgPrintf(
CDG_ERROR,
"ERROR: can't terminate %d: %s\n", pid,
00170 strerror(errno));
00171
00172
return -1;
00173 }
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190 Chunker chunkerNew(ChunkerMode mode,
LexemGraph lg)
00191 {
00192
Chunker chunker;
00193
int i;
00194
00195
if (!
chunkerUseChunker)
00196
return NULL;
00197
00198
if (mode ==
DefaultChunker)
00199 mode =
chunkerMode;
00200
00201
if (mode !=
FakeChunker) {
00202
if(!
chunkerCommand || strlen(
chunkerCommand) == 0) {
00203
cdgPrintf(
CDG_ERROR,
"ERROR: no chunker command defined ... switching off the chunker\n");
00204
chunkerUseChunker =
FALSE;
00205
return NULL;
00206 }
00207 }
00208
00209 chunker = (
Chunker) memMalloc(
sizeof (
ChunkerStruct));
00210 chunker->
mode = mode;
00211 chunker->
lg = lg;
00212 chunker->
parse =
NULL;
00213 chunker->
nrWords = 0;
00214 chunker->
nrLevels = 0;
00215 chunker->
chunks =
NULL;
00216 chunker->
pid = -1;
00217 chunker->
args =
NULL;
00218
00219
00220
if(
chunkerArgs) {
00221
for (i = 0;
chunkerArgs[i]; i++)
00222 chunker->
args = (
char **)memMalloc(
sizeof (
char *) * (i + 1));
00223
for (i = 0;
chunkerArgs[i]; i++) {
00224 chunker->
args[i] = strRegister(
chunkerArgs[i]);
00225 }
00226 chunker->
args[i] =
NULL;
00227 }
00228
return (
initChunker(chunker)?chunker:
NULL);
00229 }
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240 Boolean
initChunker(
Chunker chunker)
00241 {
00242
switch (chunker->
mode) {
00243
case FakeChunker:
00244
return initFakeChunker(chunker);
00245
break;
00246
case RealChunker:
00247
return initRealChunker(chunker);
00248
case EvalChunker:
00249
return initFakeChunker(chunker) &&
initRealChunker(chunker);
00250
break;
00251
default:
00252
cdgPrintf(
CDG_ERROR,
"ERROR: unknown chunker mode\n");
00253
break;
00254 }
00255
00256
return FALSE;
00257 }
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272 Boolean
initFakeChunker(
Chunker chunker)
00273 {
00274 Level mainlevel = inputGetMainlevel(inputCurrentGrammar);
00275 AnnoEntry anno = findAnnoForLattice(chunker->
lg->
lattice,
TRUE);
00276
00277
00278
if (!anno) {
00279
cdgPrintf(
CDG_ERROR,
00280
"ERROR: no annotation for lattice %s found needed in fake mode\n",
00281 chunker->
lg->
lattice->id);
00282
return FALSE;
00283 }
00284
00285
00286
if (!mainlevel) {
00287
cdgPrintf(
CDG_ERROR,
"ERROR: no mainlevel defined\n");
00288
return FALSE;
00289 }
00290
00291
00292
if (latticeBranches(chunker->
lg->
lattice)) {
00293
cdgPrintf(
CDG_ERROR,
00294
"ERROR: sorry, chunking only works for linear lattices right now\n");
00295
return FALSE;
00296 }
00297
00298
00299 chunker->
parse = parseFromAnno(anno);
00300
if (!chunker->
parse) {
00301
resetChunker(chunker);
00302
return FALSE;
00303 }
00304
00305
00306
if (!parseDecorate(chunker->
parse, chunker->
lg, anno)) {
00307
cdgPrintf(
CDG_ERROR,
00308
"ERROR: could not decorate parse for annotation `%s'.\n",
00309 anno->id);
00310
resetChunker(chunker);
00311
return FALSE;
00312 }
00313
00314 chunker->
nrWords = vectorSize(chunker->
parse->words);
00315 chunker->
nrLevels = listSize(chunker->
parse->levels);
00316 chunker->
mainlevel = mainlevel;
00317
00318
return TRUE;
00319 }
00320
00321
00322
00323
00324
00325
00326
00327
00328
00329
00330
00331
00332 Boolean
initRealChunker(
Chunker chunker)
00333 {
00334
00335 chunker->
mainlevel = inputGetMainlevel(inputCurrentGrammar);
00336
if (!chunker->
mainlevel) {
00337
cdgPrintf(
CDG_ERROR,
"ERROR: no mainlevel defined\n");
00338
return FALSE;
00339 }
00340
00341
00342
00343
00344
if (pipe(chunker->
pipe1) < 0 || pipe(chunker->
pipe2) < 0) {
00345
cdgPrintf(
CDG_ERROR,
"ERROR: can't create pipes\n");
00346
return FALSE;
00347 }
00348
00349
#if 0
00350
cdgPrintf(
CDG_DEBUG,
"DEBUG: forking %s ... \n", chunker->
args[0]);
00351
#endif
00352
00353
00354
if ((chunker->
pid = fork()) < 0) {
00355
cdgPrintf(
CDG_ERROR,
"ERROR: can't fork: %s\n", strerror(errno));
00356
return FALSE;
00357 }
00358
00359
if (chunker->
pid) {
00360
00361 close(chunker->
pipe1[0]);
00362 close(chunker->
pipe2[1]);
00363
#if 0
00364
cdgPrintf(
CDG_DEBUG,
"DEBUG: process %d started.\n", chunker->
pid);
00365
#endif
00366
}
else {
00367
00368 close(chunker->
pipe1[1]);
00369 close(chunker->
pipe2[0]);
00370 signal(SIGINT, SIG_DFL);
00371 signal(SIGTERM, SIG_DFL);
00372 signal(SIGXCPU, SIG_DFL);
00373
if (chunker->
pipe1[0] != STDIN_FILENO) {
00374
if (dup2(chunker->
pipe1[0], STDIN_FILENO) != STDIN_FILENO) {
00375 fprintf(stderr,
"WARNING: child can't dup2 stdin: %s\n",
00376 strerror(errno));
00377 close(chunker->
pipe1[0]);
00378 }
00379 }
00380
if (chunker->
pipe2[1] != STDOUT_FILENO) {
00381
if (dup2(chunker->
pipe2[1], STDOUT_FILENO) != STDOUT_FILENO) {
00382 fprintf(stderr,
"WARNING: child can't dup2 stdout: %s\n",
00383 strerror(errno));
00384 close(chunker->
pipe2[1]);
00385 }
00386 }
00387 execvp(chunker->
args[0], chunker->
args);
00388
00389
00390 fprintf(stderr,
"ERROR: exec(%s) failed: %s\n",
00391 chunker->
args[0], strerror(errno));
00392 _exit(1);
00393 }
00394
00395
return TRUE;
00396 }
00397
00398
00399
00400
00401
00402
00403
00404 void resetChunker(
Chunker chunker)
00405 {
00406
if (chunker->
parse) {
00407 parseDelete(chunker->
parse);
00408 }
00409 listForEachDelete(chunker->
chunks,
chunkerChunkDelete);
00410 chunker->
lg =
NULL;
00411 chunker->
parse =
NULL;
00412 chunker->
nrWords = 0;
00413 chunker->
nrLevels = 0;
00414 chunker->
chunks =
NULL;
00415
if (chunker->
pid > 0) {
00416 close(chunker->
pipe1[0]);
00417 close(chunker->
pipe1[1]);
00418 close(chunker->
pipe2[0]);
00419 close(chunker->
pipe2[1]);
00420
terminateChild(chunker->
pid);
00421 }
00422
00423
if (chunker->
args) {
00424
int i;
00425
for(i = 0; chunker->
args[i]; i++) {
00426
cdgFreeString(chunker->
args[i]);
00427 }
00428 memFree(chunker->
args);
00429 chunker->
args =
NULL;
00430 }
00431 }
00432
00433
00434
00435
00436
00437
00438
00439 void chunkerDelete(
Chunker chunker)
00440 {
00441
if (!chunker)
00442
return;
00443
00444
resetChunker(chunker);
00445 memFree(chunker);
00446 }
00447
00448
00449
00450
00451
00452
00453
00454 Chunk newChunk(ChunkType type)
00455 {
00456
Chunk chunk;
00457
00458 chunk = (
Chunk)memMalloc(
sizeof(
ChunkStruct));
00459 chunk->
type = type;
00460 chunk->
nodes =
NULL;
00461 chunk->
from =
NULL;
00462 chunk->
to =
NULL;
00463 chunk->
subChunks =
NULL;
00464 chunk->
parent =
NULL;
00465 chunk->
head =
NULL;
00466
00467
return chunk;
00468 }
00469
00470
00471
00472
00473
00474
00475
00476
00477 Chunk chunkerCloneChunk(
Chunk chunk)
00478 {
00479
Chunk clone;
00480 List l;
00481
00482
if (!chunk)
00483
return NULL;
00484
00485 clone =
newChunk(chunk->
type);
00486 clone->
parent = chunk->
parent;
00487 clone->
from = chunk->
from;
00488 clone->
to = chunk->
to;
00489 clone->
head = chunk->
head;
00490 clone->
nodes = listClone(chunk->
nodes);
00491
for (l = chunk->
subChunks; l; l = listNext(l)) {
00492 clone->
subChunks = listAppendElement(clone->
subChunks,
chunkerCloneChunk(listElement(l)));
00493 }
00494
00495
return clone;
00496 }
00497
00498
00499
00500
00501
00502
00503
00504 void chunkerChunkDelete(
Chunk chunk)
00505 {
00506
if (!chunk)
00507
return;
00508
00509 listForEachDelete(chunk->
subChunks,
chunkerChunkDelete);
00510 listDelete(chunk->
nodes);
00511 memFree(chunk);
00512 }
00513
00514
00515
00516
00517 Boolean
cmpGraphemes(
GraphemNode g1,
GraphemNode g2)
00518 {
00519
return (g1->
arc->from < g2->
arc->from);
00520 }
00521
00522
00523
00524
00525 Boolean
cmpArcs(Arc arc1, Arc arc2)
00526 {
00527
return (arc1->from < arc2->from);
00528 }
00529
00530
00531
00532
00533 Boolean
cmpChunks(
Chunk c1,
Chunk c2,
Chunker chunker)
00534 {
00535
return (c1->
from->
arc->from < c2->
from->
arc->from);
00536 }
00537
00538
00539
00540
00541
00542
00543
00544
00545
00546
00547
00548 List
parseGetRoots(
Chunker chunker)
00549 {
00550
int i;
00551 List result =
NULL;
00552
00553
for (i = 0; i < chunker->
nrWords; i ++) {
00554
if (
parseGetModifiee(chunker, i) == -1) {;
00555 result = listAppendElement(result, (Pointer)i);
00556 }
00557 }
00558
00559
return result;
00560 }
00561
00562
00563
00564
00565
00566
00567
00568
00569
00570
00571 int parseGetModifiee(
Chunker chunker,
int index)
00572 {
00573
return (
int)vectorElement(chunker->
parse->verticesStructure,
00574 chunker->
nrLevels * index + chunker->
mainlevel->no);
00575 }
00576
00577
00578
00579
00580
00581
00582
00583
00584
00585 String
parseGetLabel(
Chunker chunker,
int index)
00586 {
00587
return (String) vectorElement(chunker->
parse->verticesLabels,
00588 chunker->
nrLevels * index + chunker->
mainlevel->no);
00589 }
00590
00591
00592
00593
00594
00595
00596
00597
00598
00599
00600 LevelValue
parseGetLevelValue(
Chunker chunker,
int index)
00601 {
00602
if (index == -1 || index == INT_MAX)
return NULL;
00603
return (LevelValue) vectorElement(chunker->
parse->LVs,
00604 inputCurrentGrammar->noOfLevels * index + chunker->
mainlevel->no);
00605 }
00606
00607
00608
00609
00610
00611
00612
00613
00614 String
getCategory(
GraphemNode gn)
00615 {
00616 List cats;
00617 String cat;
00618
00619
if (!gn)
00620
return NULL;
00621
00622 cats =
getCategories(gn);
00623
if (!cats) {
00624
return NULL;
00625 }
00626
00627
#if 0
00628
if (listSize(cats) > 1) {
00629
cdgPrintf(
CDG_WARNING,
00630
"WARNING: got more than one POS-tag\n");
00631 }
00632
#endif
00633
cat = listElement(cats);
00634 listDelete(cats);
00635
00636
return cat;
00637 }
00638
00639
00640
00641
00642
00643
00644
00645
00646 List
getCategories(
GraphemNode gn)
00647 {
00648 List cats =
NULL;
00649 List l;
00650 Value value;
00651 List lns =
NULL;
00652
00653 Boolean lexemSorter(
LexemNode lna,
LexemNode lnb)
00654 {
00655
return (lna->
tagscore > lnb->
tagscore);
00656 }
00657
00658
for (l = gn->
lexemes; l; l = listNext(l)) {
00659
LexemNode ln = listElement(l);
00660
00661
if (bvElement(gn->
lexemgraph->
isDeletedNode, ln->
no))
00662
continue;
00663
00664 value = ln->
lexem->values[taggerCategoryIndex];
00665
if (value->type == VTString) {
00666 lns = listInsertSorted(lns, ln, lexemSorter);
00667 }
00668 }
00669
00670
for (l = lns; l; l = listNext(l)) {
00671
LexemNode ln = listElement(l);
00672 value = ln->
lexem->values[taggerCategoryIndex];
00673 cats = listAddUniqueElement(cats, value->data.string);
00674
#if 0
00675
cdgPrintf(
CDG_DEBUG,
"DEBUG: cat %s = score %g\n",
00676 value->data.string, ln->
tagscore);
00677
#endif
00678
}
00679 listDelete(lns);
00680
00681
00682
return cats;
00683 }
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694 String
parseGetCategory(
Chunker chunker,
int index)
00695 {
00696
if (index == -1 || index == INT_MAX) {
00697
return "";
00698 }
else {
00699 LevelValue lv =
parseGetLevelValue(chunker, index);
00700
LexemNode ln = lv->modifier;
00701 Value value = ln->
lexem->values[taggerCategoryIndex];
00702
00703
if (value->type != VTString) {
00704
return NULL;
00705 }
00706
00707
return value->data.string;
00708 }
00709 }
00710
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720 GraphemNode parseGetGrapheme(
Chunker chunker,
int index)
00721 {
00722 LevelValue lv;
00723
00724
if (index == -1 || index == INT_MAX) {
00725
return NULL;
00726 }
00727
00728 lv =
parseGetLevelValue(chunker, index);
00729
00730
if (!lv) {
00731
cdgPrintf(
CDG_ERROR,
"ERROR: can't get grapheme at %d\n", index);
00732 abort();
00733 }
00734
return lv->modifier->grapheme;
00735 }
00736
00737
00738
00739
00740
00741 ChunkType getFakeChunkType(
Chunker chunker,
int index)
00742 {
00743 String cat =
parseGetCategory(chunker, index);
00744
00745
if (cat == strRegister(
"NN") ||
00746 cat == strRegister(
"NE") ||
00747 cat == strRegister(
"FM") ||
00748 cat == strRegister(
"CARD") ||
00749 cat == strRegister(
"PDS") ||
00750 cat == strRegister(
"PPER") ||
00751 cat == strRegister(
"PRF") ||
00752 cat == strRegister(
"PRELS") ||
00753 cat == strRegister(
"PPOSS") ||
00754 cat == strRegister(
"PWS") ||
00755 cat == strRegister(
"PIS") ||
00756 cat == strRegister(
"ADJA") ||
00757 cat == strRegister(
"KON") ||
00758 cat == strRegister(
"TRUNC")) {
00759
return NChunk;
00760 }
00761
00762
if (cat == strRegister(
"VVINF") ||
00763 cat == strRegister(
"VVIZU") ||
00764 cat == strRegister(
"VVFIN") ||
00765 cat == strRegister(
"VVPP") ||
00766 cat == strRegister(
"VMINF") ||
00767 cat == strRegister(
"VMFIN") ||
00768 cat == strRegister(
"VMPP") ||
00769 cat == strRegister(
"VAINF") ||
00770 cat == strRegister(
"VAFIN") ||
00771 cat == strRegister(
"VAPP") ||
00772 cat == strRegister(
"PTKVZ") ||
00773 cat == strRegister(
"PTKZU")) {
00774
return VChunk;
00775 }
00776
00777
if (cat == strRegister(
"APPR") ||
00778 cat == strRegister(
"APPRART") ||
00779 cat == strRegister(
"APPO")) {
00780
return PChunk;
00781 }
00782
00783
if (cat == strRegister(
"KOKOM") ||
00784 cat == strRegister(
"PROAV") ||
00785 cat == strRegister(
"ADV") ||
00786 cat == strRegister(
"ADJD") ||
00787 cat == strRegister(
"KON")) {
00788
return NoChunk;
00789 }
00790
00791
return UnknownChunk;
00792 }
00793
00794
00795
00796
00797
00798
00799
00800
00801
00802
00803
00804
00805 Chunk mergeChunk(
Chunker chunker,
Chunk target,
Chunk source)
00806 {
00807 List l;
00808
00809
#ifdef DEBUG_GETCHUNKSAT
00810
cdgPrintf(
CDG_DEBUG,
"DEBUG: merging chunk %s <%s,%s> to target chunk %s <%s,%s>\n",
00811
chunkerStringOfChunkType(source), source->
from->
arc->word, source->
to->
arc->word,
00812
chunkerStringOfChunkType(target), target->
from->
arc->word, target->
to->
arc->word);
00813
#endif
00814
00815
00816
if (target->
to->
arc->to == source->
from->
arc->from) {
00817 target->
to = source->
to;
00818 }
00819
if (source->
to->
arc->to == target->
from->
arc->from) {
00820 target->
from = source->
from;
00821 }
00822
00823
for (l = source->
nodes; l; l = listNext(l)) {
00824 Arc arc = listElement(l);
00825 target->
nodes = listInsertSorted(target->
nodes, arc,
cmpGraphemes);
00826 }
00827
00828
00829
for (l = source->
subChunks; l; l = listNext(l)) {
00830
Chunk clone =
chunkerCloneChunk(listElement(l));
00831 target->
subChunks =
00832 listInsertSortedWithData(target->
subChunks, clone,
cmpChunks, chunker);
00833 }
00834
00835
return target;
00836 }
00837
00838
00839
00840
00841
00842
00843
00844
00845
00846 Chunk embedChunk(
Chunker chunker,
Chunk target,
Chunk source)
00847 {
00848 List l;
00849
00850
#ifdef DEBUG_GETCHUNKSAT
00851
cdgPrintf(
CDG_DEBUG,
00852
"DEBUG: embedding chunk %s <%s,%s> to target chunk %s <%s,%s>\n",
00853
chunkerStringOfChunkType(source),
00854 source->
from->
arc->word, source->
to->
arc->word,
00855
chunkerStringOfChunkType(target),
00856 target->
from->
arc->word, target->
to->
arc->word);
00857
#endif
00858
if (target->
to->
arc->to == source->
from->
arc->from) {
00859 target->
to = source->
to;
00860 }
00861
if (source->
to->
arc->to == target->
from->
arc->from) {
00862 target->
from = source->
from;
00863 }
00864
for (l = source->
nodes; l; l = listNext(l)) {
00865 Arc arc = listElement(l);
00866 target->
nodes = listInsertSorted(target->
nodes, arc,
cmpArcs);
00867 }
00868
00869 target->
subChunks =
00870 listInsertSortedWithData(target->
subChunks, source,
cmpChunks, chunker);
00871
00872
return target;
00873 }
00874
00875
00876
00877
00878
00879
00880
00881
00882 List
getFakeChunksAt(
Chunker chunker,
Chunk parent,
int index)
00883 {
00884 List chunks =
NULL;
00885 List children, leftChildren, rightChildren;
00886 List l;
00887
Chunk thisChunk, chunk;
00888
GraphemNode thisNode;
00889
00890
00891
void doit(List children) {
00892 List l, m;
00893
00894
for (l = children; l; l = listNext(l)) {
00895
int childIndex = (
int)listElement(l);
00896 String childLabel =
parseGetLabel(chunker, childIndex);
00897 List childChunks =
getFakeChunksAt(chunker, thisChunk, childIndex);
00898
00899
#ifdef DEBUG_GETCHUNKSAT
00900
cdgPrintf(
CDG_DEBUG,
"DEBUG: childChunks = ");
00901
for (m = childChunks; m; m = listNext(m)) {
00902 chunk = listElement(m);
00903
cdgPrintf(
CDG_DEBUG,
"%s <%s,%s> " ,
00904
chunkerStringOfChunkType(chunk),
00905 chunk->
from->
arc->word, chunk->
to->
arc->word);
00906 }
00907
cdgPrintf(
CDG_DEBUG,
"\n");
00908
#endif
00909
00910
00911
for (m = childChunks; m; m = listNext(m)) {
00912 chunk = listElement(m);
00913
00914
if ((thisChunk->
to->
arc->to == chunk->
from->
arc->from ||
00915 chunk->
to->
arc->to == thisChunk->
from->
arc->from) &&
00916 chunk->
parent == thisChunk &&
00917 childLabel != strRegister(
"GMOD") &&
00918 childLabel != strRegister(
"ADV")
00919 ) {
00920
00921
00922
if (thisChunk->
type ==
NChunk &&
00923 (chunk->
type ==
UnknownChunk ||
00924 chunk->
type ==
NChunk)) {
00925
#ifdef DEBUG_GETCHUNKSAT
00926
cdgPrintf(
CDG_DEBUG,
"DEBUG: appyling NC rule\n");
00927
#endif
00928
mergeChunk(chunker, thisChunk, chunk);
00929
chunkerChunkDelete(chunk);
00930
continue;
00931 }
00932
00933
00934
if (thisChunk->
type ==
PChunk) {
00935
00936
if(chunk->
type ==
UnknownChunk) {
00937
#ifdef DEBUG_GETCHUNKSAT
00938
cdgPrintf(
CDG_DEBUG,
"DEBUG: appyling PC rule for XCs\n");
00939
#endif
00940
00941
mergeChunk(chunker, thisChunk, chunk);
00942
chunkerChunkDelete(chunk);
00943
continue;
00944 }
00945
00946
00947
if (chunk->
type ==
NChunk) {
00948
#ifdef DEBUG_GETCHUNKSAT
00949
cdgPrintf(
CDG_DEBUG,
"DEBUG: appyling PC rule for NCs\n");
00950
#endif
00951
embedChunk(chunker, thisChunk, chunk);
00952
continue;
00953 }
00954
00955
00956
if (chunk->
type ==
PChunk && thisChunk->
from == thisChunk->
to) {
00957
#ifdef DEBUG_GETCHUNKSAT
00958
cdgPrintf(
CDG_DEBUG,
"DEBUG: appyling PC rule for PCs\n");
00959
#endif
00960
mergeChunk(chunker, thisChunk, chunk);
00961
chunkerChunkDelete(chunk);
00962
continue;
00963 }
00964 }
00965
00966
00967
if (thisChunk->
type ==
VChunk && chunk->
type ==
VChunk) {
00968
#ifdef DEBUG_GETCHUNKSAT
00969
cdgPrintf(
CDG_DEBUG,
"DEBUG: appyling VC rule\n");
00970
#endif
00971
mergeChunk(chunker, thisChunk, chunk);
00972
chunkerChunkDelete(chunk);
00973
continue;
00974 }
00975
00976
#ifdef DEBUG_GETCHUNKSAT
00977
cdgPrintf(
CDG_DEBUG,
"DEBUG: no rule applies\n");
00978
#endif
00979
}
00980
#ifdef DEBUG_GETCHUNKSAT
00981
else {
00982
cdgPrintf(
CDG_DEBUG,
"DEBUG: not adjacent\n");
00983 }
00984
#endif
00985
00986
00987
#ifdef DEBUG_GETCHUNKSAT
00988
cdgPrintf(
CDG_DEBUG,
"DEBUG: cannot add chunk %s <(%d,%d)%s,(%d,%d)%s> to target chunk %s <(%d,%d)%s,(%d,%d)%s>\n",
00989
chunkerStringOfChunkType(chunk),
00990 chunk->
from->
arc->from, chunk->
from->
arc->to,
00991 chunk->
from->
arc->word,
00992 chunk->
to->
arc->from, chunk->
to->
arc->to,
00993 chunk->
to->
arc->word,
00994
chunkerStringOfChunkType(thisChunk),
00995 thisChunk->
from->
arc->from, thisChunk->
from->
arc->to,
00996 thisChunk->
from->
arc->word,
00997 thisChunk->
to->
arc->from, thisChunk->
to->
arc->to,
00998 thisChunk->
to->
arc->word);
00999
#endif
01000
chunks = listInsertSortedWithData(chunks, chunk,
cmpChunks, chunker);
01001 }
01002 listDelete(childChunks);
01003 }
01004 }
01005
01006
01007
01008
01009
if (index == -1 || index == INT_MAX) {
01010
return NULL;
01011 }
01012
01013
01014 thisNode =
parseGetGrapheme(chunker, index);
01015 thisChunk =
newChunk(
getFakeChunkType(chunker, index));
01016 thisChunk->
parent = parent;
01017 thisChunk->
from = thisNode;
01018 thisChunk->
to = thisNode;
01019 thisChunk->
head = thisNode;
01020 thisChunk->
nodes = listAppendElement(
NULL, thisNode);
01021 children = parseGetModifiers(chunker->
parse, chunker->
mainlevel, index);
01022
01023
#ifdef DEBUG_GETCHUNKSAT
01024
cdgPrintf(
CDG_DEBUG,
"DEBUG: decending at %s\n",
01025 thisChunk->
from->
arc->word);
01026
#endif
01027
01028
01029
01030
if (!children) {
01031
#ifdef DEBUG_GETCHUNKSAT
01032
cdgPrintf(
CDG_DEBUG,
"DEBUG: returning from %s (leaf reached)\n",
01033 thisChunk->
from->
arc->word);
01034
#endif
01035
return listAppendElement(
NULL, thisChunk);
01036 }
01037
01038
01039 leftChildren = rightChildren =
NULL;
01040
for (l = children; l; l = listNext(l)) {
01041
int childIndex = (
int)listElement(l);
01042
GraphemNode gn =
parseGetGrapheme(chunker, childIndex);
01043
01044
01045
if (thisChunk->
from->
arc->from >= gn->
arc->to) {
01046 leftChildren = listAppendElement(leftChildren, (Pointer)childIndex);
01047 }
01048
01049
01050
else {
01051 rightChildren = listAppendElement(rightChildren, (Pointer)childIndex);
01052 }
01053 }
01054 listDelete(children);
01055
01056
01057
01058 {
01059 List reverseList =
NULL;
01060
01061
#if 0
01062
reverseList = listReverse(leftChildren);
01063
#else
01064
for (l = leftChildren; l; l = listNext(l)) {
01065 reverseList = listPrependElement(reverseList, listElement(l));
01066 }
01067
#endif
01068
listDelete(leftChildren);
01069 leftChildren = reverseList;
01070 }
01071
01072 doit(leftChildren);
01073 doit(rightChildren);
01074
01075
01076 chunks = listInsertSortedWithData(chunks, thisChunk,
cmpChunks, chunker);
01077
01078
#ifdef DEBUG_GETCHUNKSAT
01079
cdgPrintf(
CDG_DEBUG,
"DEBUG: returning from %s\n",
01080 thisChunk->
from->
arc->word);
01081
#endif
01082
return chunks;
01083 }
01084
01085
01086
01087
01088
01089
01090
01091 void postProcessChunks(
Chunker chunker, List inputList)
01092 {
01093 List l;
01094
Chunk chunk;
01095 String cat;
01096
01097
for (l = inputList; l; l = listNext(l)) {
01098 chunk = listElement(l);
01099 cat =
getCategory(chunk->
from);
01100
if ((chunk->
from == chunk->
to
01101 && (cat == strRegister(
"PTKZU") ||
01102 cat == strRegister(
"KON")))
01103 || chunk->
type ==
UnknownChunk) {
01104
#if 0
01105
cdgPrintf(
CDG_DEBUG,
"DEBUG: filtered chunk %s <%s, %s>\n",
01106
chunkerStringOfChunkType(chunk),
01107 chunk->
from->
arc->word, chunk->
to->word);
01108
#endif
01109
chunk->
type =
NoChunk;
01110 }
01111 }
01112 }
01113
01114
01115
01116
01117
01118
01119
01120 List
getFakeChunks(
Chunker chunker)
01121 {
01122 List chunks =
NULL;
01123 List result =
NULL;
01124 List l, m, roots;
01125
int index;
01126
01127 roots =
parseGetRoots(chunker);
01128
for (l = roots; l; l = listNext(l)) {
01129 index = (
int)listElement(l);
01130 chunks =
getFakeChunksAt(chunker,
NULL, index);
01131
postProcessChunks(chunker, chunks);
01132
for (m = chunks; m; m = listNext(m)) {
01133 result =
01134 listInsertSortedWithData(result, listElement(m),
cmpChunks, chunker);
01135 }
01136 listDelete(chunks);
01137 }
01138
01139 listDelete(roots);
01140
return result;
01141 }
01142
01143
01144
01145
01146
01147
01148
01149
01150
01151
01152 List
getChunks(
Chunker chunker)
01153 {
01154
int i, n;
01155 List l;
01156 List chunks =
NULL;
01157 String str;
01158
char buffer[MAXBUFFER];
01159 List outputs =
NULL;
01160 String bufptr;
01161 String token;
01162 Vector gns;
01163
01164
01165
#ifdef DEBUG_GETCHUNKS
01166
cdgPrintf(
CDG_DEBUG,
"DEBUG: writing to chunker:\n");
01167
#endif
01168
for (i = 0; i < vectorSize(chunker->
lg->
graphemnodes); i++) {
01169
GraphemNode gn = vectorElement(chunker->
lg->
graphemnodes, i);
01170 String cat =
getCategory(gn);
01171 str = strPrintf(
"%s\t%s\n", gn->
arc->word, cat);
01172 n = strlen(str);
01173
#ifdef DEBUG_GETCHUNKS
01174
cdgPrintf(
CDG_DEBUG, str);
01175
#endif
01176
if(write(chunker->
pipe1[1], str, n) != n) {
01177
cdgPrintf(
CDG_ERROR,
"ERROR: while writing %s(%d) to chunker: %s\n",
01178 str, n, strerror(errno));
01179 }
01180
cdgFreeString(str);
01181 }
01182
#ifdef DEBUG_GETCHUNKS
01183
cdgPrintf(
CDG_DEBUG,
"\n");
01184
#endif
01185
01186
01187 str = strRegister(
"\n");
01188
if(write(chunker->
pipe1[1], str, 1) != 1) {
01189
cdgPrintf(
CDG_ERROR,
"ERROR: while writing a newline to chunker: %s\n",
01190 strerror(errno));
01191 }
01192
cdgFreeString(str);
01193
01194 close(chunker->
pipe1[1]);
01195
01196
01197
for (i = 0; i < vectorSize(chunker->
lg->
graphemnodes); i++) {
01198 n = read(chunker->
pipe2[0], buffer, MAXBUFFER);
01199
if(n < 0) {
01200
cdgPrintf(
CDG_ERROR,
"ERROR: while reading from chunker: %s\n",
01201 strerror(errno));
01202
break;
01203 }
01204
if (n == 0)
01205
break;
01206 buffer[n] =
'\0';
01207 outputs = listAppendElement(outputs, strRegister(buffer));
01208
#if 0
01209
cdgPrintf(
CDG_DEBUG,
"DEBUG: reading\n%s\n", buffer);
01210
#endif
01211
}
01212 close(chunker->
pipe2[0]);
01213
01214
if(!outputs) {
01215
return NULL;
01216 }
01217
01218
01219 str = strCopy(strFromList(outputs));
01220 listForEachDelete(outputs,
cdgFreeString);
01221
01222
01223 bufptr = str;
01224 outputs =
NULL;
01225 token = strtok_r(str,
"\n", &bufptr);
01226
while (token) {
01227 outputs = listAppendElement(outputs, strRegister(token));
01228 token = strtok_r(
NULL,
"\n", &bufptr);
01229 }
01230 memFree(str);
01231
01232
#if 0
01233
for (l = outputs; l; l = listNext(l)) {
01234
cdgPrintf(
CDG_DEBUG,
"DEBUG: %s\n", listElement(l));
01235 }
01236
#endif
01237
01238
01239 gns = vectorNew(vectorSize(chunker->
lg->
graphemnodes));
01240
for (i = 0; i < vectorSize(chunker->
lg->
graphemnodes); i++) {
01241
GraphemNode gn = vectorElement(chunker->
lg->
graphemnodes, i);
01242 vectorSetElement(gns, gn, gn->
arc->from);
01243 }
01244
01245
01246
01247
01248
01249
for (l = outputs; l; l = listNext(l)) {
01250
int from = -1, to = -1, head = -1;
01251
char tag[3];
01252
Chunk chunk =
NULL;
01253
01254
01255
if (sscanf(listElement(l),
"%s\t%2s\t%d\t%d\t%d",
01256 buffer, tag, &from, &to, &head) != 5) {
01257
cdgPrintf(
CDG_WARNING,
"WARNING: unexpected line format '%s'\n",
01258 listElement(l));
01259
continue;
01260 }
01261
01262 from--;
01263 to--;
01264 head--;
01265
01266
#ifdef DEBUG_GETCHUNKS
01267
cdgPrintf(
CDG_DEBUG,
"DEBUG: %s\t%s\t%d\t%d\t%d\n",
01268 buffer, tag, from, to, head);
01269
#endif
01270
01271 chunk =
findChunk(chunks, from, to);
01272
if (!chunk) {
01273
01274 chunk =
newChunk(
chunkerChunkTypeOfString(tag));
01275 chunks = listAppendElement(chunks, chunk);
01276 chunk->
from = vectorElement(gns, from);
01277 chunk->
to = vectorElement(gns, to);
01278 chunk->
head = vectorElement(gns, head);
01279
for (i = from; i <= to; i++) {
01280 chunk->
nodes = listAppendElement(chunk->
nodes, vectorElement(gns, i));
01281 }
01282
continue;
01283 }
01284
01285
01286
if (chunk->
from->
arc->from == from && chunk->
to->
arc->from == to)
01287
continue;
01288
01289
01290
if (chunk->
from->
arc->from <= from &&
01291 chunk->
to->
arc->from >= to) {
01292
Chunk subChunk =
newChunk(
chunkerChunkTypeOfString(tag));
01293 subChunk->
from = vectorElement(gns, from);
01294 subChunk->
to = vectorElement(gns, to);
01295 subChunk->
head = vectorElement(gns, head);
01296
for (i = from; i <= to; i++) {
01297 subChunk->
nodes = listAppendElement(subChunk->
nodes, vectorElement(gns, i));
01298 }
01299
#ifdef DEBUG_GETCHUNKS
01300
cdgPrintf(
CDG_DEBUG,
01301
"DEBUG: creating a subchunk <%s:%d-%d> inside <%s:%d-%d>\n",
01302 tag, from, to,
01303
chunkerStringOfChunkType(chunk),
01304 chunk->
from->
arc->from, chunk->
to->
arc->from);
01305
#endif
01306
chunk->
subChunks =
01307 listInsertSortedWithData(chunk->
subChunks, subChunk,
cmpChunks, chunker);
01308
continue;
01309 }
01310
01311
01312
cdgPrintf(
CDG_ERROR,
"ERROR: programming error\n");
01313 abort();
01314 }
01315
01316
01317 listForEachDelete(outputs,
cdgFreeString);
01318 vectorDelete(gns);
01319
01320
return chunks;
01321 }
01322
01323
01324
01325
01326
01327
01328
01329
01330
01331 int countChunks(List chunks)
01332 {
01333 List l;
01334
int counter = 0;
01335
01336
for (l = chunks; l; l = listNext(l)) {
01337
Chunk chunk = listElement(l);
01338
if (chunk->
type ==
NChunk ||
01339 chunk->
type ==
PChunk ||
01340 chunk->
type ==
VChunk) {
01341 counter++;
01342 }
01343 counter +=
countChunks(chunk->
subChunks);
01344 }
01345
01346
return counter;
01347 }
01348
01349
01350
01351
01352
01353
01354
01355
01356
01357
01358
01359
01360
01361
01362
01363
01364
01365
01366 Chunk findChunk(List chunks,
int from,
int to)
01367 {
01368 List l;
01369
01370
for (l = chunks; l; l = listNext(l)) {
01371
Chunk chunk = listElement(l);
01372
01373
01374
if (chunk->
from->
arc->from == from && chunk->
to->
arc->from == to) {
01375
#if DEBUG_FINDCHUNK
01376
cdgPrintf(
CDG_DEBUG,
"DEBUG: found in ");
01377
printChunk(
CDG_DEBUG, chunk);
01378
cdgPrintf(
CDG_DEBUG,
"\n");
01379
#endif
01380
return chunk;
01381 }
01382
01383
01384
if (chunk->
from->
arc->from <= from && chunk->
to->
arc->from >= to) {
01385
Chunk subChunk;
01386
01387
#if DEBUG_FINDCHUNK
01388
cdgPrintf(
CDG_DEBUG,
"DEBUG: searching sub chunk\n");
01389
#endif
01390
subChunk =
findChunk(chunk->
subChunks, from, to);
01391
if (subChunk) {
01392
#if DEBUG_FINDCHUNK
01393
cdgPrintf(
CDG_DEBUG,
"DEBUG: found in sub chunk ");
01394
printChunk(
CDG_DEBUG, subChunk);
01395
cdgPrintf(
CDG_DEBUG,
"\n");
01396
#endif
01397
return subChunk;
01398 }
else {
01399
#if DEBUG_FINDCHUNK
01400
cdgPrintf(
CDG_DEBUG,
"DEBUG: found, no sub chunk, using ");
01401
printChunk(
CDG_DEBUG, chunk);
01402
cdgPrintf(
CDG_DEBUG,
"\n");
01403
#endif
01404
return chunk;
01405 }
01406 }
01407 }
01408
01409
#if DEBUG_FINDCHUNK
01410
cdgPrintf(
CDG_DEBUG,
"DEBUG: not found\n");
01411
#endif
01412
01413
return NULL;
01414 }
01415
01416
01417
01418
01419
01420
01421
01422
01423 void printChunk(
unsigned long mode,
Chunk chunk)
01424 {
01425
GraphemNode from, to;
01426 List l,ll;
01427 List subChunks =
NULL;
01428
if (!chunk)
01429
return;
01430
01431
if (chunk->
type !=
NoChunk)
01432
cdgPrintf(mode,
"[%s ",
chunkerStringOfChunkType(chunk));
01433
01434
01435
if (chunk->
subChunks) {
01436 subChunks = chunk->
subChunks;
01437 from = ((
Chunk)listElement(subChunks))->from;
01438 to = ((
Chunk)listLastElement(subChunks))->to;
01439 }
else {
01440 from = to =
NULL;
01441 }
01442
01443
01444 l = chunk->
nodes;
01445
while (l) {
01446
GraphemNode gn = listElement(l);
01447
01448
01449
if (!subChunks ||
01450 gn->
arc->to <= from->
arc->from || gn->
arc->from >= to->
arc->to) {
01451 List cats =
01452
#if 0
01453
getCategories(gn);
01454
#else
01455
listAppendElement(
NULL,
getCategory(gn));
01456
#endif
01457
cdgPrintf(mode,
"%s%s/",
01458 (gn == chunk->
head && chunk->
type !=
NoChunk)?
"*":
"", gn->
arc->word);
01459
for (ll = cats; ll; ll = listNext(ll)) {
01460 String cat = listElement(ll);
01461
cdgPrintf(mode,
"%s%s", cat, listNext(ll)?
"/":
"");
01462 }
01463 listDelete(cats);
01464 l = listNext(l);
01465 }
else {
01466
Chunk subChunk = listElement(subChunks);
01467 subChunks = listNext(subChunks);
01468
if (subChunks) {
01469 from = ((
Chunk)listElement(subChunks))->from;
01470 }
else {
01471 from = to =
NULL;
01472 }
01473
printChunk(mode, subChunk);
01474
for (; l; l = listNext(l)) {
01475 gn = listElement(l);
01476
if (gn->
arc->from >= subChunk->
to->
arc->to)
01477
break;
01478 }
01479 }
01480
01481
if (l)
01482
cdgPrintf(mode,
" ");
01483 }
01484
01485
if (chunk->
type !=
NoChunk)
01486
cdgPrintf(mode,
"]");
01487 }
01488
01489
01490
01491
01492
01493
01494
01495 void chunkerPrintChunks(
unsigned long mode, List chunks)
01496 {
01497 List l;
01498
01499
if (!chunks) {
01500
return;
01501 }
01502
01503
for (l = chunks; l; l = listNext(l)) {
01504
printChunk(mode, listElement(l));
01505
cdgPrintf(mode,
" ");
01506 }
01507
cdgPrintf(mode,
"\n");
01508 }
01509
01510
01511
01512
01513 ChunkType chunkerChunkTypeOfString(String tag)
01514 {
01515
if (strcmp(tag,
"NC") == 0) {
01516
return NChunk;
01517 }
01518
if (strcmp(tag,
"VC") == 0) {
01519
return VChunk;
01520 }
01521
if (strcmp(tag,
"PC") == 0) {
01522
return PChunk;
01523 }
01524
return NoChunk;
01525 }
01526
01527
01528
01529
01530 String
chunkerStringOfChunkType(
Chunk chunk)
01531 {
01532
if (!chunk) {
01533
return strRegister(
"(null)");
01534 }
01535
switch(chunk->
type) {
01536
case NChunk:
return strRegister(
"NC");
01537
case VChunk:
return strRegister(
"VC");
01538
case PChunk:
return strRegister(
"PC");
01539
default:
return strRegister(
"XC");
01540 }
01541 }
01542
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553 List
chunkerChunk(
Chunker chunker)
01554 {
01555 List annoChunks =
NULL;
01556 List l;
01557
01558
void backlink(
Chunk chunk) {
01559 List l;
01560
for (l = chunk->
nodes; l; l = listNext(l)) {
01561
GraphemNode gn = listElement(l);
01562 gn->
chunk = chunk;
01563 }
01564
for (l = chunk->
subChunks; l; l = listNext(l)) {
01565 backlink(listElement(l));
01566 }
01567 }
01568
01569
if (!chunker) {
01570
return NULL;
01571 }
01572
01573
switch (chunker->
mode) {
01574
case RealChunker:
01575 chunker->
chunks =
getChunks(chunker);
01576
break;
01577
case FakeChunker:
01578 chunker->
chunks =
getFakeChunks(chunker);
01579
break;
01580
case EvalChunker:
01581 chunker->
chunks =
getChunks(chunker);
01582 annoChunks =
getFakeChunks(chunker);
01583
evalChunker(chunker, annoChunks);
01584
cdgPrintf(
CDG_INFO,
"\n");
01585 listForEachDelete(annoChunks,
chunkerChunkDelete);
01586
break;
01587
default:
01588
cdgPrintf(
CDG_ERROR,
"ERROR: unknown chunker mode\n");
01589
return NULL;
01590 }
01591
01592
if (!chunker->
chunks)
01593
return NULL;
01594
01595
01596
if (chunker->
lg->
chunks) {
01597 listForEachDelete(chunker->
lg->
chunks,
chunkerChunkDelete);
01598 chunker->
lg->
chunks =
NULL;
01599 }
01600
01601
01602
for (l = chunker->
chunks; l; l = listNext(l)) {
01603
Chunk chunk =
chunkerCloneChunk(listElement(l));
01604 chunker->
lg->
chunks = listAppendElement(chunker->
lg->
chunks, chunk);
01605
chunkerReplaceGraphemes(chunk, chunker->
lg);
01606 backlink(chunk);
01607 }
01608
01609
return chunker->
chunks;
01610 }
01611
01612
01613
01614
01615
01616
01617
01618
01619
01620
01621 Boolean
compareChunks(
Chunk c1,
Chunk c2)
01622 {
01623 List l, ll;
01624
01625
if ((c1 && !c2) || (!c1 && c2) ||
01626 c1->
from->
arc->from != c2->
from->
arc->from ||
01627 c1->
from->
arc->to != c2->
from->
arc->to ||
01628 c1->
from->
arc->word != c2->
from->
arc->word ||
01629 c1->
to->
arc->from != c2->
to->
arc->from ||
01630 c1->
to->
arc->to != c2->
to->
arc->to ||
01631 c1->
to->
arc->word != c2->
to->
arc->word || c1->
type != c2->
type) {
01632
return FALSE;
01633 }
01634
01635
for (l = c1->
subChunks, ll = c2->
subChunks; l && ll;
01636 l = listNext(l), ll = listNext(ll)) {
01637
if (!
compareChunks(listElement(l), listElement(ll))) {
01638
return FALSE;
01639 }
01640 }
01641
01642
if (!l && !ll)
01643
return TRUE;
01644
else
01645
return FALSE;
01646 }
01647
01648
01649
01650
01651
01652
01653
01654
01655 int evalChunker(
Chunker chunker, List annoChunks)
01656 {
01657
int noChunks =
countChunks(chunker->
chunks);
01658
int noAnnoChunks =
countChunks(annoChunks);
01659
int noErrors = 0, noUnChunked = 0;
01660 List l;
01661 List errorChunks =
NULL;
01662 List unChunked =
NULL;
01663
01664
if (noChunks != noAnnoChunks) {
01665
cdgPrintf(
CDG_WARNING,
"\nWARNING: got %d chunk(s) but was expecting %d\n",
01666 noChunks, noAnnoChunks);
01667 }
01668
01669
01670
for (l = chunker->
chunks; l; l = listNext(l)) {
01671
Chunk chunk = listElement(l);
01672
Chunk annoChunk =
findChunk(annoChunks, chunk->
from->
arc->from,
01673 chunk->
to->
arc->from);
01674
01675
if (chunk->
type ==
NoChunk)
01676
continue;
01677
01678
if (!
compareChunks(chunk, annoChunk))
01679 errorChunks = listAppendElement(errorChunks, chunk);
01680 }
01681
01682 noErrors = listSize(errorChunks);
01683
if (noErrors) {
01684
cdgPrintf(
CDG_WARNING,
"\nWARNING: got %d erroneous chunk(s):\n", noErrors);
01685
for (l = errorChunks; l; l = listNext(l)) {
01686
Chunk chunk = listElement(l);
01687
01688
cdgPrintf(
CDG_WARNING,
" ");
01689
printChunk(
CDG_WARNING, chunk);
01690
cdgPrintf(
CDG_WARNING,
"\n");
01691 }
01692 }
01693
01694
01695
for (l = annoChunks; l; l = listNext(l)) {
01696
Chunk annoChunk = listElement(l);
01697
Chunk chunk =
findChunk(chunker->
chunks, annoChunk->
from->
arc->from,
01698 annoChunk->
to->
arc->from);
01699
01700
if (annoChunk->
type ==
NoChunk)
01701
continue;
01702
01703
if (annoChunk->
type !=
NChunk &&
01704 annoChunk->
type !=
VChunk && annoChunk->
type !=
PChunk)
01705
continue;
01706
01707
if (!
compareChunks(chunk, annoChunk))
01708 unChunked = listAppendElement(unChunked, annoChunk);
01709 }
01710
01711 noUnChunked = listSize(unChunked);
01712
if (noUnChunked) {
01713
cdgPrintf(
CDG_WARNING,
"\nWARNING: %d chunk(s) not found:\n", noUnChunked);
01714
for (l = unChunked; l; l = listNext(l)) {
01715
Chunk chunk = listElement(l);
01716
01717
cdgPrintf(
CDG_WARNING,
" ");
01718
printChunk(
CDG_WARNING, chunk);
01719
cdgPrintf(
CDG_WARNING,
"\n");
01720 }
01721 }
01722
01723
if (!noErrors && !noUnChunked) {
01724
cdgPrintf(
CDG_INFO,
"\nINFO: chunker agrees with annotations.\n",
01725 noChunks);
01726 }
01727
01728
cdgPrintf(
CDG_INFO,
"\nINFO: got %d answers of which %d where correct, having %d keys\n",
01729 noChunks, noChunks - noErrors, noAnnoChunks);
01730
01731 listDelete(errorChunks);
01732 listDelete(unChunked);
01733
01734
return noErrors + noUnChunked;
01735 }
01736
01737
01738
01739
01740
01741
01742
01743
01744
01745 GraphemNode findGrapheme(
LexemGraph lg,
GraphemNode old)
01746 {
01747
int i;
01748
01749
if (!old)
01750
return NULL;
01751
01752
for (i = 0; i < vectorSize(lg->
graphemnodes); i++) {
01753
GraphemNode gn = vectorElement(lg->
graphemnodes, i);
01754
if (gn->
arc->from == old->
arc->from &&
01755 gn->
arc->to == old->
arc->to) {
01756
return gn;
01757 }
01758 }
01759
01760
return NULL;
01761 }
01762
01763
01764
01765
01766
01767
01768
01769
01770
01771 void chunkerReplaceGraphemes(
Chunk chunk,
LexemGraph lg)
01772 {
01773 List l;
01774
01775
if (!chunk)
01776
return;
01777
01778 chunk->
from =
findGrapheme(lg, chunk->
from);
01779 chunk->
to =
findGrapheme(lg, chunk->
to);
01780 chunk->
head =
findGrapheme(lg, chunk->
head);
01781
for (l = chunk->
nodes; l; l = listNext(l)) {
01782 listSetElement(l,
findGrapheme(lg, listElement(l)));
01783 }
01784
01785
for (l = chunk->
subChunks; l; l = listNext(l)) {
01786
chunkerReplaceGraphemes(listElement(l), lg);
01787 }
01788 }
01789
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799
01800
01801
01802
01803 Boolean
chunkerCommandValidate(String name, String value, String *var)
01804 {
01805
int i;
01806
01807
01808
if (name != strRegister(
"chunkerCommand")) {
01809
cdgPrintf(
CDG_ERROR,
"ERROR: chunkerCommand callback does not handle %s\n",
01810 name);
01811
return FALSE;
01812 }
01813
01814
01815
chunkerArgs =
NULL;
01816
if (
chunkerArgs) {
01817
for(i = 0;
chunkerArgs[i]; i++) {
01818
cdgFreeString(
chunkerArgs[i]);
01819 }
01820 memFree(
chunkerArgs);
01821
chunkerArgs =
NULL;
01822 }
01823
01824
01825 {
01826 List args, l;
01827 String myvalue = strCopy(value);
01828 String start, end;
01829
01830
01831 args =
NULL;
01832
for (start = myvalue; *start; start = end) {
01833
01834
01835
for (;isspace((
int)*start); start++);
01836
01837
01838
if (*start ==
'"') {
01839 start++;
01840
01841
01842
for (end = start; *end && *end !=
'"'; end++);
01843
01844
if (!*end) {
01845
cdgPrintf(
CDG_ERROR,
"ERROR: unbalanced quotes in taggerCommand\n");
01846
return FALSE;
01847 }
01848
01849
01850 *end =
'\0';
01851 end++;
01852
01853 }
01854
01855
01856
else {
01857
01858
01859
for (end = start; *end && !isspace((
int)*end) && *end !=
'"'; end++);
01860
01861
if (*end ==
'"') {
01862
cdgPrintf(
CDG_ERROR,
"ERROR: unbalanced quotes in taggerCommand\n");
01863
return FALSE;
01864 }
01865
01866
01867
01868
if (*end) {
01869 *end =
'\0';
01870 end++;
01871 }
01872 }
01873
if (start != end) {
01874 args = listAppendElement(args, start);
01875 }
01876 }
01877
01878
01879
chunkerArgs = (
char **)memMalloc(
sizeof(
char *)*(listSize(args)+1));
01880
for (l = args, i = 0; l; l = listNext(l), i++) {
01881
chunkerArgs[i] = strRegister(listElement(l));
01882 }
01883
chunkerArgs[i] =
NULL;
01884 listDelete(args);
01885 memFree(myvalue);
01886 }
01887
01888
#if 0
01889
fprintf(stderr,
"chunkerArgs = \n");
01890
for (i = 0;
chunkerArgs[i]; i++) {
01891 fprintf(stderr,
"%d: <%s>\n", i,
chunkerArgs[i]);
01892 }
01893 fprintf(stderr,
"\n");
01894
#endif
01895
01896
01897
return TRUE;
01898 }
01899
01900
01901
01902
01903
01904
01905
01906
01907
01908 void chunkerInitialize(
void)
01909 {
01910 setRegister(
"chunker", SET_BOOL, &
chunkerUseChunker,
NULL,
NULL,
NULL,
NULL);
01911
01912 setRegister(
"chunkerMode", SET_ENUM, &
chunkerMode,
01913
NULL,
NULL,
NULL,
01914
"fake",
FakeChunker,
01915
"real",
RealChunker,
"eval",
EvalChunker,
NULL);
01916
01917
chunkerCommand = strRegister(
"");
01918 setRegister(
"chunkerCommand", SET_STRING, &
chunkerCommand,
01919
NULL, &
chunkerCommandValidate,
NULL,
NULL);
01920 }
01921
01922
01923
01924
01925
01926
01927
01928
01929
/*
 * Counterpart of chunkerInitialize(). Currently performs no work.
 */
void chunkerFinalize(void)
{
}
01933
01934
01935
01936