00001 /* 00002 * Copyright (C) 1997-2004 The CDG Team <cdg@nats.informatik.uni-hamburg.de> 00003 * 00004 * This file is free software; as a special exception the author gives 00005 * unlimited permission to copy and/or distribute it, with or without 00006 * modifications, as long as this notice is preserved. 00007 * 00008 * This program is distributed in the hope that it will be useful, but 00009 * WITHOUT ANY WARRANTY, to the extent permitted by law; without even the 00010 * implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 00011 * 00012 */ 00013 00014 /* --------------------------------------------------------------------------- 00015 * @addtogroup Lexemgraph 00016 * 00017 * $Id: lexemgraph.h,v 1.68 2004/09/01 14:01:31 micha Exp $ 00018 */ 00019 00020 #ifndef LEXEMGRAPH_H 00021 #define LEXEMGRAPH_H 00022 00023 /* ------------------------------------------------------------------------ */ 00024 #include <cdg.h> 00025 #include <input.h> 00026 00027 /* ------------------------------------------------------------------------ */ 00028 00029 #ifndef SWIG 00030 /* ---------------------------------------------------------------------------- 00031 A LexemGraph is a word graph enriched with lexical information. In 00032 particular, it contains several lexemes for each arc of the Lattice 00033 that is lexically ambiguous. Each of these pairs of lexical entry and time 00034 span is called a lexeme node. The field \a lattice points to the 00035 underlying word graph. 00036 00037 The Vectors \a graphemnodes and \a nodes contains all grapheme nodes 00038 and lexeme nodes. Grapheme nodes are an intermediary data structure between 00039 word arcs and lexeme nodes that is not strictly necessary. 00040 00041 The fields \a min and \a max correspond to the fields with the 00042 same names in the Lattice. 00043 00044 The field \a distance holds an Array of the distance between any two 00045 lexeme nodes. The distance is measured in words, hence any two adjacent 00046 lexeme nodes have distance~1. The distance array is also used to check 00047 whether two lexeme nodes are compatible with each other, i.e. whether there 00048 is a path through the lexeme graph that includes them both. 00049 00050 The fields \a noOfPathsFromStart and \a noOfPathsToEnd hold arrays 00051 that map each grapheme node to the number of complete paths from the start 00052 or to the end of the entire graph that go through it. This is used to 00053 determine whether a lexeme node can be deleted or not. 00054 00055 The field \a noOfPaths holds the total number of paths from start to end 00056 possible in the word graph. This should reflect the state of the Vector 00057 \a isDeletedNode. 00058 00059 \b WARNING: The last three fields use GNU's \c long \c long \c int type 00060 as a cheap way to get 64-bit integers because the number of paths in 00061 realistic word graphs really does need that. However, the tool SWIG cannot 00062 deal with this type, and as a result XCDG cannot access the 00063 LexemGraph structure at all. Several functions in this library exist 00064 only to work around this restriction. Ultimately these fields should be 00065 converted to a proper Bigint type. 00066 00067 The Vector \a isDeletedNode marks those lexeme nodes that have been 00068 deleted and should be ignored. For instance, when an LV is added to a 00069 partial solution, all lexeme nodes that are not lgCompatibleNodes() 00070 to its lexeme nodes should be marked as deleted. 00071 */ 00072 typedef struct { 00073 Lattice lattice; /**< underlying word graph */ 00074 Vector graphemnodes; /**< vector of grapheme nodes */ 00075 Vector nodes; /**< vector of lexeme nodes */ 00076 int min; /**< minimum start position */ 00077 int max; /**< maximum end position */ 00078 00079 /** the following parts are considered dynamic */ 00080 00081 Array distance; /**< matrix of distances */ 00082 long long *noOfPathsFromStart; /**< vector of numbers of paths */ 00083 long long *noOfPathsToEnd; /**< vector of numbers of paths */ 00084 long long noOfPaths; /**< total number of paths through graph */ 00085 ByteVector isDeletedNode; /**< vector of boolean flags */ 00086 Vector tags; /**< set of POS tags */ 00087 List chunks; /**< set of all chunks of the lattice */ 00088 } LexemGraphStruct; 00089 typedef LexemGraphStruct *LexemGraph; 00090 00091 /* ---------------------------------------------------------------------------- 00092 A grapheme node represents the hypothesis of a specific phonetic form for a 00093 specific time interval. Thus, it corresponds biuniquely to an Arc in 00094 the underlying Lattice. However, it also holds information about the 00095 state of processing by a particular grammar. 00096 00097 The field \a no is an index into the field \a graphemnodes of the 00098 enclosing lexeme graph. 00099 00100 The field \a lexemgraph points to this graph. 00101 00102 The field \a arcpoints to the corresponding Arc. 00103 00104 The field \a lexicalEntries contains a List of all known lexical entries 00105 with the same phonetic form as the word on the Arc. 00106 00107 The field \a lexemes contains all lexeme nodes built from the elements 00108 of \a lexicalEntries. 00109 00110 The field \a ambiguity counts how many of these lexeme nodes are 00111 currently undeleted. 00112 */ 00113 struct GraphemNodeStruct { 00114 int no; /**< index in lg->graphemnodes */ 00115 LexemGraph lexemgraph; /**< pointer back to the lexem graph */ 00116 Arc arc; /**< arc in word lattice */ 00117 List lexemes; /**< list of disambiguated lexeme nodes */ 00118 Boolean live; /**< are there any lexemes left undeleted? */ 00119 Chunk chunk; /**< to which chunk do we belong */ 00120 }; 00121 #else 00122 typedef struct LexemGraphStruct *LexemGraph; 00123 #endif 00124 00125 /* ---------------------------------------------------------------------------- 00126 A LexemNode represents the hypothesis of a specific lexical variant 00127 for a specific time interval. 00128 00129 The field \a no is the index of the lexeme node in the field 00130 \a nodes of the enclosing lexeme graph. 00131 00132 The field \a lexemgraph point to the enclosing lexeme graph. 00133 00134 The field \a arc points to the underlying Arc. 00135 00136 The field \a lexem points to the lexical entry postulated. 00137 00138 The field \a grapheme points to the grapheme node used to build the 00139 lexeme node. 00140 00141 The field \a limit corresponds to the field \a limit in an LV: a 00142 lexeme node with limit \a x can only appear in solutions not better than 00143 \a x (see Frobbing) 00144 */ 00145 struct LexemNodeStruct { 00146 int no; /**< index in LexemGraph::nodes */ 00147 LexemGraph lexemgraph; /**< pointer back to the lexem graph */ 00148 Arc arc; /**< arc in word lattice */ 00149 LexiconItem lexem; /**< lexical entry */ 00150 GraphemNode grapheme; /**< pointer to the original grapheme */ 00151 Number tagscore; /**< Tagger: probability of the associated 00152 * category */ 00153 Number limit; /**< limit calculated during frobbing */ 00154 }; 00155 00156 /* ---------------------------------------------------------------------- */ 00157 extern LexemGraph lgNew(Lattice lat); 00158 00159 #ifndef SWIG 00160 extern Boolean lgCompactLVs; 00161 00162 extern Boolean lgAreDeletableNodes(LexemGraph lg, List lexemes); 00163 extern Boolean lgAreDeletedNodes(LexemGraph lg, List lexemes); 00164 extern Boolean lgCompatibleNodes(LexemGraph lg, LexemNode a, LexemNode b); 00165 extern Boolean lgCompatibleSets(LexemGraph lg, List a, List b); 00166 extern Boolean lgContains(LexemGraph lg, String form); 00167 extern Boolean lgCopySelection(LexemGraph destination, LexemGraph source); 00168 extern Boolean lgForbiddenBy(LexemGraph lg, LexemNode a, List b); 00169 extern Boolean lgIntersectingSets(List a, List b); 00170 extern Boolean lgIsDeletedNode(LexemGraph lg, LexemNode n); 00171 extern Boolean lgIsEndNode(GraphemNode n); 00172 extern Boolean lgIsStartNode(GraphemNode n); 00173 extern Boolean lgLexemeInLexemNodeList(LexiconItem le, List list); 00174 extern Boolean lgMayModify(LexemGraph lg, GraphemNode down, GraphemNode up); 00175 extern Boolean lgMember(LexemNode ln, List lexemes); 00176 extern Boolean lgNewFinal(LexemGraph lg); 00177 extern Boolean lgNewIter(LexemGraph lg, Arc arc); 00178 extern Boolean lgOverlap(LexemNode a, LexemNode b); 00179 extern Boolean lgSpuriousUppercase(LexemGraph lg, Arc arc); 00180 extern Boolean lgSubset(List a, List b); 00181 extern Boolean lgUpdateArcs(LexemGraph lg, Lattice lat, List listArcs); 00182 extern LexemGraph lgClone(LexemGraph lg); 00183 extern LexemGraph lgNewInit(); 00184 extern List lgMakePath(LexemGraph lg, List nodes); 00185 extern List lgMostProbablePath(LexemGraph lg); 00186 extern List lgPartitions(GraphemNode gn, BitString features); 00187 extern List lgQueryCat(LexemGraph lg, GraphemNode gn); 00188 extern int lgDistanceOfNodes(LexemGraph lg, LexemNode a, LexemNode b); 00189 extern int lgWidth(LexemGraph lg); 00190 extern long long computeNoOfPathsFromStart(LexemGraph lg, GraphemNode gn, 00191 long long sofar, long long maximal); 00192 extern long long computeNoOfPathsToEnd(LexemGraph lg, GraphemNode gn, 00193 long long sofar, long long maximal); 00194 extern void lgComputeDistances(LexemGraph lg); 00195 extern void lgComputeNoOfPaths(LexemGraph lg); 00196 extern void lgCopyTagScores(LexemGraph destination, LexemGraph source); 00197 extern void lgDelete(LexemGraph lg); 00198 extern void lgDeleteNode(LexemGraph lg, LexemNode n); 00199 extern void lgDeleteNodes(LexemGraph lg, List nodes); 00200 extern void lgInitialize(); 00201 extern void lgPrint(unsigned long, LexemGraph lg); 00202 extern void lgPrintNode(unsigned long mode, LexemNode ln); 00203 extern void lgRequireLexeme(LexemGraph lg, ByteVector v, LexemNode ln); 00204 extern void lgRequireLexemes(LexemGraph lg, ByteVector v, List which); 00205 #endif 00206 00207 /* ---------------------------------------------------------------------- */ 00208 00209 #endif /* don't insert anything after this #endif */