/*

   The BLAH library, a container library
   Copyright (C) 1997-2004 The CDG Team <cdg@nats.informatik.uni-hamburg.de>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

   Contact: blah@nats.informatik.uni-hamburg.de

   $Id: string.c,v 1.11 2004/09/23 15:47:14 micha Exp $

*/

/* ---------------------------------------------------------------------------
 * @defgroup String Strings
 * implementation of strings.
 *
 * Strings in C are represented by arrays of characters. The end of the string
 * is marked with a special character, the null character, which is simply the
 * character with the value 0. Whenever we write a string, enclosed in double
 * quotes, C automatically creates an array of characters for us, containing
 * that string, terminated by the null character.
 *
 * FIXME: this comment is plain copy/pasted from cdg.c
 *
 * The symbol table (\ref cdgSymbolTable) is used to share strings registered
 * to it (cdgRegisterString()). Sharing strings speeds up string comparison
 * a lot as we don't need strcmp for this any more. A pointer comparison
 * suffices. So we store strings in a hash.
But 00042 * be warned: changing a registered string directly will break things seriously. 00043 * If you need to change a registered string, make a copy of it (strCopy), 00044 * manipulate it for your needs and then register it once again. 00045 * @{ 00046 */ 00047 00048 /* ------------------------------------------------------------------------ */ 00049 #include <stdio.h> 00050 #include <string.h> 00051 #include <assert.h> 00052 #include <iconv.h> 00053 #include "blah.h" 00054 //#define DEBUG_DECODING 00055 //#define DEBUG_STRING 00056 00057 /* --------------------------------------------------------------------------- 00058 * container for shared strings. 00059 * This global variable stores all registered strings 00060 */ 00061 static Hashtable _strStore = NULL; 00062 00063 /* --------------------------------------------------------------------------- 00064 * strings with reference counters 00065 */ 00066 typedef struct { 00067 String data; /**< the workload */ 00068 int counter; /**< a reference counter */ 00069 #ifdef DEBUG_STRING 00070 int magic; /**< magic value */ 00071 #endif 00072 } SharedStringStruct; 00073 typedef SharedStringStruct *SharedString; 00074 00075 static SharedString _strNewSharedString(const String str); 00076 static void _strDeleteSharedString(SharedString); 00077 static void _strDeleteStoreEntry(String, SharedString); 00078 static SharedString _strLookup(String); 00079 static String _strTryRegister(String); 00080 #ifdef DEBUG_STRING 00081 static void _strDebugStoreEntry(String, SharedString); 00082 #endif 00083 00084 /* 00085 * Conversion descriptor for String decoding with iconv (3). 00086 * This is a handler created with iconv_open (3). 00087 */ 00088 static iconv_t _strConversionDescriptor = (iconv_t)-1; 00089 00090 /* --------------------------------------------------------------------------- 00091 * this performs string copying function. 00092 * 00093 * This function constructs a copy of the given source string. 
The returned string is 00094 * not shared any more as its source might have been. So in order to manipulate 00095 * a shared string, first strCopy() it, then alter it and strRegister() it finaly. 00096 * While copying the string new memory is allocated for you. Take care of it. 00097 * 00098 * @param s the string that has to be copied. 00099 * @returns the new copied string. 00100 */ 00101 String strCopy(const String s) 00102 { 00103 String t = strdup(s); 00104 00105 if (t == NULL) { 00106 fprintf(stderr, "No memory left\n"); 00107 abort(); 00108 } 00109 return t; 00110 } 00111 00112 /* -------------------------------------------------------------------------- 00113 * returns a formated string. 00114 * This function is our version of vsprintf(). See strPrintf() for more 00115 * information. 00116 */ 00117 String strVPrintf(const String fmt, va_list ap) 00118 { 00119 /* Guess we need no more than 50 bytes. */ 00120 int n, size = 50; 00121 String p = (String)memMalloc(size), q; 00122 00123 while (1) { 00124 /* Try to print in the allocated space. */ 00125 n = vsnprintf(p, size, fmt, ap); 00126 /* If that worked, return the string. */ 00127 if (n > -1 && n < size) { 00128 break; 00129 } 00130 /* Else try again with more space. */ 00131 if (n > -1) /* glibc 2.1 */ 00132 size = n + 1; /* precisely what is needed */ 00133 else /* glibc 2.0 */ 00134 size *= 2; /* twice the old size */ 00135 p = memRealloc(p, size); 00136 } 00137 00138 q = strRegister(p); 00139 memFree(p); 00140 return q; 00141 } 00142 00143 /* -------------------------------------------------------------------------- 00144 * returns a formated string. 00145 * This function basically has been taken from the sprintf() manual page. 00146 * The differences between sprintf() and strPrintf() are that you don't have 00147 * to bother about memory allocation. We allocate enuf memory to hold the 00148 * formated result string. 
Further more then this string is strRegister()ed 00149 * for you, so you might get an already shared string returned to you. 00150 * Use strDelete() to indicate your lake of interest on the result string. 00151 */ 00152 String strPrintf(const String fmt, ...) 00153 { 00154 String str; 00155 va_list ap; 00156 00157 va_start(ap, fmt); 00158 str = strVPrintf(fmt, ap); 00159 va_end(ap); 00160 00161 return str; 00162 } 00163 00164 /* -------------------------------------------------------------------------- 00165 * lookup a string in the string store. 00166 * This function returns a pointer to a SharedString if the given string 00167 * argument is already shared, or NULL if this string isn't shared yet. 00168 */ 00169 static SharedString _strLookup(String str) 00170 { 00171 if (!str) 00172 return NULL; 00173 else 00174 return (SharedString) hashGet(_strStore, str); 00175 } 00176 00177 /* -------------------------------------------------------------------------- 00178 * try to register a new string. 00179 * This function only registers new strings. It will not increase the 00180 * reference counter of an already registered string. In any case it will 00181 * return a known string. 00182 */ 00183 static String _strTryRegister(String str) 00184 { 00185 SharedString sstr = _strLookup(str); 00186 00187 if (!sstr) 00188 return strRegister(str); 00189 else 00190 return str; 00191 } 00192 00193 /* -------------------------------------------------------------------------- 00194 * concatenates two strings. 00195 * 00196 * This is our version of the standard unix strcat() with the differences 00197 * that both arguments are \c const strings. A concatenated shared string of 00198 * \c a and \c b is returned. Both arguments might be NULL. 00199 * 00200 * @returns the target concatenated with the source. 
00201 */ 00202 String strCat(const String a, const String b) 00203 { 00204 if (a) { 00205 if (b) { 00206 return strPrintf("%s%s", a, b); 00207 } else { 00208 return _strTryRegister(a); 00209 } 00210 } else { 00211 if (b) { 00212 return _strTryRegister(b); 00213 } else { 00214 return strRegister(""); 00215 } 00216 } 00217 } 00218 00219 /* -------------------------------------------------------------------------- 00220 * concatenates a list of strings. 00221 * 00222 * This function takes a list of strings and concatenates them together in 00223 * a newly allocated string. Be sure that all list elements are realy of 00224 * type string. We can't grant that here. If the list is NULL or empty 00225 * NULL is returned to you. The return value is a registered string. 00226 * 00227 * @param list of strings 00228 * @returns the new appended string. 00229 */ 00230 String strFromList(List list) 00231 { 00232 String str, result; 00233 List l; 00234 int len; 00235 00236 if (!list || listSize(list) == 0) { 00237 #if 0 00238 return strCopy(""); 00239 #else 00240 return NULL; 00241 #endif 00242 } 00243 00244 len = 1; 00245 for (l = list; l; l = listNext(l)) { 00246 len += strlen((String)listElement(l)); 00247 } 00248 00249 str = (String)memMalloc(sizeof(String) * len); 00250 str[0] = '\0'; 00251 00252 for (l = list; l; l = listNext(l)) { 00253 strcat(str, listElement(l)); 00254 } 00255 00256 result = strRegister(str); 00257 memFree(str); 00258 return result; 00259 } 00260 00261 /* --------------------------------------------------------------------------- 00262 * concatenates many strings together. 00263 * 00264 * This function allocates the memory for the result string. 00265 * The argument strings are not modified by the function. 00266 * The last string in the argument list must be NULL. 00267 * 00268 * @param str the head of the string to be produced 00269 * @param ... represents the strings to be appended to the head. 00270 * @returns the new appended string. 
00271 */ 00272 String strAppend(const String head, ...) 00273 { 00274 String str; 00275 List l = NULL; 00276 va_list ap; 00277 00278 va_start(ap, head); 00279 l = listAppendElement(l, head); 00280 while(1) { 00281 str = va_arg(ap, String); 00282 if (!str) 00283 break; 00284 l = listAppendElement(l, str); 00285 } 00286 va_end(ap); 00287 str = strFromList(l); 00288 listDelete(l); 00289 00290 return str; 00291 } 00292 00293 /* --------------------------------------------------------------------------- 00294 * allocate a new SharedString. 00295 * This function constructs a new SharedString. It contains no worload data yet. 00296 */ 00297 static SharedString _strNewSharedString(const String str) 00298 { 00299 SharedString sstr = memMalloc(sizeof(SharedStringStruct)); 00300 sstr->data = str; 00301 sstr->counter = 1; 00302 #ifdef DEBUG_STRING 00303 sstr->magic = 4711; 00304 #endif 00305 00306 return sstr; 00307 } 00308 00309 #ifdef DEBUG_STRING 00310 /* --------------------------------------------------------------------------- 00311 * deallocate a key value pair. 00312 * This is used to deallocate the key and the value of the \c _strStore. 00313 */ 00314 static void _strDebugStoreEntry(String key, SharedString value) 00315 { 00316 int strcmpResult = strcmp(key, value->data); 00317 fprintf(stderr, "DEBUG: _strDebugStoreEntry('%s', '%p')\n", key, value); 00318 assert(strcmpResult == 0); 00319 } 00320 #endif 00321 00322 /* --------------------------------------------------------------------------- 00323 * deallocate a key value pair. 00324 * This is used to deallocate the key and the value of the \c _strStore. 00325 */ 00326 static void _strDeleteStoreEntry(String key, SharedString value) 00327 { 00328 _strDeleteSharedString(value); /* frees the key aswell */ 00329 } 00330 00331 /* --------------------------------------------------------------------------- 00332 * deallocated a SharedString. 00333 * This function deallocates a SharedString and its workload. 
00334 */ 00335 static void _strDeleteSharedString(SharedString sstr) 00336 { 00337 if(sstr) { 00338 #ifdef DEBUG_STRING 00339 //fprintf(stderr, "DEBUG: freeing shared string '%s'\n", sstr->data); 00340 assert(sstr->magic == 4711); 00341 #endif 00342 memFree(sstr->data); 00343 memFree(sstr); 00344 } 00345 } 00346 00347 /* --------------------------------------------------------------------------- 00348 * unregister a string 00349 * This function tries deallocate the \c str string when its reference 00350 * counter licenses it. Note, that the pointer \c str might get invalid 00351 * or not depending on the reference counter. 00352 */ 00353 void strDelete(String str) 00354 { 00355 SharedString sstr = _strLookup(str); 00356 00357 if (sstr && --sstr->counter <= 0) { 00358 hashRemove(_strStore, str); 00359 _strDeleteSharedString(sstr); 00360 } 00361 } 00362 00363 /* --------------------------------------------------------------------------- 00364 * register a string in symbol table. 00365 * 00366 * This function registers a string to be shared. This is done by copying 00367 * it into the \c _strStore (leaving the argument string pointer untouched). 00368 * If the string already exists in symbol table the _stored_ string 00369 * is returned. 00370 * If the string doesn't exist then the string is _copied_ and 00371 * entered in the symbol table; the new string is returned. 00372 * In no case the memory of string s is referenced by the symbol table. 00373 * But the returned string is owned by the symbol table and will be 00374 * shared by other references later on. So be careful and never change 00375 * a registered string in place. Use strCopy() first to check out a copy 00376 * of a shared string. 00377 * If \c str is NULL then a registered empty string is returned, that is 00378 * strRegister(NULL) == strRegister(""). 
00379 */ 00380 String strRegister(const String str) 00381 { 00382 SharedString sstr; 00383 String thisStr = str; 00384 00385 if (!thisStr) { 00386 thisStr = ""; 00387 } 00388 00389 sstr = _strLookup(thisStr); 00390 00391 if (!sstr) { 00392 sstr = _strNewSharedString(strCopy(thisStr)); /* diff to _strShare */ 00393 hashSet(_strStore, sstr->data, (Pointer) sstr); 00394 } else { 00395 sstr->counter++; 00396 } 00397 #ifdef DEBUG_STRING 00398 assert(sstr->magic == 4711); 00399 #endif 00400 return sstr->data; 00401 } 00402 00403 /* --------------------------------------------------------------------------- 00404 * return the number of shared strings. 00405 */ 00406 int strStoreSize(void) 00407 { 00408 return hashSize(_strStore); 00409 } 00410 00411 /* --------------------------------------------------------------------------- 00412 * module finalization routine. 00413 * This function is only called by blahInitialize() and should not be 00414 * used from outside 00415 * It basically deallocates the \c _strStore. 00416 * It also closes the conversionHandler. 
00417 */ 00418 void strFinalize(void) 00419 { 00420 if (_strStore) { 00421 #ifndef DEBUG_STRING 00422 hashForEachFree((Hashtable)_strStore, _strDeleteStoreEntry); 00423 #else 00424 List keys = hashListOfKeys(_strStore); 00425 List l; 00426 fprintf(stderr, "DEBUG: nr keys=%d size=%d\n", listSize(keys), hashSize(_strStore)); 00427 assert(listSize(keys) == hashSize(_strStore)); 00428 for (l = keys; l; l = listNext(l)) { 00429 fprintf(stderr, "DEBUG: key='%s'\n", listElement(l)); 00430 } 00431 hashForEach((Hashtable)_strStore, _strDeleteStoreEntry); 00432 #endif 00433 } 00434 00435 /* close conversion Descriptor */ 00436 if (_strConversionDescriptor != (iconv_t)-1) { 00437 iconv_close(_strConversionDescriptor); 00438 #ifdef DEBUG_DECODING 00439 fprintf(stderr, "DEBUG: Conversion descriptor successfully closed\n"); 00440 #endif 00441 } 00442 00443 } 00444 00445 /* ---------------------------------------------------------------------------- 00446 Translate a string from unicode UTF-8 to ISO-8859-1. 00447 00448 The String will be left untouched if there is any problem while decoding. 00449 */ 00450 String strDecode(String word) { 00451 String result, s, inbuf, outbuf; 00452 int inbytesleft, outbytesleft; 00453 00454 if(!word) { 00455 return NULL; 00456 } 00457 if((iconv_t)-1 == _strConversionDescriptor) { 00458 #ifdef DEBUG_DECODING 00459 fprintf(stderr, "DEBUG: Conversion Descriptor not available, conversion skipped!\n"); 00460 #endif 00461 return word; 00462 } 00463 00464 result = memMalloc(strlen(word) +1); 00465 strcpy(result, ""); 00466 00467 inbuf = word; // don't use the original pointers when calling iconv() 00468 outbuf = result; 00469 outbytesleft = inbytesleft = strlen(word) + 1; 00470 00471 /* convert the string 00472 * iconv modifies all it's arguments, so we have to take care, when using 00473 * them later. 
00474 * 'result' automatically points to the converted string when conversion is done 00475 */ 00476 if (iconv(_strConversionDescriptor, &inbuf, &inbytesleft, &outbuf, &outbytesleft) == -1) { 00477 #ifdef DEBUG_DECODING 00478 fprintf(stderr, "DEBUG: Conversion from UTF-8 to ISO-8859-1 failed for %s!\n", word); 00479 #endif 00480 strcpy(result, word); // take the original word as fallback 00481 } 00482 00483 #ifdef DEBUG_DECODING 00484 if(strcmp(word, result)) { 00485 fprintf(stderr, "DEBUG: normalised `%s' to `%s'\n", word, result); 00486 } 00487 #endif 00488 00489 s = strRegister(result); 00490 memFree(result); 00491 return s; 00492 } 00493 00494 /* --------------------------------------------------------------------------- 00495 * module initialization routine. 00496 * This function is only called by blahInitialize() and should not be 00497 * used from outside It basically allocates the \c _strStore. 00498 * It also sets the conversion handler for String decoding 00499 */ 00500 void strInitialize(void) 00501 { 00502 _strStore = 00503 hashNew(500, 0.7, hashStringHashFunction, hashStringEqualFunction); 00504 00505 /* set conversion descriptor: UTF-8 to ISO-8859-1 */ 00506 _strConversionDescriptor = iconv_open("ISO-8859-1", "UTF-8"); 00507 #ifdef DEBUG_DECODING 00508 if (_strConversionDescriptor == (iconv_t) -1) { 00509 fprintf(stderr, "DEBUG: Opening UTF-8 to ISO-8859-1 conversion handler failed!\n"); 00510 } 00511 #endif 00512 00513 } 00514 00515 /* ------------------------------------------------------------------------- */ 00516 /** @} */