*** misc/libtextcat-2.2/configure Thu May 22 13:39:55 2003 --- misc/build/libtextcat-2.2/configure Tue Nov 27 13:51:14 2007 *************** *** 5391,5397 **** allow_undefined_flag= no_undefined_flag= need_lib_prefix=unknown ! need_version=unknown # when you set need_version to no, make sure it does not cause -set_version # flags to be left without arguments archive_cmds= --- 5391,5398 ---- allow_undefined_flag= no_undefined_flag= need_lib_prefix=unknown ! #need_version=unknown ! need_version=no # when you set need_version to no, make sure it does not cause -set_version # flags to be left without arguments archive_cmds= *************** *** 5785,5791 **** # cross-compilation, but unfortunately the echo tests do not # yet detect zsh echo's removal of \ escapes. Also zsh mangles # `"' quotes if we put them in here... so don't! ! archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$linker_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' # We need to add '_' to the symbols in $export_symbols first #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols' hardcode_direct=yes --- 5786,5792 ---- # cross-compilation, but unfortunately the echo tests do not # yet detect zsh echo's removal of \ escapes. Also zsh mangles # `"' quotes if we put them in here... so don't! ! archive_cmds='$CC -r -keep_private_externs -nostdlib -o ${lib}-master.o $libobjs && $CC $(test .$module = .yes && echo -bundle || echo -dynamiclib) $allow_undefined_flag -o $lib ${lib}-master.o $deplibs$compiler_flags $(test .$module != .yes && echo -install_name $rpath/$soname $verstring)' # We need to add '_' to the symbols in $export_symbols first #archive_expsym_cmds="$archive_cmds"' && strip -s $export_symbols' hardcode_direct=yes *************** *** 6280,6286 **** ;; freebsd*) ! 
objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout` version_type=freebsd-$objformat case $version_type in freebsd-elf*) --- 6281,6287 ---- ;; freebsd*) ! objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo elf` version_type=freebsd-$objformat case $version_type in freebsd-elf*) *** misc/libtextcat-2.2/src/Makefile.in Thu May 22 13:39:52 2003 --- misc/build/libtextcat-2.2/src/Makefile.in Tue Nov 27 13:49:17 2007 *************** *** 124,143 **** target_vendor = @target_vendor@ AUTOMAKE_OPTIONS = 1.4 foreign ! WARNS = -W -Wall -Wshadow -Wpointer-arith ! IFLAGS = ! FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE VERBOSE = -DVERBOSE AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) AM_LDFLAGS = -g noinst_HEADERS = \ ! common.h constants.h fingerprint.h textcat.h wg_mempool.h lib_LTLIBRARIES = libtextcat.la libtextcat_la_SOURCES = \ ! common.c fingerprint.c textcat.c wg_mempool.c bin_PROGRAMS = createfp --- 124,143 ---- target_vendor = @target_vendor@ AUTOMAKE_OPTIONS = 1.4 foreign ! #WARNS = -W -Wall -Wshadow -Wpointer-arith ! IFLAGS = ! #FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE VERBOSE = -DVERBOSE AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) AM_LDFLAGS = -g noinst_HEADERS = \ ! common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h lib_LTLIBRARIES = libtextcat.la libtextcat_la_SOURCES = \ ! common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c bin_PROGRAMS = createfp *************** *** 156,162 **** libtextcat_la_LDFLAGS = libtextcat_la_LIBADD = am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ ! wg_mempool.lo libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) bin_PROGRAMS = createfp$(EXEEXT) noinst_PROGRAMS = testtextcat$(EXEEXT) --- 156,162 ---- libtextcat_la_LDFLAGS = libtextcat_la_LIBADD = am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \ ! 
wg_mempool.lo utf8misc.lo libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS) bin_PROGRAMS = createfp$(EXEEXT) noinst_PROGRAMS = testtextcat$(EXEEXT) *************** *** 177,183 **** @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ ! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ --- 177,184 ---- @AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \ @AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \ @AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \ ! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \ ! @AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \ $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \ *************** *** 213,219 **** @rm -f stamp-h1 cd $(top_builddir) && $(SHELL) ./config.status src/config.h ! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) cd $(top_srcdir) && $(AUTOHEADER) touch $(srcdir)/config.h.in --- 214,220 ---- @rm -f stamp-h1 cd $(top_builddir) && $(SHELL) ./config.status src/config.h ! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4) cd $(top_srcdir) && $(AUTOHEADER) touch $(srcdir)/config.h.in *************** *** 247,254 **** echo "rm -f \"$${dir}/so_locations\""; \ rm -f "$${dir}/so_locations"; \ done ! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) ! $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) --- 248,255 ---- echo "rm -f \"$${dir}/so_locations\""; \ rm -f "$${dir}/so_locations"; \ done ! 
libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES) ! $(LINK) -avoid-version -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS) binPROGRAMS_INSTALL = $(INSTALL_PROGRAM) install-binPROGRAMS: $(bin_PROGRAMS) @$(NORMAL_INSTALL) *************** *** 285,294 **** echo " rm -f $$p $$f"; \ rm -f $$p $$f ; \ done ! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) @rm -f createfp$(EXEEXT) $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) ! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) @rm -f testtextcat$(EXEEXT) $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) --- 286,295 ---- echo " rm -f $$p $$f"; \ rm -f $$p $$f ; \ done ! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES) @rm -f createfp$(EXEEXT) $(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS) ! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES) @rm -f testtextcat$(EXEEXT) $(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS) *************** *** 304,309 **** --- 305,311 ---- @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@ + @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@ distclean-depend: -rm -rf ./$(DEPDIR) *** misc/libtextcat-2.2/src/common.c Thu May 22 13:32:43 2003 --- misc/build/libtextcat-2.2/src/common.c Tue Nov 27 13:49:17 2007 *************** *** 3,25 **** * * Copyright (c) 2003, WiseGuys Internet B.V. * All rights reserved. ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! 
* * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR --- 3,25 ---- * * Copyright (c) 2003, WiseGuys Internet B.V. * All rights reserved. ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR *************** *** 114,124 **** wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); } ! return( result ); } ! extern void* wg_realloc( void *ptr, size_t size ) ! 
{ void *result; if (!size) { --- 114,124 ---- wgmem_error( "Error while strduping %u bytes.\n", strlen(s) ); } ! return( result ); } ! extern void* wg_realloc( void *ptr, size_t size ) ! { void *result; if (!size) { *************** *** 131,137 **** wgmem_error( "Error while reallocing %u bytes.\n", size ); } ! return( result ); } extern void wg_free( void *mem ) --- 131,137 ---- wgmem_error( "Error while reallocing %u bytes.\n", size ); } ! return( result ); } extern void wg_free( void *mem ) *************** *** 148,159 **** if ( fgets(line, size, fp) == NULL ) { return NULL; } ! /** kill term null **/ if ( (p = strpbrk( line, "\n\r" )) ) { *p = '\0'; ! } ! return line; } --- 148,159 ---- if ( fgets(line, size, fp) == NULL ) { return NULL; } ! /** kill term null **/ if ( (p = strpbrk( line, "\n\r" )) ) { *p = '\0'; ! } ! return line; } *************** *** 164,202 **** * * ARGUMENTS: * - result: ! * * After the split, this array contains pointers to the start of each * detected segment. Must be preallocated and at least as large as * maxsegments. The pointers point into the dest buffer. ! * ! * - dest: ! * * String into which result points as an index. Must be preallocated, and * at least as big as src. You can use src as dest, but in that case src * is overwritten! ! * ! * - src: ! * * The string to split. Sequences of whitespace are treated as separators, unless * escaped. There are two ways to escape: by using single quotes (anything * between single quotes is treated as one segment), or by using a backslash * to escape the next character. The backslash escape works inside quotation * as well. ! * * Example: ! * * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: ! * * "It's" * "very easy" * "to use WiseGuys' wg_split()" * "function" ! * ! * - maxsegments: ! * * The maximum number of segments. If the splitter runs out of segments, * the remainder of the string is stored in the last segment. ! 
* * RETURN VALUE: * The number of segments found. */ --- 164,202 ---- * * ARGUMENTS: * - result: ! * * After the split, this array contains pointers to the start of each * detected segment. Must be preallocated and at least as large as * maxsegments. The pointers point into the dest buffer. ! * ! * - dest: ! * * String into which result points as an index. Must be preallocated, and * at least as big as src. You can use src as dest, but in that case src * is overwritten! ! * ! * - src: ! * * The string to split. Sequences of whitespace are treated as separators, unless * escaped. There are two ways to escape: by using single quotes (anything * between single quotes is treated as one segment), or by using a backslash * to escape the next character. The backslash escape works inside quotation * as well. ! * * Example: ! * * "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into: ! * * "It's" * "very easy" * "to use WiseGuys' wg_split()" * "function" ! * ! * - maxsegments: ! * * The maximum number of segments. If the splitter runs out of segments, * the remainder of the string is stored in the last segment. ! * * RETURN VALUE: * The number of segments found. */ *************** *** 218,229 **** switch (state) { case 0: /*** Skip spaces ***/ ! while ( isspace((int) *p) ) { p++; } state = 1; ! case 1: /*** Start segment ***/ result[cnt] = w; cnt++; --- 218,229 ---- switch (state) { case 0: /*** Skip spaces ***/ ! while ( isspace((unsigned char) *p) ) { p++; } state = 1; ! case 1: /*** Start segment ***/ result[cnt] = w; cnt++; *************** *** 232,243 **** case 2: /*** Unquoted segment ***/ while (*p) { ! if ( isspace((int) *p) ) { *w++ = '\0'; p++; state = 0; break; ! } else if ( *p == '\'' ) { /*** Start quotation ***/ p++; --- 232,243 ---- case 2: /*** Unquoted segment ***/ while (*p) { ! if ( isspace((unsigned char) *p) ) { *w++ = '\0'; p++; state = 0; break; ! 
} else if ( *p == '\'' ) { /*** Start quotation ***/ p++; *************** *** 292,308 **** } extern void wg_timerstart(wgtimer_t *t) { - #ifdef HAVE_GETTIMEOFDAY gettimeofday( &(t->start), NULL ); - #endif } extern uint4 wg_timerstop(wgtimer_t *t) { - #ifdef HAVE_GETTIMEOFDAY uint4 result; gettimeofday( &(t->stop), NULL ); result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + --- 292,308 ---- } + #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ extern void wg_timerstart(wgtimer_t *t) { gettimeofday( &(t->start), NULL ); } + #endif /* TL : no struct timeval under Win32 */ + #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ extern uint4 wg_timerstop(wgtimer_t *t) { uint4 result; gettimeofday( &(t->stop), NULL ); result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 + *************** *** 312,336 **** t->start.tv_usec = t->stop.tv_usec; return result; - #else - return 0; - #endif } /** * wg_strgmov -- a guarded strcpy() variation ! * * copies src to dest (including terminating zero), and returns * pointer to position of terminating zero in dest. The function is * guaranteed not to write past destlimit. If the copy couldn't be ! * finished, the function returns NULL after restoring the first ! * character in dest for your convenience (since this is usually a zero). */ char *wg_strgmov( char *dest, const char *src, const char *destlimit ) { char tmp, *w; ! if ( !dest || dest >= destlimit ) { return NULL; } --- 312,334 ---- t->start.tv_usec = t->stop.tv_usec; return result; } + #endif /* TL : no struct timeval under Win32 */ /** * wg_strgmov -- a guarded strcpy() variation ! * * copies src to dest (including terminating zero), and returns * pointer to position of terminating zero in dest. The function is * guaranteed not to write past destlimit. If the copy couldn't be ! * finished, the function returns NULL after restoring the first ! * character in dest for your convenience (since this is usually a zero). 
*/ char *wg_strgmov( char *dest, const char *src, const char *destlimit ) { char tmp, *w; ! if ( !dest || dest >= destlimit ) { return NULL; } *************** *** 355,361 **** } /* ! * wg_trim() -- remove whitespace surrounding a string. * * Example: " bla bla bla " becomes "bla bla bla" after trimming. * --- 353,359 ---- } /* ! * wg_trim() -- remove whitespace surrounding a string. * * Example: " bla bla bla " becomes "bla bla bla" after trimming. * *************** *** 373,384 **** char *lastnonspace = &dest[-1]; const char *p = src; char *w = dest; ! ! while ( isspace((int)*p) ) { p++; } while (*p) { ! if ( !isspace((int)*p) ) { lastnonspace = w; } *w++ = *p++; --- 371,382 ---- char *lastnonspace = &dest[-1]; const char *p = src; char *w = dest; ! ! while ( isspace((unsigned char)*p) ) { p++; } while (*p) { ! if ( !isspace((unsigned char)*p) ) { lastnonspace = w; } *w++ = *p++; *** misc/libtextcat-2.2/src/common.h Thu May 22 15:02:29 2003 --- misc/build/libtextcat-2.2/src/common.h Tue Nov 27 13:49:17 2007 *************** *** 1,28 **** #ifndef _COMMON_H_ #define _COMMON_H_ /** ! * common.h -- a mixed bag of helper functions * * Copyright (C) 2003 WiseGuys Internet B.V. * * THE BSD LICENSE ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR --- 1,28 ---- #ifndef _COMMON_H_ #define _COMMON_H_ /** ! * common.h -- a mixed bag of helper functions * * Copyright (C) 2003 WiseGuys Internet B.V. * * THE BSD LICENSE ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR *************** *** 86,95 **** --- 86,97 ---- typedef char boole; #endif + #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ typedef struct wgtimer_s { struct timeval start; struct timeval stop; } wgtimer_t; + #endif /* TL : no struct timeval under Win32 */ extern void *wg_malloc( size_t size ); *************** *** 101,113 **** extern char *wg_getline( char *line, int size, FILE *fp ); extern void wg_timerstart(wgtimer_t *t); extern uint4 wg_timerstop(wgtimer_t *t); extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); extern char *wg_trim( char *dest, const char *src ); ! #endif --- 103,117 ---- extern char *wg_getline( char *line, int size, FILE *fp ); + #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */ extern void wg_timerstart(wgtimer_t *t); extern uint4 wg_timerstop(wgtimer_t *t); + #endif /* TL : no struct timeval under Win32 */ extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments ); extern char *wg_strgmov( char *dest, const char *src, const char *destlimit ); extern char *wg_trim( char *dest, const char *src ); ! #endif *** misc/libtextcat-2.2/src/constants.h Thu May 22 13:32:43 2003 --- misc/build/libtextcat-2.2/src/constants.h Tue Nov 27 13:49:17 2007 *************** *** 39,44 **** --- 39,46 ---- */ #include + #define _UTF8_ + #define DESCRIPTION "out of place" /* Reported matches are those fingerprints with a score less than best *************** *** 59,72 **** /* Maximum number of n-grams in a fingerprint */ #define MAXNGRAMS 400 ! /* Maximum size of an n-gram? */ ! #define MAXNGRAMSIZE 5 /* Which characters are not acceptable in n-grams? */ ! 
#define INVALID(c) (isspace((int)c) || isdigit((int)c)) /* Minimum size (in characters) for accepting a document */ ! #define MINDOCSIZE 25 /* Maximum penalty for missing an n-gram in fingerprint */ #define MAXOUTOFPLACE 400 --- 61,81 ---- /* Maximum number of n-grams in a fingerprint */ #define MAXNGRAMS 400 ! /* Maximum number of characters (symbols) in an n-gram */ ! #define MAXNGRAMSYMBOL 5 + /* Maximum size in bytes of the string representing an n-gram (must be at least the number of symbols) */ + #ifdef _UTF8_ + #define MAXNGRAMSIZE 20 + #else + #define MAXNGRAMSIZE MAXNGRAMSYMBOL + #endif + /* Which characters are not acceptable in n-grams? */ ! #define INVALID(c) (isspace((unsigned char)c) || isdigit((unsigned char)c)) /* Minimum size (in characters) for accepting a document */ ! #define MINDOCSIZE 6 /* Maximum penalty for missing an n-gram in fingerprint */ #define MAXOUTOFPLACE 400 *************** *** 76,79 **** --- 85,91 ---- #define MAXSCORE INT_MAX + /* where the fingerprints files are stored */ + #define DEFAULT_FINGERPRINTS_PATH "" + #endif *** misc/libtextcat-2.2/src/fingerprint.c Thu May 22 13:32:43 2003 --- misc/build/libtextcat-2.2/src/fingerprint.c Tue Nov 27 13:49:18 2007 *************** *** 6,28 **** * All rights reserved. * * THE BSD LICENSE ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR --- 6,28 ---- * All rights reserved. * * THE BSD LICENSE ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR *************** *** 51,57 **** * The reason why we go through the trouble of doing a partial * (heap)sort is that a full quicksort behaves horribly on the data: * most n-grams have a very low count, resulting in a data set in ! * nearly-sorted order. This causes quicksort to behave very badly. * Heapsort, on the other hand, behaves handsomely: worst case is * Mlog(N) for M n-grams filtered through a N-sized heap. * --- 51,57 ---- * The reason why we go through the trouble of doing a partial * (heap)sort is that a full quicksort behaves horribly on the data: * most n-grams have a very low count, resulting in a data set in ! * nearly-sorted order. This causes quicksort to behave very badly. 
* Heapsort, on the other hand, behaves handsomely: worst case is * Mlog(N) for M n-grams filtered through a N-sized heap. * *************** *** 63,68 **** --- 63,72 ---- * - put table/heap datastructure in a separate file. */ + #ifndef _UTF8_ + #define _UTF8_ + #endif + #include "config.h" #include #ifdef HAVE_STDLIB_H *************** *** 80,89 **** --- 84,95 ---- #include "wg_mempool.h" #include "constants.h" + #include "utf8misc.h" #define TABLESIZE (1<table[ hash ]; ! ! while ( entry ) { if ( issame( entry->str, p, len ) ) { /*** Found it! ***/ entry->cnt++; --- 140,153 ---- } /* increases frequency of ngram(p,len) */ ! static int increasefreq( table_t *t, char *p, int len ) ! { ! uint4 hash = simplehash( p, len ) & TABLEMASK; entry_t *entry = t->table[ hash ]; ! ! while ( entry ) { if ( issame( entry->str, p, len ) ) { /*** Found it! ***/ entry->cnt++; *************** *** 168,174 **** } /*** Not found, so create ***/ ! entry = wgmempool_alloc( t->pool, sizeof(entry_t) ); strcpy( entry->str, p ); entry->cnt = 1; --- 159,165 ---- } /*** Not found, so create ***/ ! entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) )); strcpy( entry->str, p ); entry->cnt = 1; *************** *** 181,192 **** #if 0 /* looks up ngram(p,len) */ ! static entry_t *findfreq( table_t *t, char *p, int len ) ! { ! uint4 hash = simplehash( p, len ) & TABLEMASK; entry_t *entry = t->table[ hash ]; ! ! while ( entry ) { if ( issame( entry->str, p, len ) ) { return entry; } --- 172,183 ---- #if 0 /* looks up ngram(p,len) */ ! static entry_t *findfreq( table_t *t, char *p, int len ) ! { ! uint4 hash = simplehash( p, len ) & TABLEMASK; entry_t *entry = t->table[ hash ]; ! ! while ( entry ) { if ( issame( entry->str, p, len ) ) { return entry; } *************** *** 219,225 **** #define GREATER(x,y) ((x).cnt > (y).cnt) #define LESS(x,y) ((x).cnt < (y).cnt) ! 
inline static void siftup( table_t *t, unsigned int child ) { entry_t *heap = t->heap; unsigned int parent = (child-1) >> 1; --- 210,216 ---- #define GREATER(x,y) ((x).cnt > (y).cnt) #define LESS(x,y) ((x).cnt < (y).cnt) ! static void siftup( table_t *t, unsigned int child ) { entry_t *heap = t->heap; unsigned int parent = (child-1) >> 1; *************** *** 241,247 **** } ! inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) { entry_t *heap = t->heap; unsigned int child = parent*2 + 1; --- 232,238 ---- } ! static void siftdown( table_t *t, unsigned int heapsize, uint4 parent ) { entry_t *heap = t->heap; unsigned int child = parent*2 + 1; *************** *** 273,279 **** if (t->size < t->heapsize) { memcpy( &(heap[t->size]), item, sizeof(entry_t)); siftup( t, t->size ); ! t->size++; return 0; } --- 264,270 ---- if (t->size < t->heapsize) { memcpy( &(heap[t->size]), item, sizeof(entry_t)); siftup( t, t->size ); ! t->size++; return 0; } *************** *** 316,333 **** /*** Fill result heap ***/ for (i=0; itable[i]; while (p) { heapinsert(t, p); p = p->next; } ! } return 1; } static table_t *inittable(uint4 maxngrams) ! { table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); result->pool = wgmempool_Init( 10000, 10 ); --- 307,324 ---- /*** Fill result heap ***/ for (i=0; itable[i]; while (p) { heapinsert(t, p); p = p->next; } ! } return 1; } static table_t *inittable(uint4 maxngrams) ! { table_t *result = (table_t *)wg_zalloc( sizeof(table_t) ); result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE ); result->pool = wgmempool_Init( 10000, 10 ); *************** *** 347,353 **** wgmempool_Done(t->pool); wg_free(t->table); wg_free(t->heap); ! wg_free(t); } --- 338,344 ---- wgmempool_Done(t->pool); wg_free(t->table); wg_free(t->heap); ! 
wg_free(t); } *************** *** 354,360 **** extern void *fp_Init(const char *name) { fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); ! if ( name ) { h->name = wg_strdup(name); } --- 345,351 ---- extern void *fp_Init(const char *name) { fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) ); ! if ( name ) { h->name = wg_strdup(name); } *************** *** 458,478 **** return dest; } ! static void createngramtable( table_t *t, const char *buf ) { char n[MAXNGRAMSIZE+1]; const char *p = buf; int i; /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ ! for (;;p++) { ! const char *q = p; char *m = n; /*** First char may be an underscore ***/ ! *m++ = *q++; *m = '\0'; increasefreq( t, n, 1 ); --- 449,475 ---- return dest; } ! /** ! * this function extracts all n-grams from the passed buffer and puts them into the table "t" ! * [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice ! */ static void createngramtable( table_t *t, const char *buf ) { char n[MAXNGRAMSIZE+1]; const char *p = buf; int i; + int pointer = 0; /*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/ ! while(1) { ! const char *q = &p[pointer]; /*[modified] previously p++ above (for(;;p++)); now it is pointer which is increased, so we have to take the new position in the buffer*/ char *m = n; /*** First char may be an underscore ***/ ! int decay = charcopy(q, m); /*[modified] previously *m++ = *q++*/ ! q = &(p[pointer+decay]); /*[modified] the old copying method does not handle multi-character symbols*/ ! m += decay; /*[modified]*/ *m = '\0'; increasefreq( t, n, 1 ); *************** *** 482,500 **** } /*** Let the compiler unroll this ***/ ! for ( i=2; i<=MAXNGRAMSIZE; i++) { ! *m++ = *q; *m = '\0'; increasefreq( t, n, i ); if ( *q == '_' ) break; ! q++; if ( *q == '\0' ) { return; } } } return; } --- 479,500 ---- } /*** Let the compiler unroll this ***/ ! for ( i=2; i<=MAXNGRAMSYMBOL; i++) { ! 
decay = charcopy(q, m); /*[modified] like above*/ ! m += decay; *m = '\0'; increasefreq( t, n, i ); if ( *q == '_' ) break; ! q += decay; if ( *q == '\0' ) { return; } } + + pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point to the start of the next symbol, but with utf-8 the next start is not necessarily the next char*/ } return; } *************** *** 514,520 **** { ngram_t *x = (ngram_t *)a; ngram_t *y = (ngram_t *)b; ! return mystrcmp( x->str, y->str ); } --- 514,520 ---- { ngram_t *x = (ngram_t *)a; ngram_t *y = (ngram_t *)b; ! return mystrcmp( x->str, y->str ); } *************** *** 522,533 **** { ngram_t *x = (ngram_t *)a; ngram_t *y = (ngram_t *)b; ! return x->rank - y->rank; } /** ! * Create a fingerprint: * - record the frequency of each unique n-gram in a hash table * - take the most frequent n-grams * - sort them alphabetically, recording their relative rank --- 522,533 ---- { ngram_t *x = (ngram_t *)a; ngram_t *y = (ngram_t *)b; ! return x->rank - y->rank; } /** ! * Create a fingerprint: * - record the frequency of each unique n-gram in a hash table * - take the most frequent n-grams * - sort them alphabetically, recording their relative rank *************** *** 544,563 **** } /*** Throw out all invalid chars ***/ ! tmp = prepbuffer( buffer, bufsize ); if ( tmp == NULL ) { return 0; } - h = (fp_t*)handle; t = inittable(maxngrams); /*** Create a hash table containing n-gram counts ***/ createngramtable(t, tmp); ! /*** Take the top N n-grams and add them to the profile ***/ ! table2heap(t); ! maxngrams = WGMIN( maxngrams, t->size ); h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); h->size = maxngrams; --- 544,564 ---- } /*** Throw out all invalid chars ***/ ! tmp = prepbuffer( buffer, bufsize ); ! /*printf("Cleaned buffer : %s\n",tmp);*/ if ( tmp == NULL ) { return 0; } h = (fp_t*)handle; t = inittable(maxngrams); + /*printf("Table initialized\n");*/ /*** Create a hash table containing n-gram counts ***/ createngramtable(t, tmp); ! 
/*printf("Table created\n");*/ /*** Take the top N n-grams and add them to the profile ***/ ! table2heap(t); ! maxngrams = WGMIN( maxngrams, t->size ); h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams ); h->size = maxngrams; *************** *** 568,574 **** entry_t tmp2; heapextract(t, &tmp2); ! /*** the string and its rank is all we need ***/ strcpy( h->fprint[i].str, tmp2.str ); h->fprint[i].rank = i; --- 569,575 ---- entry_t tmp2; heapextract(t, &tmp2); ! /*** the string and its rank is all we need ***/ strcpy( h->fprint[i].str, tmp2.str ); h->fprint[i].rank = i; *************** *** 578,584 **** wg_free(tmp); /*** Sort n-grams alphabetically, for easy comparison ***/ ! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); return 1; } --- 579,585 ---- wg_free(tmp); /*** Sort n-grams alphabetically, for easy comparison ***/ ! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); return 1; } *************** *** 608,614 **** #endif return 0; } ! h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); while (cnt < maxngrams && wg_getline(line,1024,fp)) { --- 609,615 ---- #endif return 0; } ! h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t)); while (cnt < maxngrams && wg_getline(line,1024,fp)) { *************** *** 635,641 **** h->size = cnt; /*** Sort n-grams, for easy comparison later on ***/ ! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); fclose(fp); --- 636,642 ---- h->size = cnt; /*** Sort n-grams, for easy comparison later on ***/ ! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str ); fclose(fp); *************** *** 648,661 **** { uint4 i; fp_t *h = (fp_t *)handle; ! ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size ); ! /*** Make a temporary and sort it on rank ***/ memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); ! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); for (i=0; i<h->size; i++) { ! 
fprintf( fp, "%s\n", tmp[i].str ); } wg_free( tmp ); } --- 649,663 ---- { uint4 i; fp_t *h = (fp_t *)handle; ! ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size ); ! /*** Make a temporary and sort it on rank ***/ memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) ); ! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank ); for (i=0; i<h->size; i++) { ! /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/ ! fprintf( fp, "%s\n", tmp[i].str); } wg_free( tmp ); } *************** *** 669,675 **** uint4 i = 0; uint4 j = 0; sint4 sum = 0; ! /*** Compare the profiles in mergesort fashion ***/ while ( i < c->size && j < u->size ) { --- 671,677 ---- uint4 i = 0; uint4 j = 0; sint4 sum = 0; ! /*** Compare the profiles in mergesort fashion ***/ while ( i < c->size && j < u->size ) { *************** *** 705,711 **** } return sum; ! } --- 707,713 ---- } return sum; ! } *** misc/libtextcat-2.2/src/fingerprint.h Mon May 19 14:16:31 2003 --- misc/build/libtextcat-2.2/src/fingerprint.h Tue Nov 27 13:49:18 2007 *************** *** 41,47 **** --- 41,53 ---- extern int fp_Read( void *handle, const char *fname, int maxngrams ); extern sint4 fp_Compare( void *cat, void *unknown, int cutoff ); extern void fp_Show( void *handle ); + #ifdef __cplusplus + extern "C" { + #endif extern const char *fp_Name( void *handle ); + #ifdef __cplusplus + } + #endif extern void fp_Print( void *handle, FILE *fp ); #endif *** misc/libtextcat-2.2/src/libtextcat.map Tue Nov 27 13:51:28 2007 --- misc/build/libtextcat-2.2/src/libtextcat.map Tue Nov 27 13:49:18 2007 *************** *** 1 **** ! dummy --- 1,40 ---- ! { ! global: ! charcopy ! issame ! nextcharstart ! utfstrlen ! wgmempool_Done ! wgmempool_Init ! wgmempool_Reset ! wgmempool_alloc ! wgmempool_getline ! wgmempool_strdup ! special_textcat_Init ! textcat_Classify ! textcat_Done ! textcat_Init ! textcat_Version ! fp_Compare ! fp_Create ! fp_Debug ! fp_Done ! fp_Init ! fp_Name ! fp_Print ! fp_Read ! heapextract ! wg_calloc ! wg_free ! 
wg_getline ! wg_malloc ! wg_split ! wg_strdup ! wg_strgmov ! wg_trim ! wg_zalloc ! wgmem_error ! ! local: ! *; ! } *** misc/libtextcat-2.2/src/makefile.mk Tue Nov 27 13:51:28 2007 --- misc/build/libtextcat-2.2/src/makefile.mk Tue Nov 27 13:49:18 2007 *************** *** 1 **** ! dummy --- 1,92 ---- ! #************************************************************************* ! # ! # $RCSfile: libtextcat-2.2.patch,v $ ! # ! # $Revision: 1.7 $ ! # ! # last change: $Author: obo $ $Date: 2008-01-04 15:02:30 $ ! # ! #* The Contents of this file are made available subject to ! #* the terms of GNU Lesser General Public License Version 2.1. ! #* ! #* ! #* GNU Lesser General Public License Version 2.1 ! #* ============================================= ! #* Copyright 2005 by Sun Microsystems, Inc. ! #* 901 San Antonio Road, Palo Alto, CA 94303, USA ! #* ! #* This library is free software; you can redistribute it and/or ! #* modify it under the terms of the GNU Lesser General Public ! #* License version 2.1, as published by the Free Software Foundation. ! #* ! #* This library is distributed in the hope that it will be useful, ! #* but WITHOUT ANY WARRANTY; without even the implied warranty of ! #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ! #* Lesser General Public License for more details. ! #* ! #* You should have received a copy of the GNU Lesser General Public ! #* License along with this library; if not, write to the Free Software ! #* Foundation, Inc., 59 Temple Place, Suite 330, Boston, ! #* MA 02111-1307 USA ! #* ! #************************************************************************* ! ! PRJ = ..$/..$/..$/..$/.. ! ! PRJNAME = libtextcat ! TARGET = libtextcat ! CFLAGSCALL=gsd ! ! USE_DEFFILE=TRUE ! EXTERNAL_WARNINGS_NOT_ERRORS := TRUE ! ! .INCLUDE : settings.mk ! ! # --- Files -------------------------------------------------------- ! ! # !! not to be compiled because those belong to a stand alone programs: !! ! # $(SLO)$/createfp.obj\ ! 
# $(SLO)$/testtextcat.obj ! ! SLOFILES= \ ! $(SLO)$/common.obj\ ! $(SLO)$/fingerprint.obj\ ! $(SLO)$/textcat.obj\ ! $(SLO)$/wg_mempool.obj\ ! $(SLO)$/utf8misc.obj ! ! #SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX) ! SHL1TARGET= $(TARGET) ! ! SHL1STDLIBS= ! ! # build DLL ! SHL1LIBS= $(SLB)$/$(TARGET).lib ! SHL1IMPLIB= i$(TARGET) ! SHL1DEPN= $(SHL1LIBS) ! SHL1DEF= $(MISC)$/$(SHL1TARGET).def ! ! # build DEF file ! DEF1NAME= $(SHL1TARGET) ! DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt ! ! SHL1VERSIONMAP= libtextcat.map ! ! # --- Targets ------------------------------------------------------ ! ! .INCLUDE : target.mk ! ! # copy hand supplied configuration file for Win32 builds to the file ! # which is included in the source code ! $(SLOFILES) : config.h ! config.h : ! $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h ! ! ! $(MISC)$/$(SHL1TARGET).flt: makefile.mk ! @echo ------------------------------ ! @echo Making: $@ ! @echo Imp>$@ ! @echo __CT>>$@ ! @echo _real>>$@ ! @echo unnamed>>$@ *** misc/libtextcat-2.2/src/textcat.c Thu May 22 13:32:43 2003 --- misc/build/libtextcat-2.2/src/textcat.c Tue Nov 27 13:49:18 2007 *************** *** 4,26 **** * Copyright (C) 2003 WiseGuys Internet B.V. * * THE BSD LICENSE ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR --- 4,26 ---- * Copyright (C) 2003 WiseGuys Internet B.V. * * THE BSD LICENSE ! * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: ! * * - Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. ! * * - Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the * distribution. ! * * - Neither the name of the WiseGuys Internet B.V. nor the names of * its contributors may be used to endorse or promote products derived * from this software without specific prior written permission. ! * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR *************** *** 74,79 **** --- 74,80 ---- typedef struct { void **fprint; + char *fprint_disable; uint4 size; uint4 maxsize; *************** *** 112,122 **** fp_Done( h->fprint[i] ); } wg_free( h->fprint ); wg_free( h ); } ! extern void *textcat_Init( const char *conffile ) { textcat_t *h; char line[1024]; --- 113,133 ---- fp_Done( h->fprint[i] ); } wg_free( h->fprint ); + wg_free( h->fprint_disable ); wg_free( h ); } ! /** Replaces older function */ ! extern void *textcat_Init( const char *conffile ){ ! return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH ); ! } ! ! /** ! * Originaly this function had only one parameter (conffile) it has been modified since OOo use ! * Basicaly prefix is the directory path where fingerprints are stored ! */ ! 
extern void *special_textcat_Init( const char *conffile, const char *prefix ) { textcat_t *h; char line[1024]; *************** *** 134,144 **** h->size = 0; h->maxsize = 16; h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); while ( wg_getline( line, 1024, fp ) ) { char *p; char *segment[4]; ! int res; /*** Skip comments ***/ #ifdef HAVE_STRCHR --- 145,157 ---- h->size = 0; h->maxsize = 16; h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize ); + h->fprint_disable = (char *)wg_malloc( sizeof(char) * h->maxsize ); /*added to store the state of languages: one flag byte per fingerprint slot*/ while ( wg_getline( line, 1024, fp ) ) { char *p; char *segment[4]; ! char finger_print_file_name[512]; ! int res; /*** Skip comments ***/ #ifdef HAVE_STRCHR *************** *** 156,162 **** /*** Ensure enough space ***/ if ( h->size == h->maxsize ) { h->maxsize *= 2; ! h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); } /*** Load data ***/ --- 169,176 ---- /*** Ensure enough space ***/ if ( h->size == h->maxsize ) { h->maxsize *= 2; ! h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize ); ! h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char) * h->maxsize ); } /*** Load data ***/ *************** *** 163,172 **** if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { goto ERROR; } ! if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) { textcat_Done(h); goto ERROR; ! } h->size++; } --- 177,191 ---- if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) { goto ERROR; } ! finger_print_file_name[0] = '\0'; ! strncat(finger_print_file_name, prefix, sizeof(finger_print_file_name) - 1); /*bounded append: never write past the 512-byte buffer*/ ! strncat(finger_print_file_name, segment[0], sizeof(finger_print_file_name) - strlen(finger_print_file_name) - 1); /*bounded append: an over-long path is truncated instead of overrunning the buffer*/ ! ! 
if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) { textcat_Done(h); goto ERROR; ! } ! h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/ h->size++; } *************** *** 203,213 **** result = _TEXTCAT_RESULT_SHORT; goto READY; } ! 
/*** Calculate the score for each category. ***/ for (i=0; i<h->size; i++) { ! int score = fp_Compare( h->fprint[i], unknown, threshold ); ! candidates[i].score = score; candidates[i].name = fp_Name( h->fprint[i] ); if ( score < minscore ) { minscore = score; --- 222,239 ---- result = _TEXTCAT_RESULT_SHORT; goto READY; } ! /*** Calculate the score for each category. ***/ for (i=0; i<h->size; i++) { ! int score; ! if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/ ! score = MAXSCORE; ! } ! else{ ! score = fp_Compare( h->fprint[i], unknown, threshold ); ! /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/ ! } ! candidates[i].score = score; candidates[i].name = fp_Name( h->fprint[i] ); if ( score < minscore ) { minscore = score; *************** *** 218,224 **** /*** Find the best performers ***/ for (i=0; i<h->size; i++) { if ( candidates[i].score < threshold ) { - if ( ++cnt == MAXCANDIDATES+1 ) { break; } --- 244,249 ---- *************** *** 235,241 **** else { char *p = result; char *plimit = result+MAXOUTPUTSIZE; ! qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); *p = '\0'; --- 260,266 ---- else { char *p = result; char *plimit = result+MAXOUTPUTSIZE; ! qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates ); *p = '\0'; *************** *** 247,253 **** } READY: fp_Done(unknown); ! #ifdef SHOULD_FREE free(candidates); #undef SHOULD_FREE #endif --- 272,278 ---- } READY: fp_Done(unknown); ! #ifdef SHOULD_FREE free(candidates); #undef SHOULD_FREE #endif *** misc/libtextcat-2.2/src/textcat.h Mon May 19 14:16:31 2003 --- misc/build/libtextcat-2.2/src/textcat.h Tue Nov 27 13:49:18 2007 *************** *** 40,45 **** --- 40,48 ---- #define _TEXTCAT_RESULT_UNKOWN "UNKNOWN" #define _TEXTCAT_RESULT_SHORT "SHORT" + #ifdef __cplusplus + extern "C" { + #endif /** * textcat_Init() - Initialize the text classifier. The textfile *************** *** 51,60 **** --- 54,72 ---- * Returns: handle on success, NULL on error. 
(At the moment, the * only way errors can occur, is when the library cannot read the * conffile, or one of the fingerprint files listed in it.) + * + * Replaces the older function (and has exactly the same behaviour) + * see below */ extern void *textcat_Init( const char *conffile ); /** + * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB + * Basically, prefix is the directory path where fingerprints are stored + */ + extern void *special_textcat_Init( const char *conffile, const char *prefix ); + + /** * textcat_Done() - Free up resources for handle */ extern void textcat_Done( void *handle ); *************** *** 77,80 **** --- 89,96 ---- * textcat_Version() - Returns a string describing the version of this classifier. */ extern char *textcat_Version(); + + #ifdef __cplusplus + } + #endif #endif *** misc/libtextcat-2.2/src/utf8misc.c Tue Nov 27 13:51:28 2007 --- misc/build/libtextcat-2.2/src/utf8misc.c Tue Nov 27 13:49:18 2007 *************** *** 1 **** ! dummy --- 1,132 ---- ! /*************************************************************************** ! * Copyright (C) 2006 by Jocelyn Merand * ! * joc.mer@gmail.com * ! * * ! * THE BSD LICENSE ! * ! * Redistribution and use in source and binary forms, with or without ! * modification, are permitted provided that the following conditions ! * are met: ! * ! * - Redistributions of source code must retain the above copyright ! * notice, this list of conditions and the following disclaimer. ! * ! * - Redistributions in binary form must reproduce the above copyright ! * notice, this list of conditions and the following disclaimer in the ! * documentation and/or other materials provided with the ! * distribution. ! * ! * - Neither the name of the WiseGuys Internet B.V. nor the names of ! * its contributors may be used to endorse or promote products derived ! * from this software without specific prior written permission. ! * ! 
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ! ***************************************************************************/ ! ! #ifndef _UTF8_MISC_H_ ! #include "utf8misc.h" ! #endif ! ! /* Returns the index of the byte following the symbol that starts at str[position]; never steps past the terminating \0. */ ! int nextcharstart(const char *str, int position){ ! int pointer = position; ! ! if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ! ! /*then str[pointer] is an escape character*/ ! ! unsigned char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit shifting) the following characters (only the most significant part); unsigned so that the shifts below stay well defined*/ ! ! while(escape_char & ESCAPE_MASK && str[pointer]){/*at every step we shift the byte 1 bit left; when the first bit is 0, it's finished*/ ! escape_char = escape_char <<1; ! ++pointer; ! } ! } ! if(str[pointer]){ /*finally, if we are not on the \0 character, we jump to the next character*/ ! ++pointer; ! } ! return pointer; ! } ! ! /* Copies the bytes of the symbol starting at str into dest (no \0 is appended) and returns how many bytes were copied. */ ! int charcopy(const char *str, char *dest){ ! ! int pointer = 0; ! 
if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ! ! /*then str[pointer] is an escape character*/ ! ! unsigned char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the most significant part); unsigned so that the shifts below stay well defined*/ ! ! 
while(escape_char & ESCAPE_MASK && str[pointer]){ /*at every step we shift the byte 1 bit left; when the first bit is 0, it's finished*/ ! dest[pointer] = str[pointer]; ! escape_char = escape_char <<1; ! ++pointer; ! } ! } ! if(str[pointer]){ ! dest[pointer] = str[pointer]; ! ++pointer; ! } ! ! return pointer; ! } ! ! /* Returns 1 if lex is byte-for-byte equal to the first len symbols of key and ends right after them, 0 otherwise. */ ! int issame( char *lex, char *key, int len ) ! { ! /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/ ! int char_counter = 0; ! int pointer = 0; ! while(char_counter < len) { ! ! if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/ ! ! /*then key[pointer] is an escape character*/ ! ! unsigned char escape_char = ((key[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (only the most significant part); unsigned so that the shifts below stay well defined*/ ! ! while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){ ! escape_char = escape_char <<1; ! ++pointer; ! } ! } ! ++char_counter; /*and we are on a new utf8 character*/ ! if ( key[pointer] != lex[pointer] ) { ! return 0; ! /*printf(" NO\n", lex, key, len);*/ ! } ! ++pointer; ! } ! if ( lex[pointer] != '\0' ) { ! return 0; ! /*printf(" NO\n");*/ ! } ! ! /*printf(" YES\n");*/ ! ! return 1; ! } ! ! /* Returns the number of symbols in str, as delimited by nextcharstart. */ ! extern int utfstrlen(const char* str){ ! int char_counter = 0; ! int pointer = 0; ! while(str[pointer]) { ! pointer = nextcharstart(str, pointer); ! ! ++char_counter; /*and we are on a new utf8 character*/ ! } ! return char_counter; ! } ! *** misc/libtextcat-2.2/src/utf8misc.h Tue Nov 27 13:51:28 2007 --- misc/build/libtextcat-2.2/src/utf8misc.h Tue Nov 27 13:49:18 2007 *************** *** 1 **** ! dummy --- 1,88 ---- ! /*************************************************************************** ! * Copyright (C) 2006 by Jocelyn Merand * ! * joc.mer@gmail.com * ! * * ! * THE BSD LICENSE ! * ! 
* Redistribution and use in source and binary forms, with or without ! * modification, are permitted provided that the following conditions ! * are met: ! * ! * - Redistributions of source code must retain the above copyright ! 
* notice, this list of conditions and the following disclaimer. ! * ! * - Redistributions in binary form must reproduce the above copyright ! * notice, this list of conditions and the following disclaimer in the ! * documentation and/or other materials provided with the ! * distribution. ! * ! * - Neither the name of the WiseGuys Internet B.V. nor the names of ! * its contributors may be used to endorse or promote products derived ! * from this software without specific prior written permission. ! * ! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ! ***************************************************************************/ ! ! #ifndef _UTF8_MISC_H_ ! #define _UTF8_MISC_H_ ! ! /** ! * These variables are used in character processing functions ! * These have been added to manage utf-8 symbols, particularly escape chars ! */ ! #ifdef _UTF8_ ! #define ESCAPE_MASK 0x80 ! #define WEIGHT_MASK 0xF0 ! #else ! #define ESCAPE_MASK 0xFF ! #define WEIGHT_MASK 0x00 ! #endif ! ! ! /* ! * Is used to jump to the next start of char ! * of course it's only usefull when encoding is utf-8 ! * This function have been added by Jocelyn Merand to use libtextcat in OOo ! */ ! int nextcharstart(const char *str, int position); ! ! ! 
/*Copy the char in str to dest ! * of course it's only useful when encoding is utf8 and the symbol is encoded with more than 1 char ! * return the number of char jumped ! * This function has been added by Jocelyn Merand to use libtextcat in OOo ! */ ! int charcopy(const char *str, char *dest); ! ! ! /* checks if n-gram lex is a prefix of key and of length len ! * if _UTF8_ is defined, it uses escape characters and len is not really the length of lex ! * in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1 ! */ ! int issame( char *lex, char *key, int len ); ! ! ! /* Counts the number of characters ! * if _UTF8_ is defined, it uses escape characters and the result is not really the length of str ! * in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1 ! */ ! #ifdef __cplusplus ! extern "C" { ! #endif ! extern int utfstrlen(const char* str); ! #ifdef __cplusplus ! } ! #endif ! ! #endif ! *** misc/libtextcat-2.2/src/win32_config.h Tue Nov 27 13:51:28 2007 --- misc/build/libtextcat-2.2/src/win32_config.h Tue Nov 27 13:49:18 2007 *************** *** 1 **** ! dummy --- 1,136 ---- ! /* src/config.h. Generated by configure. */ ! /* src/config.h.in. Generated from configure.ac by autoheader. */ ! ! /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP ! systems. This function is required for `alloca.c' support on those systems. ! */ ! /* #undef CRAY_STACKSEG_END */ ! ! /* Define to 1 if using `alloca.c'. */ ! /* #undef C_ALLOCA */ ! ! /* Define to 1 if you have `alloca', as a function or macro. */ ! /* #undef HAVE_ALLOCA */ ! ! /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix). ! */ ! /* #undef HAVE_ALLOCA_H */ ! ! /* Define to 1 if you have the <dlfcn.h> header file. */ ! #define HAVE_DLFCN_H 1 ! ! /* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */ ! /* #undef HAVE_DOPRNT */ ! ! /* Define to 1 if you have the `gettimeofday' function. */ ! /* #undef HAVE_GETTIMEOFDAY */ ! 
! /* Define to 1 if you have the <inttypes.h> header file. */ ! /* #undef HAVE_INTTYPES_H */ ! ! /* Define to 1 if you have the <limits.h> header file. */ ! #define HAVE_LIMITS_H 1 ! ! /* Define to 1 if your system has a GNU libc compatible `malloc' function, and ! to 0 otherwise. */ ! #define HAVE_MALLOC 1 ! ! /* Define to 1 if you have the <memory.h> header file. */ ! #define HAVE_MEMORY_H 1 ! ! /* Define to 1 if you have the `memset' function. */ ! #define HAVE_MEMSET 1 ! ! /* Define to 1 if your system has a GNU libc compatible `realloc' function, ! and to 0 otherwise. */ ! #define HAVE_REALLOC 1 ! ! /* Define to 1 if you have the <stdint.h> header file. */ ! /* #undef HAVE_STDINT_H */ ! ! /* Define to 1 if you have the <stdlib.h> header file. */ ! #define HAVE_STDLIB_H 1 ! ! /* Define to 1 if you have the `strchr' function. */ ! #define HAVE_STRCHR 1 ! ! /* Define to 1 if you have the `strdup' function. */ ! #define HAVE_STRDUP 1 ! ! /* Define to 1 if you have the <strings.h> header file. */ ! /* #undef HAVE_STRINGS_H */ ! ! /* Define to 1 if you have the <string.h> header file. */ ! #define HAVE_STRING_H 1 ! ! /* Define to 1 if you have the `strpbrk' function. */ ! #define HAVE_STRPBRK 1 ! ! /* Define to 1 if you have the <sys/stat.h> header file. */ ! #define HAVE_SYS_STAT_H 1 ! ! /* Define to 1 if you have the <sys/time.h> header file. */ ! /* #undef HAVE_SYS_TIME_H */ ! ! /* Define to 1 if you have the <sys/types.h> header file. */ ! #define HAVE_SYS_TYPES_H 1 ! ! /* Define to 1 if you have the <unistd.h> header file. */ ! #define HAVE_UNISTD_H 1 ! ! /* Define to 1 if you have the `vprintf' function. */ ! #define HAVE_VPRINTF 1 ! ! /* Name of package */ ! #define PACKAGE "libtextcat" ! ! /* Define to the address where bug reports for this package should be sent. */ ! #define PACKAGE_BUGREPORT "" ! ! /* Define to the full name of this package. */ ! #define PACKAGE_NAME "libtextcat" ! ! /* Define to the full name and version of this package. */ ! 
#define PACKAGE_STRING "libtextcat 2.2" ! ! /* Define to the one symbol short name of this package. */ ! 
#define PACKAGE_TARNAME "libtextcat" ! ! /* Define to the version of this package. */ ! #define PACKAGE_VERSION "2.2" ! ! /* If using the C implementation of alloca, define if you know the ! direction of stack growth for your system; otherwise it will be ! automatically deduced at run-time. ! STACK_DIRECTION > 0 => grows toward higher addresses ! STACK_DIRECTION < 0 => grows toward lower addresses ! STACK_DIRECTION = 0 => direction of growth unknown */ ! /* #undef STACK_DIRECTION */ ! ! /* Define to 1 if you have the ANSI C header files. */ ! #define STDC_HEADERS 1 ! ! /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */ ! #define TIME_WITH_SYS_TIME 1 ! ! /* Define to 1 if your <sys/time.h> declares `struct tm'. */ ! /* #undef TM_IN_SYS_TIME */ ! ! /* Version number of package */ ! #define VERSION "2.2" ! ! /* Define to empty if `const' does not conform to ANSI C. */ ! /* #undef const */ ! ! /* Define as `__inline' if that's what the C compiler calls it, or to nothing ! if it is not supported. */ ! /* #undef inline */ ! ! /* Define to rpl_malloc if the replacement function should be used. */ ! /* #undef malloc */ ! ! /* Define to rpl_realloc if the replacement function should be used. */ ! /* #undef realloc */ ! ! /* Define to `unsigned' if <sys/types.h> does not define. */ ! /* #undef size_t */