office-gobmx/libtextcat/libtextcat-2.2.patch
Jens-Heiner Rechtien 59cfd92712 INTEGRATION: CWS tl48_SRC680 (1.3.8); FILE MERGED
2007/10/09 12:44:01 tl 1.3.8.1: #i81311# fixed warnings from the C runtime library
2007-11-01 11:09:41 +00:00

2253 lines
64 KiB
Diff

*** misc/libtextcat-2.2/configure Thu May 22 13:39:55 2003
--- misc/build/libtextcat-2.2/configure Thu Nov 1 12:53:31 2007
***************
*** 5391,5397 ****
allow_undefined_flag=
no_undefined_flag=
need_lib_prefix=unknown
! need_version=unknown
# when you set need_version to no, make sure it does not cause -set_version
# flags to be left without arguments
archive_cmds=
--- 5391,5398 ----
allow_undefined_flag=
no_undefined_flag=
need_lib_prefix=unknown
! #need_version=unknown
! need_version=no
# when you set need_version to no, make sure it does not cause -set_version
# flags to be left without arguments
archive_cmds=
***************
*** 6280,6286 ****
;;
freebsd*)
! objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo aout`
version_type=freebsd-$objformat
case $version_type in
freebsd-elf*)
--- 6281,6287 ----
;;
freebsd*)
! objformat=`test -x /usr/bin/objformat && /usr/bin/objformat || echo elf`
version_type=freebsd-$objformat
case $version_type in
freebsd-elf*)
*** misc/libtextcat-2.2/src/Makefile.in Thu May 22 13:39:52 2003
--- misc/build/libtextcat-2.2/src/Makefile.in Thu Nov 1 12:53:31 2007
***************
*** 124,143 ****
target_vendor = @target_vendor@
AUTOMAKE_OPTIONS = 1.4 foreign
! WARNS = -W -Wall -Wshadow -Wpointer-arith
! IFLAGS =
! FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
VERBOSE = -DVERBOSE
AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
AM_LDFLAGS = -g
noinst_HEADERS = \
! common.h constants.h fingerprint.h textcat.h wg_mempool.h
lib_LTLIBRARIES = libtextcat.la
libtextcat_la_SOURCES = \
! common.c fingerprint.c textcat.c wg_mempool.c
bin_PROGRAMS = createfp
--- 124,143 ----
target_vendor = @target_vendor@
AUTOMAKE_OPTIONS = 1.4 foreign
! #WARNS = -W -Wall -Wshadow -Wpointer-arith
! IFLAGS =
! #FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
VERBOSE = -DVERBOSE
AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
AM_LDFLAGS = -g
noinst_HEADERS = \
! common.h constants.h fingerprint.h textcat.h wg_mempool.h utf8misc.h
lib_LTLIBRARIES = libtextcat.la
libtextcat_la_SOURCES = \
! common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c
bin_PROGRAMS = createfp
***************
*** 156,162 ****
libtextcat_la_LDFLAGS =
libtextcat_la_LIBADD =
am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \
! wg_mempool.lo
libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS)
bin_PROGRAMS = createfp$(EXEEXT)
noinst_PROGRAMS = testtextcat$(EXEEXT)
--- 156,162 ----
libtextcat_la_LDFLAGS =
libtextcat_la_LIBADD =
am_libtextcat_la_OBJECTS = common.lo fingerprint.lo textcat.lo \
! wg_mempool.lo utf8misc.lo
libtextcat_la_OBJECTS = $(am_libtextcat_la_OBJECTS)
bin_PROGRAMS = createfp$(EXEEXT)
noinst_PROGRAMS = testtextcat$(EXEEXT)
***************
*** 177,183 ****
@AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \
@AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \
@AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \
! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \
--- 177,184 ----
@AMDEP_TRUE@DEP_FILES = ./$(DEPDIR)/common.Plo ./$(DEPDIR)/createfp.Po \
@AMDEP_TRUE@ ./$(DEPDIR)/fingerprint.Plo \
@AMDEP_TRUE@ ./$(DEPDIR)/testtextcat.Po ./$(DEPDIR)/textcat.Plo \
! @AMDEP_TRUE@ ./$(DEPDIR)/wg_mempool.Plo \
! @AMDEP_TRUE@ ./$(DEPDIR)/utf8misc.Plo
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) \
***************
*** 213,219 ****
@rm -f stamp-h1
cd $(top_builddir) && $(SHELL) ./config.status src/config.h
! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4)
cd $(top_srcdir) && $(AUTOHEADER)
touch $(srcdir)/config.h.in
--- 214,220 ----
@rm -f stamp-h1
cd $(top_builddir) && $(SHELL) ./config.status src/config.h
! $(srcdir)/config.h.in: $(top_srcdir)/configure.ac $(ACLOCAL_M4)
cd $(top_srcdir) && $(AUTOHEADER)
touch $(srcdir)/config.h.in
***************
*** 247,254 ****
echo "rm -f \"$${dir}/so_locations\""; \
rm -f "$${dir}/so_locations"; \
done
! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES)
! $(LINK) -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS)
binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
install-binPROGRAMS: $(bin_PROGRAMS)
@$(NORMAL_INSTALL)
--- 248,255 ----
echo "rm -f \"$${dir}/so_locations\""; \
rm -f "$${dir}/so_locations"; \
done
! libtextcat.la: $(libtextcat_la_OBJECTS) $(libtextcat_la_DEPENDENCIES)
! $(LINK) -avoid-version -rpath $(libdir) $(libtextcat_la_LDFLAGS) $(libtextcat_la_OBJECTS) $(libtextcat_la_LIBADD) $(LIBS)
binPROGRAMS_INSTALL = $(INSTALL_PROGRAM)
install-binPROGRAMS: $(bin_PROGRAMS)
@$(NORMAL_INSTALL)
***************
*** 285,294 ****
echo " rm -f $$p $$f"; \
rm -f $$p $$f ; \
done
! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES)
@rm -f createfp$(EXEEXT)
$(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS)
! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES)
@rm -f testtextcat$(EXEEXT)
$(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS)
--- 286,295 ----
echo " rm -f $$p $$f"; \
rm -f $$p $$f ; \
done
! createfp$(EXEEXT): $(createfp_OBJECTS) $(createfp_DEPENDENCIES)
@rm -f createfp$(EXEEXT)
$(LINK) $(createfp_LDFLAGS) $(createfp_OBJECTS) $(createfp_LDADD) $(LIBS)
! testtextcat$(EXEEXT): $(testtextcat_OBJECTS) $(testtextcat_DEPENDENCIES)
@rm -f testtextcat$(EXEEXT)
$(LINK) $(testtextcat_LDFLAGS) $(testtextcat_OBJECTS) $(testtextcat_LDADD) $(LIBS)
***************
*** 304,309 ****
--- 305,311 ----
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/testtextcat.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/textcat.Plo@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/wg_mempool.Plo@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/utf8misc.Plo@am__quote@
distclean-depend:
-rm -rf ./$(DEPDIR)
*** misc/libtextcat-2.2/src/common.c Thu May 22 13:32:43 2003
--- misc/build/libtextcat-2.2/src/common.c Thu Nov 1 13:06:37 2007
***************
*** 3,25 ****
*
* Copyright (c) 2003, WiseGuys Internet B.V.
* All rights reserved.
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
--- 3,25 ----
*
* Copyright (c) 2003, WiseGuys Internet B.V.
* All rights reserved.
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
***************
*** 114,124 ****
wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
}
! return( result );
}
! extern void* wg_realloc( void *ptr, size_t size )
! {
void *result;
if (!size) {
--- 114,124 ----
wgmem_error( "Error while strduping %u bytes.\n", strlen(s) );
}
! return( result );
}
! extern void* wg_realloc( void *ptr, size_t size )
! {
void *result;
if (!size) {
***************
*** 131,137 ****
wgmem_error( "Error while reallocing %u bytes.\n", size );
}
! return( result );
}
extern void wg_free( void *mem )
--- 131,137 ----
wgmem_error( "Error while reallocing %u bytes.\n", size );
}
! return( result );
}
extern void wg_free( void *mem )
***************
*** 148,159 ****
if ( fgets(line, size, fp) == NULL ) {
return NULL;
}
!
/** kill term null **/
if ( (p = strpbrk( line, "\n\r" )) ) {
*p = '\0';
! }
!
return line;
}
--- 148,159 ----
if ( fgets(line, size, fp) == NULL ) {
return NULL;
}
!
/** kill term null **/
if ( (p = strpbrk( line, "\n\r" )) ) {
*p = '\0';
! }
!
return line;
}
***************
*** 164,202 ****
*
* ARGUMENTS:
* - result:
! *
* After the split, this array contains pointers to the start of each
* detected segment. Must be preallocated and at least as large as
* maxsegments. The pointers point into the dest buffer.
! *
! * - dest:
! *
* String into which result points as an index. Must be preallocated, and
* at least as big as src. You can use src as dest, but in that case src
* is overwritten!
! *
! * - src:
! *
* The string to split. Sequences of whitespace are treated as separators, unless
* escaped. There are two ways to escape: by using single quotes (anything
* between single quotes is treated as one segment), or by using a backslash
* to escape the next character. The backslash escape works inside quotation
* as well.
! *
* Example:
! *
* "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into:
! *
* "It's"
* "very easy"
* "to use WiseGuys' wg_split()"
* "function"
! *
! * - maxsegments:
! *
* The maximum number of segments. If the splitter runs out of segments,
* the remainder of the string is stored in the last segment.
! *
* RETURN VALUE:
* The number of segments found.
*/
--- 164,202 ----
*
* ARGUMENTS:
* - result:
! *
* After the split, this array contains pointers to the start of each
* detected segment. Must be preallocated and at least as large as
* maxsegments. The pointers point into the dest buffer.
! *
! * - dest:
! *
* String into which result points as an index. Must be preallocated, and
* at least as big as src. You can use src as dest, but in that case src
* is overwritten!
! *
! * - src:
! *
* The string to split. Sequences of whitespace are treated as separators, unless
* escaped. There are two ways to escape: by using single quotes (anything
* between single quotes is treated as one segment), or by using a backslash
* to escape the next character. The backslash escape works inside quotation
* as well.
! *
* Example:
! *
* "It\'s very\ easy 'to use WiseGuys\' wg_split()' function" is split into:
! *
* "It's"
* "very easy"
* "to use WiseGuys' wg_split()"
* "function"
! *
! * - maxsegments:
! *
* The maximum number of segments. If the splitter runs out of segments,
* the remainder of the string is stored in the last segment.
! *
* RETURN VALUE:
* The number of segments found.
*/
***************
*** 218,229 ****
switch (state) {
case 0:
/*** Skip spaces ***/
! while ( isspace((int) *p) ) {
p++;
}
state = 1;
! case 1:
/*** Start segment ***/
result[cnt] = w;
cnt++;
--- 218,229 ----
switch (state) {
case 0:
/*** Skip spaces ***/
! while ( isspace((unsigned char) *p) ) {
p++;
}
state = 1;
! case 1:
/*** Start segment ***/
result[cnt] = w;
cnt++;
***************
*** 232,243 ****
case 2:
/*** Unquoted segment ***/
while (*p) {
! if ( isspace((int) *p) ) {
*w++ = '\0';
p++;
state = 0;
break;
! }
else if ( *p == '\'' ) {
/*** Start quotation ***/
p++;
--- 232,243 ----
case 2:
/*** Unquoted segment ***/
while (*p) {
! if ( isspace((unsigned char) *p) ) {
*w++ = '\0';
p++;
state = 0;
break;
! }
else if ( *p == '\'' ) {
/*** Start quotation ***/
p++;
***************
*** 292,308 ****
}
extern void wg_timerstart(wgtimer_t *t)
{
- #ifdef HAVE_GETTIMEOFDAY
gettimeofday( &(t->start), NULL );
- #endif
}
extern uint4 wg_timerstop(wgtimer_t *t)
{
- #ifdef HAVE_GETTIMEOFDAY
uint4 result;
gettimeofday( &(t->stop), NULL );
result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
--- 292,308 ----
}
+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
extern void wg_timerstart(wgtimer_t *t)
{
gettimeofday( &(t->start), NULL );
}
+ #endif /* TL : no struct timeval under Win32 */
+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
extern uint4 wg_timerstop(wgtimer_t *t)
{
uint4 result;
gettimeofday( &(t->stop), NULL );
result = (t->stop.tv_sec - t->start.tv_sec) * 1000000 +
***************
*** 312,336 ****
t->start.tv_usec = t->stop.tv_usec;
return result;
- #else
- return 0;
- #endif
}
/**
* wg_strgmov -- a guarded strcpy() variation
! *
* copies src to dest (including terminating zero), and returns
* pointer to position of terminating zero in dest. The function is
* guaranteed not to write past destlimit. If the copy couldn't be
! * finished, the function returns NULL after restoring the first
! * character in dest for your convenience (since this is usually a zero).
*/
char *wg_strgmov( char *dest, const char *src, const char *destlimit )
{
char tmp, *w;
!
if ( !dest || dest >= destlimit ) {
return NULL;
}
--- 312,334 ----
t->start.tv_usec = t->stop.tv_usec;
return result;
}
+ #endif /* TL : no struct timeval under Win32 */
/**
* wg_strgmov -- a guarded strcpy() variation
! *
* copies src to dest (including terminating zero), and returns
* pointer to position of terminating zero in dest. The function is
* guaranteed not to write past destlimit. If the copy couldn't be
! * finished, the function returns NULL after restoring the first
! * character in dest for your convenience (since this is usually a zero).
*/
char *wg_strgmov( char *dest, const char *src, const char *destlimit )
{
char tmp, *w;
!
if ( !dest || dest >= destlimit ) {
return NULL;
}
***************
*** 355,361 ****
}
/*
! * wg_trim() -- remove whitespace surrounding a string.
*
* Example: " bla bla bla " becomes "bla bla bla" after trimming.
*
--- 353,359 ----
}
/*
! * wg_trim() -- remove whitespace surrounding a string.
*
* Example: " bla bla bla " becomes "bla bla bla" after trimming.
*
***************
*** 373,384 ****
char *lastnonspace = &dest[-1];
const char *p = src;
char *w = dest;
!
! while ( isspace((int)*p) ) {
p++;
}
while (*p) {
! if ( !isspace((int)*p) ) {
lastnonspace = w;
}
*w++ = *p++;
--- 371,382 ----
char *lastnonspace = &dest[-1];
const char *p = src;
char *w = dest;
!
! while ( isspace((unsigned char)*p) ) {
p++;
}
while (*p) {
! if ( !isspace((unsigned char)*p) ) {
lastnonspace = w;
}
*w++ = *p++;
*** misc/libtextcat-2.2/src/common.h Thu May 22 15:02:29 2003
--- misc/build/libtextcat-2.2/src/common.h Thu Nov 1 12:53:31 2007
***************
*** 1,28 ****
#ifndef _COMMON_H_
#define _COMMON_H_
/**
! * common.h -- a mixed bag of helper functions
*
* Copyright (C) 2003 WiseGuys Internet B.V.
*
* THE BSD LICENSE
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
--- 1,28 ----
#ifndef _COMMON_H_
#define _COMMON_H_
/**
! * common.h -- a mixed bag of helper functions
*
* Copyright (C) 2003 WiseGuys Internet B.V.
*
* THE BSD LICENSE
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
***************
*** 86,95 ****
--- 86,97 ----
typedef char boole;
#endif
+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
typedef struct wgtimer_s {
struct timeval start;
struct timeval stop;
} wgtimer_t;
+ #endif /* TL : no struct timeval under Win32 */
extern void *wg_malloc( size_t size );
***************
*** 101,113 ****
extern char *wg_getline( char *line, int size, FILE *fp );
extern void wg_timerstart(wgtimer_t *t);
extern uint4 wg_timerstop(wgtimer_t *t);
extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
extern char *wg_trim( char *dest, const char *src );
!
#endif
--- 103,117 ----
extern char *wg_getline( char *line, int size, FILE *fp );
+ #ifdef HAVE_GETTIMEOFDAY /* TL : no struct timeval under Win32 */
extern void wg_timerstart(wgtimer_t *t);
extern uint4 wg_timerstop(wgtimer_t *t);
+ #endif /* TL : no struct timeval under Win32 */
extern unsigned int wg_split( char **result, char *dest, char *src, int maxsegments );
extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
extern char *wg_trim( char *dest, const char *src );
!
#endif
*** misc/libtextcat-2.2/src/constants.h Thu May 22 13:32:43 2003
--- misc/build/libtextcat-2.2/src/constants.h Thu Nov 1 13:05:24 2007
***************
*** 39,44 ****
--- 39,46 ----
*/
#include <limits.h>
+ #define _UTF8_
+
#define DESCRIPTION "out of place"
/* Reported matches are those fingerprints with a score less than best
***************
*** 59,72 ****
/* Maximum number of n-grams in a fingerprint */
#define MAXNGRAMS 400
! /* Maximum size of an n-gram? */
! #define MAXNGRAMSIZE 5
/* Which characters are not acceptable in n-grams? */
! #define INVALID(c) (isspace((int)c) || isdigit((int)c))
/* Minimum size (in characters) for accepting a document */
! #define MINDOCSIZE 25
/* Maximum penalty for missing an n-gram in fingerprint */
#define MAXOUTOFPLACE 400
--- 61,81 ----
/* Maximum number of n-grams in a fingerprint */
#define MAXNGRAMS 400
! /* Maximum number of characters (symbols) in an n-gram */
! #define MAXNGRAMSYMBOL 5
!
! /* Maximum size, in bytes, of the string representing an n-gram (must be at least the number of symbols) */
! #ifdef _UTF8_
! #define MAXNGRAMSIZE 20
! #else
! #define MAXNGRAMSIZE MAXNGRAMSYMBOL
! #endif
/* Which characters are not acceptable in n-grams? */
! #define INVALID(c) (isspace((unsigned char)(c)) || isdigit((unsigned char)(c))) /* cast: <ctype.h> functions need an unsigned char value or EOF */
/* Minimum size (in characters) for accepting a document */
! #define MINDOCSIZE 6
/* Maximum penalty for missing an n-gram in fingerprint */
#define MAXOUTOFPLACE 400
***************
*** 76,79 ****
--- 85,91 ----
#define MAXSCORE INT_MAX
+ /* where the fingerprints files are stored */
+ #define DEFAULT_FINGERPRINTS_PATH ""
+
#endif
*** misc/libtextcat-2.2/src/fingerprint.c Thu May 22 13:32:43 2003
--- misc/build/libtextcat-2.2/src/fingerprint.c Thu Nov 1 12:53:31 2007
***************
*** 6,28 ****
* All rights reserved.
*
* THE BSD LICENSE
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
--- 6,28 ----
* All rights reserved.
*
* THE BSD LICENSE
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
***************
*** 51,57 ****
* The reason why we go through the trouble of doing a partial
* (heap)sort is that a full quicksort behaves horribly on the data:
* most n-grams have a very low count, resulting in a data set in
! * nearly-sorted order. This causes quicksort to behave very badly.
* Heapsort, on the other hand, behaves handsomely: worst case is
* Mlog(N) for M n-grams filtered through a N-sized heap.
*
--- 51,57 ----
* The reason why we go through the trouble of doing a partial
* (heap)sort is that a full quicksort behaves horribly on the data:
* most n-grams have a very low count, resulting in a data set in
! * nearly-sorted order. This causes quicksort to behave very badly.
* Heapsort, on the other hand, behaves handsomely: worst case is
* Mlog(N) for M n-grams filtered through a N-sized heap.
*
***************
*** 63,68 ****
--- 63,72 ----
* - put table/heap datastructure in a separate file.
*/
+ #ifndef _UTF8_
+ #define _UTF8_
+ #endif
+
#include "config.h"
#include <stdio.h>
#ifdef HAVE_STDLIB_H
***************
*** 80,89 ****
--- 84,95 ----
#include "wg_mempool.h"
#include "constants.h"
+ #include "utf8misc.h"
#define TABLESIZE (1<<TABLEPOW)
#define TABLEMASK ((TABLESIZE)-1)
+
typedef struct {
sint2 rank;
***************
*** 96,102 ****
const char *name;
ngram_t *fprint;
uint4 size;
!
} fp_t;
typedef struct entry_s {
--- 102,108 ----
const char *name;
ngram_t *fprint;
uint4 size;
!
} fp_t;
typedef struct entry_s {
***************
*** 105,117 ****
struct entry_s *next;
} entry_t;
! typedef struct table_s {
void *pool;
entry_t **table;
entry_t *heap;
struct table_s *next;
!
uint4 heapsize;
uint4 size;
} table_t;
--- 111,123 ----
struct entry_s *next;
} entry_t;
! typedef struct table_s {
void *pool;
entry_t **table;
entry_t *heap;
struct table_s *next;
!
uint4 heapsize;
uint4 size;
} table_t;
***************
*** 122,128 ****
* fast and furious little hash function
*
* (Note that we could use some kind of rolling checksum, and update it
! * during n-gram construction)
*/
static uint4 simplehash( const char *p, int len )
{
--- 128,134 ----
* fast and furious little hash function
*
* (Note that we could use some kind of rolling checksum, and update it
! * during n-gram construction)
*/
static uint4 simplehash( const char *p, int len )
{
***************
*** 134,162 ****
}
- /* checks if n-gram lex is a prefix of key and of length len */
- inline int issame( char *lex, char *key, int len )
- {
- int i;
- for (i=0; i<len; i++) {
- if ( key[i] != lex[i] ) {
- return 0;
- }
- }
- if ( lex[i] != 0 ) {
- return 0;
- }
- return 1;
- }
-
/* increases frequency of ngram(p,len) */
! static inline int increasefreq( table_t *t, char *p, int len )
! {
! uint4 hash = simplehash( p, len ) & TABLEMASK;
entry_t *entry = t->table[ hash ];
!
! while ( entry ) {
if ( issame( entry->str, p, len ) ) {
/*** Found it! ***/
entry->cnt++;
--- 140,153 ----
}
/* increases frequency of ngram(p,len) */
! static int increasefreq( table_t *t, char *p, int len )
! {
! uint4 hash = simplehash( p, len ) & TABLEMASK;
entry_t *entry = t->table[ hash ];
!
! while ( entry ) {
if ( issame( entry->str, p, len ) ) {
/*** Found it! ***/
entry->cnt++;
***************
*** 168,174 ****
}
/*** Not found, so create ***/
! entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
strcpy( entry->str, p );
entry->cnt = 1;
--- 159,165 ----
}
/*** Not found, so create ***/
! entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
strcpy( entry->str, p );
entry->cnt = 1;
***************
*** 181,192 ****
#if 0
/* looks up ngram(p,len) */
! static entry_t *findfreq( table_t *t, char *p, int len )
! {
! uint4 hash = simplehash( p, len ) & TABLEMASK;
entry_t *entry = t->table[ hash ];
!
! while ( entry ) {
if ( issame( entry->str, p, len ) ) {
return entry;
}
--- 172,183 ----
#if 0
/* looks up ngram(p,len) */
! static entry_t *findfreq( table_t *t, char *p, int len )
! {
! uint4 hash = simplehash( p, len ) & TABLEMASK;
entry_t *entry = t->table[ hash ];
!
! while ( entry ) {
if ( issame( entry->str, p, len ) ) {
return entry;
}
***************
*** 219,225 ****
#define GREATER(x,y) ((x).cnt > (y).cnt)
#define LESS(x,y) ((x).cnt < (y).cnt)
! inline static void siftup( table_t *t, unsigned int child )
{
entry_t *heap = t->heap;
unsigned int parent = (child-1) >> 1;
--- 210,216 ----
#define GREATER(x,y) ((x).cnt > (y).cnt)
#define LESS(x,y) ((x).cnt < (y).cnt)
! static void siftup( table_t *t, unsigned int child )
{
entry_t *heap = t->heap;
unsigned int parent = (child-1) >> 1;
***************
*** 241,247 ****
}
! inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
{
entry_t *heap = t->heap;
unsigned int child = parent*2 + 1;
--- 232,238 ----
}
! static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
{
entry_t *heap = t->heap;
unsigned int child = parent*2 + 1;
***************
*** 273,279 ****
if (t->size < t->heapsize) {
memcpy( &(heap[t->size]), item, sizeof(entry_t));
siftup( t, t->size );
! t->size++;
return 0;
}
--- 264,270 ----
if (t->size < t->heapsize) {
memcpy( &(heap[t->size]), item, sizeof(entry_t));
siftup( t, t->size );
! t->size++;
return 0;
}
***************
*** 316,333 ****
/*** Fill result heap ***/
for (i=0; i<TABLESIZE; i++) {
! entry_t *p = t->table[i];
while (p) {
heapinsert(t, p);
p = p->next;
}
! }
return 1;
}
static table_t *inittable(uint4 maxngrams)
! {
table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
result->pool = wgmempool_Init( 10000, 10 );
--- 307,324 ----
/*** Fill result heap ***/
for (i=0; i<TABLESIZE; i++) {
! entry_t *p = t->table[i];
while (p) {
heapinsert(t, p);
p = p->next;
}
! }
return 1;
}
static table_t *inittable(uint4 maxngrams)
! {
table_t *result = (table_t *)wg_zalloc( sizeof(table_t) );
result->table = (entry_t **)wg_zalloc( sizeof(entry_t*) * TABLESIZE );
result->pool = wgmempool_Init( 10000, 10 );
***************
*** 347,360 ****
wgmempool_Done(t->pool);
wg_free(t->table);
wg_free(t->heap);
! wg_free(t);
}
extern void *fp_Init(const char *name)
{
fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
!
if ( name ) {
h->name = wg_strdup(name);
}
--- 338,351 ----
wgmempool_Done(t->pool);
wg_free(t->table);
wg_free(t->heap);
! wg_free(t);
}
extern void *fp_Init(const char *name)
{
fp_t *h = (fp_t *)wg_zalloc( sizeof(fp_t) );
!
if ( name ) {
h->name = wg_strdup(name);
}
***************
*** 458,478 ****
return dest;
}
!
static void createngramtable( table_t *t, const char *buf )
{
char n[MAXNGRAMSIZE+1];
const char *p = buf;
int i;
/*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
! for (;;p++) {
! const char *q = p;
char *m = n;
/*** First char may be an underscore ***/
! *m++ = *q++;
*m = '\0';
increasefreq( t, n, 1 );
--- 449,475 ----
return dest;
}
! /**
! * Extracts all n-grams from the passed buffer and puts them into the table "t".
! * [modified] by Jocelyn Merand to accept UTF-8 multi-byte symbols, for use in OpenOffice
! */
static void createngramtable( table_t *t, const char *buf )
{
char n[MAXNGRAMSIZE+1];
const char *p = buf;
int i;
+ int pointer = 0;
/*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
! while(1) {
! const char *q = &p[pointer]; /*[modified] previously p++ in the loop header (for(;;p++)); now the index "pointer" is advanced instead, so re-derive the current position in the buffer*/
char *m = n;
/*** First char may be an underscore ***/
! int decay = charcopy(q, m); /*[modified] previously *m++ = *q++; decay holds charcopy's return value (presumably the number of bytes copied -- confirm in utf8misc.c)*/
! q = &(p[pointer+decay]); /*[modified] the old byte-wise copy did not handle multi-byte symbols*/
! m += decay; /*[modified] advance the write position by the same amount*/
*m = '\0';
increasefreq( t, n, 1 );
***************
*** 482,500 ****
}
/*** Let the compiler unroll this ***/
! for ( i=2; i<=MAXNGRAMSIZE; i++) {
! *m++ = *q;
*m = '\0';
increasefreq( t, n, i );
if ( *q == '_' ) break;
! q++;
if ( *q == '\0' ) {
return;
}
}
}
return;
}
--- 479,500 ----
}
/*** Let the compiler unroll this ***/
! for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
! decay = charcopy(q, m); /*[modified] like above*/
! m += decay;
*m = '\0';
increasefreq( t, n, i );
if ( *q == '_' ) break;
! q += decay;
if ( *q == '\0' ) {
return;
}
}
+
+ pointer = nextcharstart(p,pointer); /*[modified] p[pointer] must point to the start of the next symbol; with UTF-8 that is not necessarily the next char*/
}
return;
}
***************
*** 514,520 ****
{
ngram_t *x = (ngram_t *)a;
ngram_t *y = (ngram_t *)b;
!
return mystrcmp( x->str, y->str );
}
--- 514,520 ----
{
ngram_t *x = (ngram_t *)a;
ngram_t *y = (ngram_t *)b;
!
return mystrcmp( x->str, y->str );
}
***************
*** 522,533 ****
{
ngram_t *x = (ngram_t *)a;
ngram_t *y = (ngram_t *)b;
!
return x->rank - y->rank;
}
/**
! * Create a fingerprint:
* - record the frequency of each unique n-gram in a hash table
* - take the most frequent n-grams
* - sort them alphabetically, recording their relative rank
--- 522,533 ----
{
ngram_t *x = (ngram_t *)a;
ngram_t *y = (ngram_t *)b;
!
return x->rank - y->rank;
}
/**
! * Create a fingerprint:
* - record the frequency of each unique n-gram in a hash table
* - take the most frequent n-grams
* - sort them alphabetically, recording their relative rank
***************
*** 544,563 ****
}
/*** Throw out all invalid chars ***/
! tmp = prepbuffer( buffer, bufsize );
if ( tmp == NULL ) {
return 0;
}
-
h = (fp_t*)handle;
t = inittable(maxngrams);
/*** Create a hash table containing n-gram counts ***/
createngramtable(t, tmp);
!
/*** Take the top N n-grams and add them to the profile ***/
! table2heap(t);
! maxngrams = WGMIN( maxngrams, t->size );
h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
h->size = maxngrams;
--- 544,564 ----
}
/*** Throw out all invalid chars ***/
! tmp = prepbuffer( buffer, bufsize );
! /*printf("Cleaned buffer : %s\n",tmp);*/
if ( tmp == NULL ) {
return 0;
}
h = (fp_t*)handle;
t = inittable(maxngrams);
+ /*printf("Table initialized\n");*/
/*** Create a hash table containing n-gram counts ***/
createngramtable(t, tmp);
! /*printf("Table created\n");*/
/*** Take the top N n-grams and add them to the profile ***/
! table2heap(t);
! maxngrams = WGMIN( maxngrams, t->size );
h->fprint = (ngram_t *)wg_malloc( sizeof(ngram_t) * maxngrams );
h->size = maxngrams;
***************
*** 568,574 ****
entry_t tmp2;
heapextract(t, &tmp2);
!
/*** the string and its rank is all we need ***/
strcpy( h->fprint[i].str, tmp2.str );
h->fprint[i].rank = i;
--- 569,575 ----
entry_t tmp2;
heapextract(t, &tmp2);
!
/*** the string and its rank is all we need ***/
strcpy( h->fprint[i].str, tmp2.str );
h->fprint[i].rank = i;
***************
*** 578,584 ****
wg_free(tmp);
/*** Sort n-grams alphabetically, for easy comparison ***/
! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
return 1;
}
--- 579,585 ----
wg_free(tmp);
/*** Sort n-grams alphabetically, for easy comparison ***/
! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
return 1;
}
***************
*** 608,614 ****
#endif
return 0;
}
!
h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
while (cnt < maxngrams && wg_getline(line,1024,fp)) {
--- 609,615 ----
#endif
return 0;
}
!
h->fprint = (ngram_t *)wg_malloc(maxngrams * sizeof(ngram_t));
while (cnt < maxngrams && wg_getline(line,1024,fp)) {
***************
*** 635,641 ****
h->size = cnt;
/*** Sort n-grams, for easy comparison later on ***/
! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
fclose(fp);
--- 636,642 ----
h->size = cnt;
/*** Sort n-grams, for easy comparison later on ***/
! qsort( h->fprint, h->size, sizeof(ngram_t), ngramcmp_str );
fclose(fp);
***************
*** 648,661 ****
{
uint4 i;
fp_t *h = (fp_t *)handle;
! ngram_t *tmp = wg_malloc( sizeof(ngram_t) * h->size );
!
/*** Make a temporary and sort it on rank ***/
memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
for (i=0; i<h->size; i++) {
! fprintf( fp, "%s\n", tmp[i].str );
}
wg_free( tmp );
}
--- 649,663 ----
{
uint4 i;
fp_t *h = (fp_t *)handle;
! ngram_t *tmp = (ngram_t*)wg_malloc( sizeof(ngram_t) * h->size );
!
/*** Make a temporary and sort it on rank ***/
memcpy( tmp, h->fprint, h->size * sizeof(ngram_t) );
! qsort( tmp, h->size, sizeof(ngram_t), ngramcmp_rank );
for (i=0; i<h->size; i++) {
! /*fprintf( fp, "%s\t%i\n", tmp[i].str, tmp[i].rank );*/
! fprintf( fp, "%s\n", tmp[i].str);
}
wg_free( tmp );
}
***************
*** 669,675 ****
uint4 i = 0;
uint4 j = 0;
sint4 sum = 0;
!
/*** Compare the profiles in mergesort fashion ***/
while ( i < c->size && j < u->size ) {
--- 671,677 ----
uint4 i = 0;
uint4 j = 0;
sint4 sum = 0;
!
/*** Compare the profiles in mergesort fashion ***/
while ( i < c->size && j < u->size ) {
***************
*** 705,711 ****
}
return sum;
!
}
--- 707,713 ----
}
return sum;
!
}
*** misc/libtextcat-2.2/src/fingerprint.h Mon May 19 14:16:31 2003
--- misc/build/libtextcat-2.2/src/fingerprint.h Thu Nov 1 12:53:31 2007
***************
*** 41,47 ****
--- 41,53 ----
extern int fp_Read( void *handle, const char *fname, int maxngrams );
extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
extern void fp_Show( void *handle );
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
extern const char *fp_Name( void *handle );
+ #ifdef __cplusplus
+ }
+ #endif
extern void fp_Print( void *handle, FILE *fp );
#endif
*** misc/libtextcat-2.2/src/libtextcat.map Thu Nov 1 13:07:33 2007
--- misc/build/libtextcat-2.2/src/libtextcat.map Thu Nov 1 12:53:31 2007
***************
*** 1 ****
! dummy
--- 1,40 ----
! {
! global:
! charcopy
! issame
! nextcharstart
! utfstrlen
! wgmempool_Done
! wgmempool_Init
! wgmempool_Reset
! wgmempool_alloc
! wgmempool_getline
! wgmempool_strdup
! special_textcat_Init
! textcat_Classify
! textcat_Done
! textcat_Init
! textcat_Version
! fp_Compare
! fp_Create
! fp_Debug
! fp_Done
! fp_Init
! fp_Name
! fp_Print
! fp_Read
! heapextract
! wg_calloc
! wg_free
! wg_getline
! wg_malloc
! wg_split
! wg_strdup
! wg_strgmov
! wg_trim
! wg_zalloc
! wgmem_error
!
! local:
! *;
! }
*** misc/libtextcat-2.2/src/makefile.mk Thu Nov 1 13:07:33 2007
--- misc/build/libtextcat-2.2/src/makefile.mk Thu Nov 1 12:53:31 2007
***************
*** 1 ****
! dummy
--- 1,92 ----
! #*************************************************************************
! #
! # $RCSfile: libtextcat-2.2.patch,v $
! #
! # $Revision: 1.5 $
! #
! # last change: $Author: hr $ $Date: 2007-11-01 12:09:41 $
! #
! #* The Contents of this file are made available subject to
! #* the terms of GNU Lesser General Public License Version 2.1.
! #*
! #*
! #* GNU Lesser General Public License Version 2.1
! #* =============================================
! #* Copyright 2005 by Sun Microsystems, Inc.
! #* 901 San Antonio Road, Palo Alto, CA 94303, USA
! #*
! #* This library is free software; you can redistribute it and/or
! #* modify it under the terms of the GNU Lesser General Public
! #* License version 2.1, as published by the Free Software Foundation.
! #*
! #* This library is distributed in the hope that it will be useful,
! #* but WITHOUT ANY WARRANTY; without even the implied warranty of
! #* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! #* Lesser General Public License for more details.
! #*
! #* You should have received a copy of the GNU Lesser General Public
! #* License along with this library; if not, write to the Free Software
! #* Foundation, Inc., 59 Temple Place, Suite 330, Boston,
! #* MA 02111-1307 USA
! #*
! #*************************************************************************
!
! PRJ = ..$/..$/..$/..$/..
!
! PRJNAME = libtextcat
! TARGET = libtextcat
! CFLAGSCALL=gsd
!
! USE_DEFFILE=TRUE
! EXTERNAL_WARNINGS_NOT_ERRORS := TRUE
!
! .INCLUDE : settings.mk
!
! # --- Files --------------------------------------------------------
!
! # !! not to be compiled because these belong to stand-alone programs: !!
! # $(SLO)$/createfp.obj\
! # $(SLO)$/testtextcat.obj
!
! SLOFILES= \
! $(SLO)$/common.obj\
! $(SLO)$/fingerprint.obj\
! $(SLO)$/textcat.obj\
! $(SLO)$/wg_mempool.obj\
! $(SLO)$/utf8misc.obj
!
! #SHL1TARGET= $(TARGET)$(UPD)$(DLLPOSTFIX)
! SHL1TARGET= $(TARGET)
!
! SHL1STDLIBS=
!
! # build DLL
! SHL1LIBS= $(SLB)$/$(TARGET).lib
! SHL1IMPLIB= i$(TARGET)
! SHL1DEPN= $(SHL1LIBS)
! SHL1DEF= $(MISC)$/$(SHL1TARGET).def
!
! # build DEF file
! DEF1NAME= $(SHL1TARGET)
! DEF1DEPN=$(MISC)$/$(SHL1TARGET).flt
!
! SHL1VERSIONMAP= libtextcat.map
!
! # --- Targets ------------------------------------------------------
!
! .INCLUDE : target.mk
!
! # copy hand supplied configuration file for Win32 builds to the file
! # which is included in the source code
! $(SLOFILES) : config.h
! config.h :
! $(GNUCOPY) $(OUT)$/misc$/build$/libtextcat-2.2$/src$/win32_config.h $(OUT)$/misc$/build$/libtextcat-2.2$/src$/config.h
!
!
! $(MISC)$/$(SHL1TARGET).flt: makefile.mk
! @echo ------------------------------
! @echo Making: $@
! @echo Imp>$@
! @echo __CT>>$@
! @echo _real>>$@
! @echo unnamed>>$@
*** misc/libtextcat-2.2/src/textcat.c Thu May 22 13:32:43 2003
--- misc/build/libtextcat-2.2/src/textcat.c Thu Nov 1 12:53:31 2007
***************
*** 4,26 ****
* Copyright (C) 2003 WiseGuys Internet B.V.
*
* THE BSD LICENSE
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
--- 4,26 ----
* Copyright (C) 2003 WiseGuys Internet B.V.
*
* THE BSD LICENSE
! *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
! *
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
! *
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
! *
* - Neither the name of the WiseGuys Internet B.V. nor the names of
* its contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
! *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
***************
*** 74,79 ****
--- 74,80 ----
typedef struct {
void **fprint;
+ char *fprint_disable;
uint4 size;
uint4 maxsize;
***************
*** 112,122 ****
fp_Done( h->fprint[i] );
}
wg_free( h->fprint );
wg_free( h );
}
! extern void *textcat_Init( const char *conffile )
{
textcat_t *h;
char line[1024];
--- 113,133 ----
fp_Done( h->fprint[i] );
}
wg_free( h->fprint );
+ wg_free( h->fprint_disable );
wg_free( h );
}
! /** Replaces older function */
! extern void *textcat_Init( const char *conffile ){
! return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
! }
!
! /**
! * Originally this function had only one parameter (conffile); it has been modified for use by OOo.
! * Basically, prefix is the directory path where the fingerprints are stored.
! */
! extern void *special_textcat_Init( const char *conffile, const char *prefix )
{
textcat_t *h;
char line[1024];
***************
*** 134,144 ****
h->size = 0;
h->maxsize = 16;
h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
while ( wg_getline( line, 1024, fp ) ) {
char *p;
char *segment[4];
! int res;
/*** Skip comments ***/
#ifdef HAVE_STRCHR
--- 145,157 ----
h->size = 0;
h->maxsize = 16;
h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
+ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/
while ( wg_getline( line, 1024, fp ) ) {
char *p;
char *segment[4];
! char finger_print_file_name[512];
! int res;
/*** Skip comments ***/
#ifdef HAVE_STRCHR
***************
*** 156,172 ****
/*** Ensure enough space ***/
if ( h->size == h->maxsize ) {
h->maxsize *= 2;
! h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
}
/*** Load data ***/
if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
goto ERROR;
}
! if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
textcat_Done(h);
goto ERROR;
! }
h->size++;
}
--- 169,191 ----
/*** Ensure enough space ***/
if ( h->size == h->maxsize ) {
h->maxsize *= 2;
! h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
! h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
}
/*** Load data ***/
if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
goto ERROR;
}
! /* Refuse over-long paths: the unchecked strcat() pair could overflow the 512-byte buffer. */
! if ( strlen(prefix) + strlen(segment[0]) >= sizeof(finger_print_file_name) ) { textcat_Done(h); goto ERROR; }
! strcat( strcpy(finger_print_file_name, prefix), segment[0] );
!
! if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
textcat_Done(h);
goto ERROR;
! }
! h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
h->size++;
}
***************
*** 203,213 ****
result = _TEXTCAT_RESULT_SHORT;
goto READY;
}
!
/*** Calculate the score for each category. ***/
for (i=0; i<h->size; i++) {
! int score = fp_Compare( h->fprint[i], unknown, threshold );
! candidates[i].score = score;
candidates[i].name = fp_Name( h->fprint[i] );
if ( score < minscore ) {
minscore = score;
--- 222,239 ----
result = _TEXTCAT_RESULT_SHORT;
goto READY;
}
!
/*** Calculate the score for each category. ***/
for (i=0; i<h->size; i++) {
! int score;
! if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/
! score = MAXSCORE;
! }
! else{
! score = fp_Compare( h->fprint[i], unknown, threshold );
! /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
! }
! candidates[i].score = score;
candidates[i].name = fp_Name( h->fprint[i] );
if ( score < minscore ) {
minscore = score;
***************
*** 218,224 ****
/*** Find the best performers ***/
for (i=0; i<h->size; i++) {
if ( candidates[i].score < threshold ) {
-
if ( ++cnt == MAXCANDIDATES+1 ) {
break;
}
--- 244,249 ----
***************
*** 235,241 ****
else {
char *p = result;
char *plimit = result+MAXOUTPUTSIZE;
!
qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
*p = '\0';
--- 260,266 ----
else {
char *p = result;
char *plimit = result+MAXOUTPUTSIZE;
!
qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
*p = '\0';
***************
*** 247,253 ****
}
READY:
fp_Done(unknown);
! #ifdef SHOULD_FREE
free(candidates);
#undef SHOULD_FREE
#endif
--- 272,278 ----
}
READY:
fp_Done(unknown);
! #ifdef SHOULD_FREE
free(candidates);
#undef SHOULD_FREE
#endif
*** misc/libtextcat-2.2/src/textcat.h Mon May 19 14:16:31 2003
--- misc/build/libtextcat-2.2/src/textcat.h Thu Nov 1 12:53:31 2007
***************
*** 40,45 ****
--- 40,48 ----
#define _TEXTCAT_RESULT_UNKOWN "UNKNOWN"
#define _TEXTCAT_RESULT_SHORT "SHORT"
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
/**
* textcat_Init() - Initialize the text classifier. The textfile
***************
*** 51,60 ****
--- 54,72 ----
* Returns: handle on success, NULL on error. (At the moment, the
* only way errors can occur, is when the library cannot read the
* conffile, or one of the fingerprint files listed in it.)
+ *
+ * Replaces the older function (and has exactly the same behaviour);
+ * see below
*/
extern void *textcat_Init( const char *conffile );
/**
+ * Originally this function had only one parameter (conffile); it has been modified since OOo must be able to load an alternative DB.
+ * Basically, prefix is the directory path where the fingerprints are stored.
+ */
+ extern void *special_textcat_Init( const char *conffile, const char *prefix );
+
+ /**
* textcat_Done() - Free up resources for handle
*/
extern void textcat_Done( void *handle );
***************
*** 77,80 ****
--- 89,96 ----
* textcat_Version() - Returns a string describing the version of this classifier.
*/
extern char *textcat_Version();
+
+ #ifdef __cplusplus
+ }
+ #endif
#endif
*** misc/libtextcat-2.2/src/utf8misc.c Thu Nov 1 13:07:33 2007
--- misc/build/libtextcat-2.2/src/utf8misc.c Thu Nov 1 12:53:31 2007
***************
*** 1 ****
! dummy
--- 1,132 ----
! /***************************************************************************
! * Copyright (C) 2006 by Jocelyn Merand *
! * joc.mer@gmail.com *
! * *
! * THE BSD LICENSE
! *
! * Redistribution and use in source and binary forms, with or without
! * modification, are permitted provided that the following conditions
! * are met:
! *
! * - Redistributions of source code must retain the above copyright
! * notice, this list of conditions and the following disclaimer.
! *
! * - Redistributions in binary form must reproduce the above copyright
! * notice, this list of conditions and the following disclaimer in the
! * documentation and/or other materials provided with the
! * distribution.
! *
! * - Neither the name of the WiseGuys Internet B.V. nor the names of
! * its contributors may be used to endorse or promote products derived
! * from this software without specific prior written permission.
! *
! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! ***************************************************************************/
!
! #ifndef _UTF8_MISC_H_
! #include "utf8misc.h"
! #endif
!
!
! /* Return the index just past the character starting at str[position]; never steps over the \0. */
! int nextcharstart(const char *str, int position){
! 	int pointer = position;
!
! 	if(str[pointer] & ESCAPE_MASK){ /*masked bits set: treat as a lead byte*/
! 		/* Count continuation bytes in an unsigned byte: shifting a negative
! 		 * (signed) char left is undefined behaviour in C. */
! 		unsigned char escape_char = (unsigned char)((str[pointer] & WEIGHT_MASK) << 1);
! 		/*one step per remaining 1 bit of the length prefix; stop at the \0*/
! 		while((escape_char & ESCAPE_MASK) && str[pointer]){
! 			escape_char = (unsigned char)(escape_char << 1);
! 			++pointer;
! 		}
! 	}
! 	if(str[pointer]){ /*step over the last byte unless it is the \0 terminator*/
! 		++pointer;
! 	}
! 	return pointer;
! }
!
!
! /* Copy the character that starts at str into dest and return the number of
!  * bytes copied (0 if str is empty). dest is NOT \0-terminated by this function. */
! int charcopy(const char *str, char *dest){
! 	int pointer = 0;
!
! 	if(str[pointer] & ESCAPE_MASK){ /*masked bits set: treat as a lead byte*/
! 		/* Count continuation bytes in an unsigned byte: shifting a negative
! 		 * (signed) char left is undefined behaviour in C. */
! 		unsigned char escape_char = (unsigned char)((str[pointer] & WEIGHT_MASK) << 1);
! 		/*one byte per remaining 1 bit of the length prefix; stop at the \0*/
! 		while((escape_char & ESCAPE_MASK) && str[pointer]){
! 			dest[pointer] = str[pointer];
! 			escape_char = (unsigned char)(escape_char << 1);
! 			++pointer;
! 		}
! 	}
! 	if(str[pointer]){ /*last (or only) byte of the character*/
! 		dest[pointer] = str[pointer];
! 		++pointer;
! 	}
! 	return pointer;
! }
!
!
! /* Return 1 when lex consists of exactly the first len characters of key,
!  * 0 otherwise. len counts characters, not bytes: with the _UTF8_ masks a
!  * multi-byte sequence counts as one character.
!  * NOTE: no check for an early \0 in key is made while counting, so key is
!  * expected to hold at least len characters. */
! int issame( char *lex, char *key, int len )
! {
! 	/*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
! 	int char_counter = 0;
! 	int pointer = 0;
!
! 	while(char_counter < len) {
! 		if(key[pointer] & ESCAPE_MASK){ /*masked bits set: treat as a lead byte*/
! 			/* Count continuation bytes in an unsigned byte: shifting a
! 			 * negative (signed) char left is undefined behaviour in C. */
! 			unsigned char escape_char = (unsigned char)((key[pointer] & WEIGHT_MASK) << 1);
! 			/*walk the sequence for as long as both strings agree*/
! 			while((escape_char & ESCAPE_MASK) && key[pointer] == lex[pointer]){
! 				escape_char = (unsigned char)(escape_char << 1);
! 				++pointer;
! 			}
! 		}
! 		++char_counter; /*and we are on a new utf8 character*/
! 		if ( key[pointer] != lex[pointer] ) {
! 			return 0;
! 		}
! 		++pointer;
! 	}
! 	if ( lex[pointer] != '\0' ) { /*lex is longer than len characters*/
! 		return 0;
! 	}
! 	/*printf(" YES\n");*/
! 	return 1;
! }
!
!
! /* Number of characters in str: one per nextcharstart() step up to the \0. */
! extern int utfstrlen(const char* str){
! 	int total = 0;
! 	int offset;
!
! 	for(offset = 0; str[offset]; offset = nextcharstart(str, offset)) {
! 		++total; /*each step lands on the start of a new character*/
! 	}
! 	return total;
! }
!
*** misc/libtextcat-2.2/src/utf8misc.h Thu Nov 1 13:07:33 2007
--- misc/build/libtextcat-2.2/src/utf8misc.h Thu Nov 1 12:53:31 2007
***************
*** 1 ****
! dummy
--- 1,88 ----
! /***************************************************************************
! * Copyright (C) 2006 by Jocelyn Merand *
! * joc.mer@gmail.com *
! * *
! * THE BSD LICENSE
! *
! * Redistribution and use in source and binary forms, with or without
! * modification, are permitted provided that the following conditions
! * are met:
! *
! * - Redistributions of source code must retain the above copyright
! * notice, this list of conditions and the following disclaimer.
! *
! * - Redistributions in binary form must reproduce the above copyright
! * notice, this list of conditions and the following disclaimer in the
! * documentation and/or other materials provided with the
! * distribution.
! *
! * - Neither the name of the WiseGuys Internet B.V. nor the names of
! * its contributors may be used to endorse or promote products derived
! * from this software without specific prior written permission.
! *
! * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
! * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
! * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
! * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
! * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
! * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
! * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
! * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
! * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
! * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! ***************************************************************************/
!
! #ifndef _UTF8_MISC_H_
! #define _UTF8_MISC_H_
!
! /**
! * These masks are used by the character processing functions.
! * They have been added to handle UTF-8 symbols, in particular escape characters.
! */
! #ifdef _UTF8_
! #define ESCAPE_MASK 0x80
! #define WEIGHT_MASK 0xF0
! #else
! #define ESCAPE_MASK 0xFF
! #define WEIGHT_MASK 0x00
! #endif
!
!
! /*
! * Used to jump to the start of the next character.
! * Of course, it is only useful when the encoding is UTF-8.
! * This function was added by Jocelyn Merand to use libtextcat in OOo.
! */
! int nextcharstart(const char *str, int position);
!
!
! /* Copies the character at str to dest.
! * Of course, it is only useful when the encoding is UTF-8 and the symbol is encoded with more than 1 byte.
! * Returns the number of bytes copied.
! * This function was added by Jocelyn Merand to use libtextcat in OOo.
! */
! int charcopy(const char *str, char *dest);
!
!
! /* Checks if n-gram lex is a prefix of key and of length len.
! * If _UTF8_ is defined, it uses escape characters and len is not really the byte length of lex;
! * in this case, len is the number of UTF-8 characters: strlen("€") == 3 but len == 1.
! */
! int issame( char *lex, char *key, int len );
!
!
! /* Counts the number of characters.
! * If _UTF8_ is defined, it uses escape characters and the result is not really the byte length of str;
! * in this case, the result is the number of UTF-8 characters: strlen("€") == 3 but utfstrlen("€") == 1.
! */
! #ifdef __cplusplus
! extern "C" {
! #endif
! extern int utfstrlen(const char* str);
! #ifdef __cplusplus
! }
! #endif
!
! #endif
!
*** misc/libtextcat-2.2/src/win32_config.h Thu Nov 1 13:07:33 2007
--- misc/build/libtextcat-2.2/src/win32_config.h Thu Nov 1 12:53:31 2007
***************
*** 1 ****
! dummy
--- 1,136 ----
! /* src/config.h. Generated by configure. */
! /* src/config.h.in. Generated from configure.ac by autoheader. */
!
! /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
! systems. This function is required for `alloca.c' support on those systems.
! */
! /* #undef CRAY_STACKSEG_END */
!
! /* Define to 1 if using `alloca.c'. */
! /* #undef C_ALLOCA */
!
! /* Define to 1 if you have `alloca', as a function or macro. */
! /* #undef HAVE_ALLOCA */
!
! /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
! */
! /* #undef HAVE_ALLOCA_H */
!
! /* Define to 1 if you have the <dlfcn.h> header file. */
! #define HAVE_DLFCN_H 1
!
! /* Define to 1 if you don't have `vprintf' but do have `_doprnt.' */
! /* #undef HAVE_DOPRNT */
!
! /* Define to 1 if you have the `gettimeofday' function. */
! /* #undef HAVE_GETTIMEOFDAY */
!
! /* Define to 1 if you have the <inttypes.h> header file. */
! /* #undef HAVE_INTTYPES_H */
!
! /* Define to 1 if you have the <limits.h> header file. */
! #define HAVE_LIMITS_H 1
!
! /* Define to 1 if your system has a GNU libc compatible `malloc' function, and
! to 0 otherwise. */
! #define HAVE_MALLOC 1
!
! /* Define to 1 if you have the <memory.h> header file. */
! #define HAVE_MEMORY_H 1
!
! /* Define to 1 if you have the `memset' function. */
! #define HAVE_MEMSET 1
!
! /* Define to 1 if your system has a GNU libc compatible `realloc' function,
! and to 0 otherwise. */
! #define HAVE_REALLOC 1
!
! /* Define to 1 if you have the <stdint.h> header file. */
! /* #undef HAVE_STDINT_H */
!
! /* Define to 1 if you have the <stdlib.h> header file. */
! #define HAVE_STDLIB_H 1
!
! /* Define to 1 if you have the `strchr' function. */
! #define HAVE_STRCHR 1
!
! /* Define to 1 if you have the `strdup' function. */
! #define HAVE_STRDUP 1
!
! /* Define to 1 if you have the <strings.h> header file. */
! /* #undef HAVE_STRINGS_H */
!
! /* Define to 1 if you have the <string.h> header file. */
! #define HAVE_STRING_H 1
!
! /* Define to 1 if you have the `strpbrk' function. */
! #define HAVE_STRPBRK 1
!
! /* Define to 1 if you have the <sys/stat.h> header file. */
! #define HAVE_SYS_STAT_H 1
!
! /* Define to 1 if you have the <sys/time.h> header file. */
! /* #undef HAVE_SYS_TIME_H */
!
! /* Define to 1 if you have the <sys/types.h> header file. */
! #define HAVE_SYS_TYPES_H 1
!
! /* Define to 1 if you have the <unistd.h> header file. */
! #define HAVE_UNISTD_H 1
!
! /* Define to 1 if you have the `vprintf' function. */
! #define HAVE_VPRINTF 1
!
! /* Name of package */
! #define PACKAGE "libtextcat"
!
! /* Define to the address where bug reports for this package should be sent. */
! #define PACKAGE_BUGREPORT ""
!
! /* Define to the full name of this package. */
! #define PACKAGE_NAME "libtextcat"
!
! /* Define to the full name and version of this package. */
! #define PACKAGE_STRING "libtextcat 2.2"
!
! /* Define to the one symbol short name of this package. */
! #define PACKAGE_TARNAME "libtextcat"
!
! /* Define to the version of this package. */
! #define PACKAGE_VERSION "2.2"
!
! /* If using the C implementation of alloca, define if you know the
! direction of stack growth for your system; otherwise it will be
! automatically deduced at run-time.
! STACK_DIRECTION > 0 => grows toward higher addresses
! STACK_DIRECTION < 0 => grows toward lower addresses
! STACK_DIRECTION = 0 => direction of growth unknown */
! /* #undef STACK_DIRECTION */
!
! /* Define to 1 if you have the ANSI C header files. */
! #define STDC_HEADERS 1
!
! /* Define to 1 if you can safely include both <sys/time.h> and <time.h>. */
! #define TIME_WITH_SYS_TIME 1
!
! /* Define to 1 if your <sys/time.h> declares `struct tm'. */
! /* #undef TM_IN_SYS_TIME */
!
! /* Version number of package */
! #define VERSION "2.2"
!
! /* Define to empty if `const' does not conform to ANSI C. */
! /* #undef const */
!
! /* Define as `__inline' if that's what the C compiler calls it, or to nothing
! if it is not supported. */
! /* #undef inline */
!
! /* Define to rpl_malloc if the replacement function should be used. */
! /* #undef malloc */
!
! /* Define to rpl_realloc if the replacement function should be used. */
! /* #undef realloc */
!
! /* Define to `unsigned' if <sys/types.h> does not define. */
! /* #undef size_t */