mythes12: move to proper external mythes module
This commit is contained in:
parent
14a6186576
commit
af15b9a545
10 changed files with 2 additions and 937 deletions
|
@ -1,9 +1,8 @@
|
||||||
lc lingucomponent : linguistic libtextcat svl HYPHEN:hyphen HUNSPELL:hunspell NULL
|
lc lingucomponent : linguistic libtextcat svl HYPHEN:hyphen HUNSPELL:hunspell MYTHES:mythes NULL
|
||||||
lc lingucomponent usr1 - all lc_mkout NULL
|
lc lingucomponent usr1 - all lc_mkout NULL
|
||||||
lc lingucomponent\inc nmake - all lc_inc NULL
|
lc lingucomponent\inc nmake - all lc_inc NULL
|
||||||
lc lingucomponent\source\lingutil nmake - all lc_util lc_inc NULL
|
lc lingucomponent\source\lingutil nmake - all lc_util lc_inc NULL
|
||||||
lc lingucomponent\source\thesaurus\mythes nmake - all lc_mythes lc_util lc_inc NULL
|
lc lingucomponent\source\thesaurus\libnth nmake - all lc_libnth lc_util lc_inc NULL
|
||||||
lc lingucomponent\source\thesaurus\libnth nmake - all lc_libnth lc_mythes lc_util lc_inc NULL
|
|
||||||
lc lingucomponent\source\spellcheck\spell nmake - all lc_libspell lc_util lc_inc NULL
|
lc lingucomponent\source\spellcheck\spell nmake - all lc_libspell lc_util lc_inc NULL
|
||||||
lc lingucomponent\source\hyphenator\altlinuxhyph\hyphen nmake - all lc_libhyphen lc_util lc_inc NULL
|
lc lingucomponent\source\hyphenator\altlinuxhyph\hyphen nmake - all lc_libhyphen lc_util lc_inc NULL
|
||||||
lc lingucomponent\source\languageguessing nmake - all lc_languageguessing lc_util lc_inc NULL
|
lc lingucomponent\source\languageguessing nmake - all lc_languageguessing lc_util lc_inc NULL
|
||||||
|
|
|
@ -1,39 +0,0 @@
|
||||||
|
|
||||||
# Standalone build of the MyThes static library and its example program.

CXX=g++

CXXFLAGS= -O2 -Wall -ansi -pedantic -I.

LDFLAGS=-L. -lmythes

LIBS=libmythes.a

AR=ar rc
RANLIB=ranlib

OBJS = mythes.o

# These targets do not name files; keep make from being confused by a
# stray file called "clean", "all", etc.
.PHONY: all clean distclean depend

all: example

libmythes.a: $(OBJS)
	$(AR) $@ $(OBJS)
	-@ ($(RANLIB) $@ || true) >/dev/null 2>&1

example: example.o $(LIBS)
	$(CXX) $(CXXFLAGS) -o $@ example.o $(LDFLAGS)

%.o: %.cxx
	$(CXX) $(CXXFLAGS) -c $<

# Remove everything this Makefile builds.  The archive is libmythes.a
# (see LIBS above).
clean:
	rm -f *.o *~ example libmythes.a

distclean: clean

depend:
	makedepend -- $(CXXFLAGS) -- *.[ch]xx

# DO NOT DELETE THIS LINE -- make depend depends on it.

mythes.o: mythes.hxx
example.o: mythes.hxx
|
|
||||||
|
|
|
@ -1,60 +0,0 @@
|
||||||
MyThes is a simple thesaurus that uses a structured
|
|
||||||
text data file and an index file with binary search
|
|
||||||
to lookup words and phrases and return information
|
|
||||||
on part of speech, meanings, and synonyms
|
|
||||||
|
|
||||||
MyThes was written to provide a thesaurus for the
|
|
||||||
OpenOffice.org project
|
|
||||||
|
|
||||||
The Main features of MyThes are:
|
|
||||||
|
|
||||||
1. written in C++ to make it easier to interface with
|
|
||||||
Pspell, OpenOffice, AbiWord, etc
|
|
||||||
|
|
||||||
2. it is stateless, uses no static variables and
|
|
||||||
should be completely reentrant with no ifdefs
|
|
||||||
|
|
||||||
3. it compiles with -ansi and -pedantic and -Wall
|
|
||||||
with no warnings so it should be quite portable
|
|
||||||
|
|
||||||
4. it uses a perl program to read the structured
|
|
||||||
text file and create the index needed for binary
|
|
||||||
searching (see dictionaries/en_US/th_gen_idx.pl)
|
|
||||||
|
|
||||||
5. it is very simple with *lots* of comments.
|
|
||||||
The main "smarts" are in the structure of the
|
|
||||||
text file that makes up the thesaurus data
|
|
||||||
|
|
||||||
6. It comes with a ready-to-go structured thesaurus
|
|
||||||
data file for en_US extracted from the WordNet-2.0 data.
|
|
||||||
(see dictionaries/en_US/th_en_US_new.dat)
|
|
||||||
|
|
||||||
Please see WordNet_license.txt and WordNet_readme.txt
|
|
||||||
for more information on the very useful project!
|
|
||||||
(found in dictionaries/en_US/)
|
|
||||||
|
|
||||||
7. The source code has a BSD license (and no advertising clause)
|
|
||||||
|
|
||||||
|
|
||||||
MyThes has the world's simplest Makefile and no
|
|
||||||
configure support. It does come with a simple example
|
|
||||||
program that looks up some words and returns meanings
|
|
||||||
and synonyms.
|
|
||||||
|
|
||||||
To build it simply do the following:
|
|
||||||
|
|
||||||
unzip mythes.zip
|
|
||||||
cd mythes
|
|
||||||
make
|
|
||||||
|
|
||||||
To run the example program:
|
|
||||||
./example th_en_US_new.idx th_en_US_new.dat checkme.lst
|
|
||||||
|
|
||||||
Please play around with it and let me know
|
|
||||||
what you think.
|
|
||||||
|
|
||||||
Thanks,
|
|
||||||
|
|
||||||
Kevin Hendricks
|
|
||||||
kevin.hendricks@sympatico.ca
|
|
||||||
|
|
|
@ -1,4 +0,0 @@
|
||||||
simple
|
|
||||||
complex
|
|
||||||
junk
|
|
||||||
jhjhjh
|
|
|
@ -1,131 +0,0 @@
|
||||||
Description of the Structure of the Data needed by MyThes
|
|
||||||
--------------------------------------------------------
|
|
||||||
|
|
||||||
MyThes is very simple. Almost all of the "smarts" are really
|
|
||||||
in the thesaurus data file itself.
|
|
||||||
|
|
||||||
The format for this file is as follows:
|
|
||||||
|
|
||||||
- no binary data
|
|
||||||
|
|
||||||
- line ending is a newline '\n' and not carriage return/linefeeds
|
|
||||||
|
|
||||||
- Line 1 is a character string that describes the encoding
|
|
||||||
used for the file. It is up to the calling program to convert
|
|
||||||
to and from this encoding if necessary.
|
|
||||||
|
|
||||||
ISO8859-1 is used by the th_en_US_new.dat file.
|
|
||||||
|
|
||||||
Strings currently recognized by OpenOffice.org are:
|
|
||||||
|
|
||||||
UTF-8
|
|
||||||
ISO8859-1
|
|
||||||
ISO8859-2
|
|
||||||
ISO8859-3
|
|
||||||
ISO8859-4
|
|
||||||
ISO8859-5
|
|
||||||
ISO8859-6
|
|
||||||
ISO8859-7
|
|
||||||
ISO8859-8
|
|
||||||
ISO8859-9
|
|
||||||
ISO8859-10
|
|
||||||
KOI8-R
|
|
||||||
CP-1251
|
|
||||||
ISO8859-14
|
|
||||||
ISCII-DEVANAGARI
|
|
||||||
|
|
||||||
|
|
||||||
- All of the remaining lines of the file follow this structure
|
|
||||||
|
|
||||||
entry|num_mean
|
|
||||||
pos|syn1_mean|syn2|...
|
|
||||||
.
|
|
||||||
.
|
|
||||||
.
|
|
||||||
pos|mean_syn1|syn2|...
|
|
||||||
|
|
||||||
|
|
||||||
where:
|
|
||||||
|
|
||||||
entry - all lowercase version of the word or phrase being described
|
|
||||||
num_mean - number of meanings for this entry
|
|
||||||
|
|
||||||
There is one meaning per line and each meaning is comprised of
|
|
||||||
|
|
||||||
pos - part of speech or other meaning specific description
|
|
||||||
syn1_mean - synonym 1 also used to describe the meaning itself
|
|
||||||
syn2 - synonym 2 for that meaning etc.
|
|
||||||
|
|
||||||
|
|
||||||
To make this even clearer, here is the actual data for the
|
|
||||||
entry "simple".
|
|
||||||
|
|
||||||
simple|9
|
|
||||||
(adj)|simple |elemental|ultimate|oversimplified|simplistic|simplex|simplified|unanalyzable|
|
|
||||||
undecomposable|uncomplicated|unsophisticated|easy|plain|unsubdivided
|
|
||||||
(adj)|elementary|uncomplicated|unproblematic|easy
|
|
||||||
(adj)|bare|mere|plain
|
|
||||||
(adj)|childlike|wide-eyed|dewy-eyed|naive |naif
|
|
||||||
(adj)|dim-witted|half-witted|simple-minded|retarded
|
|
||||||
(adj)|simple |unsubdivided|unlobed|smooth
|
|
||||||
(adj)|plain
|
|
||||||
(noun)|herb|herbaceous plant
|
|
||||||
(noun)|simpleton|person|individual|someone|somebody|mortal|human|soul
|
|
||||||
|
|
||||||
|
|
||||||
It says that "simple" has 9 different meanings and each
|
|
||||||
meaning will have its part of speech and at least 1 synonym
|
|
||||||
with others, if present, following on the same line.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Once you have created your own structured text file you can use
|
|
||||||
the perl program "th_gen_idx.pl" which can be found in this
|
|
||||||
directory to create an index file that is used to seek into
|
|
||||||
your data file by the MyThes code.
|
|
||||||
|
|
||||||
The correct way to run the perl program is as follows:
|
|
||||||
|
|
||||||
cat th_en_US_new.dat | ./th_gen_idx.pl > th_en_US_new.idx
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Then if you head the resulting index file you should see the
|
|
||||||
following:
|
|
||||||
|
|
||||||
ISO8859-1
|
|
||||||
142689
|
|
||||||
'hood|10
|
|
||||||
's gravenhage|88
|
|
||||||
'tween|173
|
|
||||||
'tween decks|196
|
|
||||||
.22|231
|
|
||||||
.22 caliber|319
|
|
||||||
.22 calibre|365
|
|
||||||
.38 caliber|411
|
|
||||||
.38 calibre|457
|
|
||||||
.45 caliber|503
|
|
||||||
.45 calibre|549
|
|
||||||
0|595
|
|
||||||
1|666
|
|
||||||
1 chronicles|6283
|
|
||||||
1 esdras|6336
|
|
||||||
|
|
||||||
|
|
||||||
Line 1 is the same encoding string taken from the
|
|
||||||
structured thesaurus data file.
|
|
||||||
|
|
||||||
Line 2 is a count of the total number of entries
|
|
||||||
in your thesaurus.
|
|
||||||
|
|
||||||
All of the remaining lines are of the form
|
|
||||||
|
|
||||||
entry|byte_offset_into_data_file_where_entry_is_found
|
|
||||||
|
|
||||||
|
|
||||||
That's all there is to it.
|
|
||||||
|
|
||||||
|
|
||||||
Kevin
|
|
||||||
kevin.hendricks@sympatico.ca
|
|
||||||
|
|
|
@ -1,128 +0,0 @@
|
||||||
/*************************************************************************
|
|
||||||
*
|
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
||||||
*
|
|
||||||
* Copyright 2000, 2010 Oracle and/or its affiliates.
|
|
||||||
*
|
|
||||||
* OpenOffice.org - a multi-platform office productivity suite
|
|
||||||
*
|
|
||||||
* This file is part of OpenOffice.org.
|
|
||||||
*
|
|
||||||
* OpenOffice.org is free software: you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU Lesser General Public License version 3
|
|
||||||
* only, as published by the Free Software Foundation.
|
|
||||||
*
|
|
||||||
* OpenOffice.org is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU Lesser General Public License version 3 for more details
|
|
||||||
* (a copy is included in the LICENSE file that accompanied this code).
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU Lesser General Public License
|
|
||||||
* version 3 along with OpenOffice.org. If not, see
|
|
||||||
* <http://www.openoffice.org/license.html>
|
|
||||||
* for a copy of the LGPLv3 License.
|
|
||||||
*
|
|
||||||
************************************************************************/
|
|
||||||
|
|
||||||
|
|
||||||
// MARKER(update_precomp.py): autogen include statement, do not remove
|
|
||||||
#include "precompiled_lingucomponent.hxx"
|
|
||||||
#include <cstring>
|
|
||||||
#include <cstdlib>
|
|
||||||
#include <cstdio>
|
|
||||||
|
|
||||||
#include "mythes.hxx"
|
|
||||||
|
|
||||||
extern char * mystrdup(const char * s);
|
|
||||||
|
|
||||||
using namespace std;
|
|
||||||
|
|
||||||
int
|
|
||||||
main(int argc, char** argv)
|
|
||||||
{
|
|
||||||
|
|
||||||
char * af;
|
|
||||||
char * df;
|
|
||||||
char * wtc;
|
|
||||||
FILE* wtclst;
|
|
||||||
|
|
||||||
/* first parse the command line options */
|
|
||||||
/* arg1 - index file, arg2 thesaurus data file, arg3 - file of words to check */
|
|
||||||
|
|
||||||
if (argv[1]) {
|
|
||||||
af = mystrdup(argv[1]);
|
|
||||||
} else {
|
|
||||||
fprintf(stderr,"correct syntax is:\n");
|
|
||||||
fprintf(stderr,"example index_file thesaurus_file file_of_words_to_check\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
if (argv[2]) {
|
|
||||||
df = mystrdup(argv[2]);
|
|
||||||
} else {
|
|
||||||
fprintf(stderr,"correct syntax is:\n");
|
|
||||||
fprintf(stderr,"example index_file thesaurus_file file_of_words_to_check\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
if (argv[3]) {
|
|
||||||
wtc = mystrdup(argv[3]);
|
|
||||||
} else {
|
|
||||||
fprintf(stderr,"correct syntax is:\n");
|
|
||||||
fprintf(stderr,"example index_file thesaurus_file file_of_words_to_check\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* open the words to check list */
|
|
||||||
wtclst = fopen(wtc,"r");
|
|
||||||
if (!wtclst) {
|
|
||||||
fprintf(stderr,"Error - could not open file of words to check\n");
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// open a new thesaurus object
|
|
||||||
MyThes * pMT= new MyThes(af,df);
|
|
||||||
|
|
||||||
// get the encoding used for the thesaurus data
|
|
||||||
char * encoding = pMT->get_th_encoding();
|
|
||||||
fprintf(stdout,"Thesaurus uses encoding %s\n\n",encoding);
|
|
||||||
|
|
||||||
int k;
|
|
||||||
char buf[101];
|
|
||||||
mentry * pmean;
|
|
||||||
|
|
||||||
while(fgets(buf,100,wtclst)) {
|
|
||||||
k = strlen(buf);
|
|
||||||
*(buf + k - 1) = '\0';
|
|
||||||
int len = strlen(buf);
|
|
||||||
int count = pMT->Lookup(buf,len,&pmean);
|
|
||||||
// don't change value of pmean
|
|
||||||
// or count since needed for CleanUpAfterLookup routine
|
|
||||||
mentry* pm = pmean;
|
|
||||||
if (count) {
|
|
||||||
fprintf(stdout,"%s has %d meanings\n",buf,count);
|
|
||||||
for (int i=0; i < count; i++) {
|
|
||||||
fprintf(stdout," meaning %d: %s\n",i,pm->defn);
|
|
||||||
for (int j=0; j < pm->count; j++) {
|
|
||||||
fprintf(stdout," %s\n",pm->psyns[j]);
|
|
||||||
}
|
|
||||||
fprintf(stdout,"\n");
|
|
||||||
pm++;
|
|
||||||
}
|
|
||||||
fprintf(stdout,"\n\n");
|
|
||||||
// now clean up all allocated memory
|
|
||||||
pMT->CleanUpAfterLookup(&pmean,count);
|
|
||||||
} else {
|
|
||||||
fprintf(stdout,"\"%s\" is not in thesaurus!\n",buf);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
delete pMT;
|
|
||||||
fclose(wtclst);
|
|
||||||
free(wtc);
|
|
||||||
free(df);
|
|
||||||
free(af);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,34 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright 2003 Kevin B. Hendricks, Stratford, Ontario, Canada
|
|
||||||
* And Contributors. All rights reserved.
|
|
||||||
*
|
|
||||||
* Redistribution and use in source and binary forms, with or without
|
|
||||||
* modification, are permitted provided that the following conditions
|
|
||||||
* are met:
|
|
||||||
*
|
|
||||||
* 1. Redistributions of source code must retain the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer.
|
|
||||||
*
|
|
||||||
* 2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
* notice, this list of conditions and the following disclaimer in the
|
|
||||||
* documentation and/or other materials provided with the distribution.
|
|
||||||
*
|
|
||||||
* 3. All modifications to the source code must be clearly marked as
|
|
||||||
* such. Binary redistributions based on modified source code
|
|
||||||
* must be clearly marked as modified versions in the documentation
|
|
||||||
* and/or other materials provided with the distribution.
|
|
||||||
*
|
|
||||||
* THIS SOFTWARE IS PROVIDED BY KEVIN B. HENDRICKS AND CONTRIBUTORS
|
|
||||||
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
||||||
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
||||||
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
|
|
||||||
* KEVIN B. HENDRICKS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
||||||
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
||||||
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
||||||
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
||||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
||||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
||||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
||||||
* SUCH DAMAGE.
|
|
||||||
*
|
|
||||||
*/
|
|
|
@ -1,59 +0,0 @@
|
||||||
#*************************************************************************
|
|
||||||
#
|
|
||||||
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
||||||
#
|
|
||||||
# Copyright 2000, 2010 Oracle and/or its affiliates.
|
|
||||||
#
|
|
||||||
# OpenOffice.org - a multi-platform office productivity suite
|
|
||||||
#
|
|
||||||
# This file is part of OpenOffice.org.
|
|
||||||
#
|
|
||||||
# OpenOffice.org is free software: you can redistribute it and/or modify
|
|
||||||
# it under the terms of the GNU Lesser General Public License version 3
|
|
||||||
# only, as published by the Free Software Foundation.
|
|
||||||
#
|
|
||||||
# OpenOffice.org is distributed in the hope that it will be useful,
|
|
||||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
# GNU Lesser General Public License version 3 for more details
|
|
||||||
# (a copy is included in the LICENSE file that accompanied this code).
|
|
||||||
#
|
|
||||||
# You should have received a copy of the GNU Lesser General Public License
|
|
||||||
# version 3 along with OpenOffice.org. If not, see
|
|
||||||
# <http://www.openoffice.org/license.html>
|
|
||||||
# for a copy of the LGPLv3 License.
|
|
||||||
#
|
|
||||||
#*************************************************************************
|
|
||||||
|
|
||||||
PRJ = ..$/..$/..
|
|
||||||
|
|
||||||
PRJNAME = lingucomponent
|
|
||||||
TARGET = mythes
|
|
||||||
LIBTARGET=NO
|
|
||||||
|
|
||||||
#----- Settings ---------------------------------------------------------
|
|
||||||
|
|
||||||
.INCLUDE : settings.mk
|
|
||||||
|
|
||||||
# --- Files --------------------------------------------------------
|
|
||||||
|
|
||||||
.IF "$(SYSTEM_MYTHES)" == "YES"
|
|
||||||
@all:
|
|
||||||
@echo "Using system mythes..."
|
|
||||||
.ENDIF
|
|
||||||
|
|
||||||
all_target: ALLTAR
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
SLOFILES= \
|
|
||||||
$(SLO)$/mythes.obj
|
|
||||||
|
|
||||||
LIB1TARGET= $(SLB)$/lib$(TARGET).lib
|
|
||||||
LIB1ARCHIV= $(LB)/lib$(TARGET).a
|
|
||||||
LIB1OBJFILES= $(SLOFILES)
|
|
||||||
|
|
||||||
# --- Targets ------------------------------------------------------
|
|
||||||
|
|
||||||
.INCLUDE : target.mk
|
|
||||||
|
|
|
@ -1,403 +0,0 @@
|
||||||
/*************************************************************************
|
|
||||||
*
|
|
||||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
||||||
*
|
|
||||||
* Copyright 2000, 2010 Oracle and/or its affiliates.
|
|
||||||
*
|
|
||||||
* OpenOffice.org - a multi-platform office productivity suite
|
|
||||||
*
|
|
||||||
* This file is part of OpenOffice.org.
|
|
||||||
*
|
|
||||||
* OpenOffice.org is free software: you can redistribute it and/or modify
|
|
||||||
* it under the terms of the GNU Lesser General Public License version 3
|
|
||||||
* only, as published by the Free Software Foundation.
|
|
||||||
*
|
|
||||||
* OpenOffice.org is distributed in the hope that it will be useful,
|
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
* GNU Lesser General Public License version 3 for more details
|
|
||||||
* (a copy is included in the LICENSE file that accompanied this code).
|
|
||||||
*
|
|
||||||
* You should have received a copy of the GNU Lesser General Public License
|
|
||||||
* version 3 along with OpenOffice.org. If not, see
|
|
||||||
* <http://www.openoffice.org/license.html>
|
|
||||||
* for a copy of the LGPLv3 License.
|
|
||||||
*
|
|
||||||
************************************************************************/
|
|
||||||
|
|
||||||
|
|
||||||
// MARKER(update_precomp.py): autogen include statement, do not remove
|
|
||||||
#include "precompiled_lingucomponent.hxx"
|
|
||||||
#include "license.readme"
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <errno.h>
|
|
||||||
|
|
||||||
#include "mythes.hxx"
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Construct a thesaurus from an index file and its matching data file.
//
// idxpath - path of the index file
// datpath - path of the thesaurus data file
//
// If loading fails an error is written to stderr and the object is left
// empty (no entries, no encoding, no open data file); Lookup() then
// returns 0 for every word.
MyThes::MyThes(const char* idxpath, const char * datpath)
{
    nw = 0;
    encoding = NULL;
    list = NULL;
    offst = NULL;
    // thInitialize() can return before it ever assigns pdfile, and
    // thCleanup() tests pdfile before closing it, so it must start as NULL
    pdfile = NULL;

    if (thInitialize(idxpath, datpath) != 1) {
        fprintf(stderr,"Error - can't open %s or %s\n",idxpath, datpath);
        fflush(stderr);
        thCleanup();
        // did not initialize properly - throw exception?
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
// Destructor: delegates all teardown to thCleanup().
MyThes::~MyThes()
{
    this->thCleanup();
}
|
|
||||||
|
|
||||||
|
|
||||||
int MyThes::thInitialize(const char* idxpath, const char* datpath)
|
|
||||||
{
|
|
||||||
|
|
||||||
// open the index file
|
|
||||||
FILE * pifile = fopen(idxpath,"r");
|
|
||||||
if (!pifile) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// parse in encoding and index size */
|
|
||||||
char * wrd;
|
|
||||||
wrd = (char *)calloc(1, MAX_WD_LEN);
|
|
||||||
if (!wrd) {
|
|
||||||
fprintf(stderr,"Error - bad memory allocation\n");
|
|
||||||
fflush(stderr);
|
|
||||||
fclose(pifile);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
int len = readLine(pifile,wrd,MAX_WD_LEN);
|
|
||||||
encoding = mystrdup(wrd);
|
|
||||||
len = readLine(pifile,wrd,MAX_WD_LEN);
|
|
||||||
int idxsz = atoi(wrd);
|
|
||||||
|
|
||||||
|
|
||||||
// now allocate list, offst for the given size
|
|
||||||
list = (char**) calloc(idxsz,sizeof(char*));
|
|
||||||
offst = (unsigned int*) calloc(idxsz,sizeof(unsigned int));
|
|
||||||
|
|
||||||
if ( (!(list)) || (!(offst)) ) {
|
|
||||||
fprintf(stderr,"Error - bad memory allocation\n");
|
|
||||||
fflush(stderr);
|
|
||||||
fclose(pifile);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// now parse the remaining lines of the index
|
|
||||||
len = readLine(pifile,wrd,MAX_WD_LEN);
|
|
||||||
while (len > 0)
|
|
||||||
{
|
|
||||||
int np = mystr_indexOfChar(wrd,'|');
|
|
||||||
if (nw < idxsz) {
|
|
||||||
if (np >= 0) {
|
|
||||||
*(wrd+np) = '\0';
|
|
||||||
list[nw] = (char *)calloc(1,(np+1));
|
|
||||||
if (!list[nw]) {
|
|
||||||
fprintf(stderr,"Error - bad memory allocation\n");
|
|
||||||
fflush(stderr);
|
|
||||||
fclose(pifile);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
memcpy((list[nw]),wrd,np);
|
|
||||||
offst[nw] = atoi(wrd+np+1);
|
|
||||||
nw++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
len = readLine(pifile,wrd,MAX_WD_LEN);
|
|
||||||
}
|
|
||||||
|
|
||||||
free((void *)wrd);
|
|
||||||
fclose(pifile);
|
|
||||||
|
|
||||||
/* next open the data file */
|
|
||||||
pdfile = fopen(datpath,"r");
|
|
||||||
if (!pdfile) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void MyThes::thCleanup()
|
|
||||||
{
|
|
||||||
/* first close the data file */
|
|
||||||
if (pdfile) {
|
|
||||||
fclose(pdfile);
|
|
||||||
pdfile=NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (list)
|
|
||||||
{
|
|
||||||
/* now free up all the allocated strings on the list */
|
|
||||||
for (int i=0; i < nw; i++)
|
|
||||||
{
|
|
||||||
if (list[i]) {
|
|
||||||
free(list[i]);
|
|
||||||
list[i] = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
free((void*)list);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (encoding) free((void*)encoding);
|
|
||||||
if (offst) free((void*)offst);
|
|
||||||
|
|
||||||
encoding = NULL;
|
|
||||||
list = NULL;
|
|
||||||
offst = NULL;
|
|
||||||
nw = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// lookup text in index and count of meanings and a list of meaning entries
|
|
||||||
// with each entry having a synonym count and pointer to an
|
|
||||||
// array of char * (i.e the synonyms)
|
|
||||||
//
|
|
||||||
// note: calling routine should call CleanUpAfterLookup with the original
|
|
||||||
// meaning point and count to properly deallocate memory
|
|
||||||
|
|
||||||
int MyThes::Lookup(const char * pText, int len, mentry** pme)
|
|
||||||
{
|
|
||||||
|
|
||||||
*pme = NULL;
|
|
||||||
|
|
||||||
// handle the case of missing file or file related errors
|
|
||||||
if (! pdfile) return 0;
|
|
||||||
|
|
||||||
long offset = 0;
|
|
||||||
|
|
||||||
/* copy search word and make sure null terminated */
|
|
||||||
char * wrd = (char *) calloc(1,(len+1));
|
|
||||||
memcpy(wrd,pText,len);
|
|
||||||
|
|
||||||
/* find it in the list */
|
|
||||||
int idx = nw > 0 ? binsearch(wrd,list,nw) : -1;
|
|
||||||
free(wrd);
|
|
||||||
if (idx < 0) return 0;
|
|
||||||
|
|
||||||
// now seek to the offset
|
|
||||||
offset = (long) offst[idx];
|
|
||||||
int rc = fseek(pdfile,offset,SEEK_SET);
|
|
||||||
if (rc) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// grab the count of the number of meanings
|
|
||||||
// and allocate a list of meaning entries
|
|
||||||
char * buf = NULL;
|
|
||||||
buf = (char *) malloc( MAX_LN_LEN );
|
|
||||||
if (!buf) return 0;
|
|
||||||
readLine(pdfile, buf, (MAX_LN_LEN-1));
|
|
||||||
int np = mystr_indexOfChar(buf,'|');
|
|
||||||
if (np < 0) {
|
|
||||||
free(buf);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
int nmeanings = atoi(buf+np+1);
|
|
||||||
*pme = (mentry*) malloc( nmeanings * sizeof(mentry) );
|
|
||||||
if (!(*pme)) {
|
|
||||||
free(buf);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// now read in each meaning and parse it to get defn, count and synonym lists
|
|
||||||
mentry* pm = *(pme);
|
|
||||||
char dfn[MAX_WD_LEN];
|
|
||||||
|
|
||||||
for (int j = 0; j < nmeanings; j++) {
|
|
||||||
readLine(pdfile, buf, (MAX_LN_LEN-1));
|
|
||||||
|
|
||||||
pm->count = 0;
|
|
||||||
pm->psyns = NULL;
|
|
||||||
pm->defn = NULL;
|
|
||||||
|
|
||||||
// store away the part of speech for later use
|
|
||||||
char * p = buf;
|
|
||||||
char * pos = NULL;
|
|
||||||
np = mystr_indexOfChar(p,'|');
|
|
||||||
if (np >= 0) {
|
|
||||||
*(buf+np) = '\0';
|
|
||||||
pos = mystrdup(p);
|
|
||||||
p = p + np + 1;
|
|
||||||
} else {
|
|
||||||
pos = mystrdup("");
|
|
||||||
}
|
|
||||||
|
|
||||||
// count the number of fields in the remaining line
|
|
||||||
int nf = 1;
|
|
||||||
char * d = p;
|
|
||||||
np = mystr_indexOfChar(d,'|');
|
|
||||||
while ( np >= 0 ) {
|
|
||||||
nf++;
|
|
||||||
d = d + np + 1;
|
|
||||||
np = mystr_indexOfChar(d,'|');
|
|
||||||
}
|
|
||||||
pm->count = nf;
|
|
||||||
pm->psyns = (char **) malloc(nf*sizeof(char*));
|
|
||||||
|
|
||||||
// fill in the synonym list
|
|
||||||
d = p;
|
|
||||||
for (int jj = 0; jj < nf; jj++)
|
|
||||||
{
|
|
||||||
np = mystr_indexOfChar(d,'|');
|
|
||||||
if (np > 0)
|
|
||||||
{
|
|
||||||
*(d+np) = '\0';
|
|
||||||
pm->psyns[jj] = mystrdup(d);
|
|
||||||
d = d + np + 1;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
pm->psyns[jj] = mystrdup(d);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// add pos to first synonym to create the definition
|
|
||||||
int k = strlen(pos);
|
|
||||||
int m = strlen(pm->psyns[0]);
|
|
||||||
if ((k+m) < (MAX_WD_LEN - 1)) {
|
|
||||||
strncpy(dfn,pos,k);
|
|
||||||
*(dfn+k) = ' ';
|
|
||||||
strncpy((dfn+k+1),(pm->psyns[0]),m+1);
|
|
||||||
pm->defn = mystrdup(dfn);
|
|
||||||
} else {
|
|
||||||
pm->defn = mystrdup(pm->psyns[0]);
|
|
||||||
}
|
|
||||||
free(pos);
|
|
||||||
pm++;
|
|
||||||
|
|
||||||
}
|
|
||||||
free(buf);
|
|
||||||
|
|
||||||
return nmeanings;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Release everything a previous Lookup() call allocated.
//
// pme       - address of the meaning-entry pointer that Lookup() filled in;
//             it is reset to NULL once the entries have been freed
// nmeanings - the count that Lookup() returned
//
// Does nothing when nmeanings is 0 or there is no entry list.
void MyThes::CleanUpAfterLookup(mentry ** pme, int nmeanings)
{
    if (nmeanings == 0) return;
    if ((pme == NULL) || ((*pme) == NULL)) return;

    mentry * pm = *pme;

    for (int i = 0; i < nmeanings; i++) {
        // an entry whose synonym array was never allocated has nothing to walk
        if (pm->psyns) {
            for (int j = 0; j < pm->count; j++) {
                free(pm->psyns[j]);
                pm->psyns[j] = NULL;
            }
            free(pm->psyns);
        }
        pm->psyns = NULL;
        free(pm->defn);
        pm->defn = NULL;
        pm->count = 0;
        pm++;
    }

    free(*pme);
    *pme = NULL;
}
|
|
||||||
|
|
||||||
|
|
||||||
// read a line of text from a text file stripping
|
|
||||||
// off the line terminator and replacing it with
|
|
||||||
// a null string terminator.
|
|
||||||
// returns: -1 on error or the number of characters in
|
|
||||||
// in the returning string
|
|
||||||
|
|
||||||
// A maximum of nc characters will be returned
|
|
||||||
|
|
||||||
int MyThes::readLine(FILE * pf, char * buf, int nc)
|
|
||||||
{
|
|
||||||
|
|
||||||
if (fgets(buf,nc,pf)) {
|
|
||||||
mychomp(buf);
|
|
||||||
return strlen(buf);
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// performs a binary search on null terminated character
|
|
||||||
// strings
|
|
||||||
//
|
|
||||||
// returns: -1 on not found
|
|
||||||
// index of wrd in the list[]
|
|
||||||
|
|
||||||
int MyThes::binsearch(char * sw, char* _list[], int nlst)
|
|
||||||
{
|
|
||||||
int lp, up, mp, j, indx;
|
|
||||||
lp = 0;
|
|
||||||
up = nlst-1;
|
|
||||||
indx = -1;
|
|
||||||
if (strcmp(sw,_list[lp]) < 0) return -1;
|
|
||||||
if (strcmp(sw,_list[up]) > 0) return -1;
|
|
||||||
while (indx < 0 ) {
|
|
||||||
mp = (int)((lp+up) >> 1);
|
|
||||||
j = strcmp(sw,_list[mp]);
|
|
||||||
if ( j > 0) {
|
|
||||||
lp = mp + 1;
|
|
||||||
} else if (j < 0 ) {
|
|
||||||
up = mp - 1;
|
|
||||||
} else {
|
|
||||||
indx = mp;
|
|
||||||
}
|
|
||||||
if (lp > up) return -1;
|
|
||||||
}
|
|
||||||
return indx;
|
|
||||||
}
|
|
||||||
|
|
||||||
char * MyThes::get_th_encoding()
|
|
||||||
{
|
|
||||||
if (encoding) return encoding;
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// string duplication routine
|
|
||||||
char * MyThes::mystrdup(const char * p)
|
|
||||||
{
|
|
||||||
int sl = strlen(p) + 1;
|
|
||||||
char * d = (char *)malloc(sl);
|
|
||||||
if (d) {
|
|
||||||
memcpy(d,p,sl);
|
|
||||||
return d;
|
|
||||||
}
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// remove cross-platform text line end characters
|
|
||||||
void MyThes::mychomp(char * s)
|
|
||||||
{
|
|
||||||
int k = strlen(s);
|
|
||||||
if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
|
|
||||||
if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// return index of char in string
|
|
||||||
int MyThes::mystr_indexOfChar(const char * d, int c)
|
|
||||||
{
|
|
||||||
char * p = strchr((char *)d,c);
|
|
||||||
if (p) return (int)(p-d);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
|
@ -1,76 +0,0 @@
|
||||||
#ifndef _MYTHES_HXX_
|
|
||||||
#define _MYTHES_HXX_
|
|
||||||
|
|
||||||
// some maximum sizes for buffers
|
|
||||||
#define MAX_WD_LEN 200
|
|
||||||
#define MAX_LN_LEN 16384
|
|
||||||
|
|
||||||
|
|
||||||
// One meaning of a thesaurus entry: its definition text plus the list of
// synonyms that belong to it.
struct mentry {
    char  *defn;     // definition text
    int    count;    // number of strings in psyns
    char **psyns;    // synonym strings
};
|
|
||||||
|
|
||||||
|
|
||||||
class MyThes
|
|
||||||
{
|
|
||||||
|
|
||||||
int nw; /* number of entries in thesaurus */
|
|
||||||
char** list; /* stores word list */
|
|
||||||
unsigned int* offst; /* stores offset list */
|
|
||||||
char * encoding; /* stores text encoding; */
|
|
||||||
|
|
||||||
FILE *pdfile;
|
|
||||||
|
|
||||||
// disallow copy-constructor and assignment-operator for now
|
|
||||||
MyThes();
|
|
||||||
MyThes(const MyThes &);
|
|
||||||
MyThes & operator = (const MyThes &);
|
|
||||||
|
|
||||||
public:
|
|
||||||
MyThes(const char* idxpath, const char* datpath);
|
|
||||||
~MyThes();
|
|
||||||
|
|
||||||
// lookup text in index and return number of meanings
|
|
||||||
// each meaning entry has a defintion, synonym count and pointer
|
|
||||||
// when complete return the *original* meaning entry and count via
|
|
||||||
// CleanUpAfterLookup to properly handle memory deallocation
|
|
||||||
|
|
||||||
int Lookup(const char * pText, int len, mentry** pme);
|
|
||||||
|
|
||||||
void CleanUpAfterLookup(mentry** pme, int nmean);
|
|
||||||
|
|
||||||
char* get_th_encoding();
|
|
||||||
|
|
||||||
private:
|
|
||||||
// Open index and dat files and load list array
|
|
||||||
int thInitialize (const char* indxpath, const char* datpath);
|
|
||||||
|
|
||||||
// internal close and cleanup dat and idx files
|
|
||||||
void thCleanup ();
|
|
||||||
|
|
||||||
// read a text line (\n terminated) stripping off line terminator
|
|
||||||
int readLine(FILE * pf, char * buf, int nc);
|
|
||||||
|
|
||||||
// binary search on null terminated character strings
|
|
||||||
int binsearch(char * wrd, char* list[], int nlst);
|
|
||||||
|
|
||||||
// string duplication routine
|
|
||||||
char * mystrdup(const char * p);
|
|
||||||
|
|
||||||
// remove cross-platform text line end characters
|
|
||||||
void mychomp(char * s);
|
|
||||||
|
|
||||||
// return index of char in string
|
|
||||||
int mystr_indexOfChar(const char * d, int c);
|
|
||||||
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue