move find-german-comments from build repo to bootstrap/bin
This commit is contained in:
parent
9b7439f870
commit
7be59837a9
7 changed files with 1718 additions and 0 deletions
162
bin/find-german-comments
Executable file
162
bin/find-german-comments
Executable file
|
@ -0,0 +1,162 @@
|
|||
#!/usr/bin/env python
|
||||
########################################################################
|
||||
#
|
||||
# Copyright (c) 2010 Jonas Jensen, Miklos Vajna
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person
|
||||
# obtaining a copy of this software and associated documentation
|
||||
# files (the "Software"), to deal in the Software without
|
||||
# restriction, including without limitation the rights to use,
|
||||
# copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following
|
||||
# conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be
|
||||
# included in all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
# OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
########################################################################
|
||||
|
||||
|
||||
import sys, re, subprocess, os, optparse, string
|
||||
|
||||
class Parser:
    """
    This parser extracts comments from source files, tries to guess
    their language and then prints out the german ones.

    Constructing a Parser parses the command line and immediately runs
    the scan over the requested root directory.
    """

    def __init__(self):
        # Characters trimmed from both ends of every comment fragment:
        # all ASCII punctuation plus space and newline.
        self.strip = string.punctuation + " \n"
        op = optparse.OptionParser()
        op.set_usage("%prog [options] <rootdir>\n\n" +
            "Searches for german comments in cxx/hxx source files inside a given root\n" +
            "directory recursively.")
        op.add_option("-v", "--verbose", action="store_true", dest="verbose", default=False,
            help="Turn on verbose mode (print progress to stderr)")
        self.options, args = op.parse_args()
        # The root directory is optional and defaults to the current one.
        if args:
            rootdir = args[0]
        else:
            rootdir = "."
        self.check_source_files(rootdir)

    def get_comments(self, filename):
        """
        Extracts the source code comments.

        Generator yielding (linenum, text) tuples. linenum is the 1-based
        line on which the comment starts (for a group of consecutive
        //-style comments: the first line of the group). text is the
        comment with markers and surrounding punctuation stripped; the
        individual lines are joined by a newline followed by one space.
        """
        if self.options.verbose:
            sys.stderr.write("processing file '%s'...\n" % filename)
        with open(filename) as sock:
            # add an empty line to trigger the output of collected oneliner
            # comment group
            lines = sock.readlines() + ["\n"]

        in_comment = False
        buf = []
        # line on which the comment currently collected in buf started
        linenum = 0
        for count, line in enumerate(lines, 1):
            if in_comment:
                if "*/" in line:
                    # end of multiline comment
                    in_comment = False
                    s = re.sub(r"\*+/.*", "", line.strip(self.strip))
                    if len(s):
                        buf.append(s)
                    yield (linenum, "\n ".join(buf))
                    buf = []
                else:
                    # in multiline comment: drop the leading " * " / "|"
                    # decoration before keeping the text
                    s = re.sub(r"^( |\|)*\*?", "", line).strip(self.strip)
                    if len(s):
                        buf.append(s)
                continue

            if "//" in line:
                # if we find a new //-style comment, then we
                # just append it to a previous one if: there is
                # only whitespace before the // mark that is
                # necessary to make comments longer, giving
                # more reliable output
                if not len(re.sub("(.*)//.*", r"\1", line).strip(self.strip)):
                    s = re.sub(".*// ?", "", line).strip(self.strip)
                    if len(s):
                        if not buf:
                            linenum = count
                        buf.append(s)
                else:
                    # otherwise it's an independent //-style comment
                    # following code: emit the group collected so far (if
                    # any) and start a new one with this comment
                    if buf:
                        yield (linenum, "\n ".join(buf))
                    buf = [re.sub(".*// ?", "", line.strip(self.strip))]
                    linenum = count
                continue

            if buf:
                # first normal line after a // block
                yield (linenum, "\n ".join(buf))
                buf = []
                # no "continue" here: this very line may itself open a
                # C-style comment, which must not be lost

            if "/*" in line:
                if "*/" in line:
                    # c-style oneliner comment
                    yield (count, re.sub(r".*/\*(.*)\*/.*", r"\1", line).strip(self.strip))
                else:
                    # start of a real multiline comment
                    in_comment = True
                    linenum = count
                    s = re.sub(r".*/\*+", "", line.strip(self.strip))
                    if len(s):
                        buf.append(s.strip(self.strip))

    def get_lang(self, s):
        """ the output is 'german' or 'english' or 'german or english'. when
        unsure, just don't warn, there are strings where you just can't
        determine the results reliably, like '#110680#'

        Feeds s to the text_cat program that is expected in a "text_cat"
        directory next to this script and returns its stripped output.
        Raises OSError if text_cat cannot be executed.
        """
        script_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
        text_cat_dir = os.path.join(script_dir, "text_cat")
        # Run the child in the script directory via cwd= instead of
        # os.chdir(), so our own working directory is never touched (and
        # cannot be left changed by an exception). Absolute paths make the
        # lookup independent of how cwd is applied.
        sock = subprocess.Popen(
            [os.path.join(text_cat_dir, "text_cat"), "-d", os.path.join(text_cat_dir, "LM")],
            cwd=script_dir,
            stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            # text-mode pipes, so the result compares equal to "german"
            # on Python 3 as well
            universal_newlines=True)
        # communicate() writes the input, reads all output and waits for
        # the child, which avoids both pipe deadlocks and zombie processes.
        lang = sock.communicate(s)[0]
        return lang.strip()

    def is_german(self, s):
        """
        determines if a string is german or not

        Returns True only when text_cat answers exactly "german"; an
        ambiguous answer such as "german or english" counts as not german.
        """
        # for short strings we can't do reliable recognition, so skip
        # short strings and less than 4 words
        s = s.replace('\n', ' ')
        if len(s) < 32 or len(s.split()) < 4:
            return False
        return "german" == self.get_lang(s)

    def check_file(self, path):
        """
        checks each comment in a file

        Prints one "path:line: comment" entry to stdout for every comment
        that is classified as german.
        """
        for linenum, s in self.get_comments(path):
            if self.is_german(s):
                # single parenthesised argument: valid and identical on
                # both Python 2 and Python 3
                print("%s:%s: %s" % (path, linenum, s))

    def check_source_files(self, dir):
        """
        checks each _tracked_ file in a directory recursively

        Only files known to git whose name ends in .cxx or .hxx are
        checked. Raises OSError if git cannot be executed.
        """
        # Pass the directory as a separate argument (no shell involved), so
        # names containing quotes or other shell metacharacters are safe;
        # "--" keeps a name starting with "-" from being read as an option.
        # -z gives NUL-separated, unquoted paths.
        sock = subprocess.Popen(["git", "ls-files", "-z", "--", dir],
            stdout=subprocess.PIPE, universal_newlines=True)
        listing = sock.communicate()[0]
        for path in listing.split("\0"):
            if path.endswith((".cxx", ".hxx")):
                self.check_file(path)
|
||||
|
||||
# Script entry point: constructing the Parser parses the command line and
# runs the whole scan.
try:
    Parser()
except KeyboardInterrupt:
    # Ctrl-C is an expected way to stop a long scan: say so on stdout and
    # exit with a success status instead of showing a traceback.
    sys.stdout.write("Interrupted!\n")
    sys.exit(0)
|
||||
|
||||
# vim:set shiftwidth=4 softtabstop=4 expandtab:
|
504
bin/text_cat/COPYING
Normal file
504
bin/text_cat/COPYING
Normal file
|
@ -0,0 +1,504 @@
|
|||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
Version 2.1, February 1999
|
||||
|
||||
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
[This is the first released version of the Lesser GPL. It also counts
|
||||
as the successor of the GNU Library Public License, version 2, hence
|
||||
the version number 2.1.]
|
||||
|
||||
Preamble
|
||||
|
||||
The licenses for most software are designed to take away your
|
||||
freedom to share and change it. By contrast, the GNU General Public
|
||||
Licenses are intended to guarantee your freedom to share and change
|
||||
free software--to make sure the software is free for all its users.
|
||||
|
||||
This license, the Lesser General Public License, applies to some
|
||||
specially designated software packages--typically libraries--of the
|
||||
Free Software Foundation and other authors who decide to use it. You
|
||||
can use it too, but we suggest you first think carefully about whether
|
||||
this license or the ordinary General Public License is the better
|
||||
strategy to use in any particular case, based on the explanations below.
|
||||
|
||||
When we speak of free software, we are referring to freedom of use,
|
||||
not price. Our General Public Licenses are designed to make sure that
|
||||
you have the freedom to distribute copies of free software (and charge
|
||||
for this service if you wish); that you receive source code or can get
|
||||
it if you want it; that you can change the software and use pieces of
|
||||
it in new free programs; and that you are informed that you can do
|
||||
these things.
|
||||
|
||||
To protect your rights, we need to make restrictions that forbid
|
||||
distributors to deny you these rights or to ask you to surrender these
|
||||
rights. These restrictions translate to certain responsibilities for
|
||||
you if you distribute copies of the library or if you modify it.
|
||||
|
||||
For example, if you distribute copies of the library, whether gratis
|
||||
or for a fee, you must give the recipients all the rights that we gave
|
||||
you. You must make sure that they, too, receive or can get the source
|
||||
code. If you link other code with the library, you must provide
|
||||
complete object files to the recipients, so that they can relink them
|
||||
with the library after making changes to the library and recompiling
|
||||
it. And you must show them these terms so they know their rights.
|
||||
|
||||
We protect your rights with a two-step method: (1) we copyright the
|
||||
library, and (2) we offer you this license, which gives you legal
|
||||
permission to copy, distribute and/or modify the library.
|
||||
|
||||
To protect each distributor, we want to make it very clear that
|
||||
there is no warranty for the free library. Also, if the library is
|
||||
modified by someone else and passed on, the recipients should know
|
||||
that what they have is not the original version, so that the original
|
||||
author's reputation will not be affected by problems that might be
|
||||
introduced by others.
|
||||
|
||||
Finally, software patents pose a constant threat to the existence of
|
||||
any free program. We wish to make sure that a company cannot
|
||||
effectively restrict the users of a free program by obtaining a
|
||||
restrictive license from a patent holder. Therefore, we insist that
|
||||
any patent license obtained for a version of the library must be
|
||||
consistent with the full freedom of use specified in this license.
|
||||
|
||||
Most GNU software, including some libraries, is covered by the
|
||||
ordinary GNU General Public License. This license, the GNU Lesser
|
||||
General Public License, applies to certain designated libraries, and
|
||||
is quite different from the ordinary General Public License. We use
|
||||
this license for certain libraries in order to permit linking those
|
||||
libraries into non-free programs.
|
||||
|
||||
When a program is linked with a library, whether statically or using
|
||||
a shared library, the combination of the two is legally speaking a
|
||||
combined work, a derivative of the original library. The ordinary
|
||||
General Public License therefore permits such linking only if the
|
||||
entire combination fits its criteria of freedom. The Lesser General
|
||||
Public License permits more lax criteria for linking other code with
|
||||
the library.
|
||||
|
||||
We call this license the "Lesser" General Public License because it
|
||||
does Less to protect the user's freedom than the ordinary General
|
||||
Public License. It also provides other free software developers Less
|
||||
of an advantage over competing non-free programs. These disadvantages
|
||||
are the reason we use the ordinary General Public License for many
|
||||
libraries. However, the Lesser license provides advantages in certain
|
||||
special circumstances.
|
||||
|
||||
For example, on rare occasions, there may be a special need to
|
||||
encourage the widest possible use of a certain library, so that it becomes
|
||||
a de-facto standard. To achieve this, non-free programs must be
|
||||
allowed to use the library. A more frequent case is that a free
|
||||
library does the same job as widely used non-free libraries. In this
|
||||
case, there is little to gain by limiting the free library to free
|
||||
software only, so we use the Lesser General Public License.
|
||||
|
||||
In other cases, permission to use a particular library in non-free
|
||||
programs enables a greater number of people to use a large body of
|
||||
free software. For example, permission to use the GNU C Library in
|
||||
non-free programs enables many more people to use the whole GNU
|
||||
operating system, as well as its variant, the GNU/Linux operating
|
||||
system.
|
||||
|
||||
Although the Lesser General Public License is Less protective of the
|
||||
users' freedom, it does ensure that the user of a program that is
|
||||
linked with the Library has the freedom and the wherewithal to run
|
||||
that program using a modified version of the Library.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow. Pay close attention to the difference between a
|
||||
"work based on the library" and a "work that uses the library". The
|
||||
former contains code derived from the library, whereas the latter must
|
||||
be combined with the library in order to run.
|
||||
|
||||
GNU LESSER GENERAL PUBLIC LICENSE
|
||||
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
||||
|
||||
0. This License Agreement applies to any software library or other
|
||||
program which contains a notice placed by the copyright holder or
|
||||
other authorized party saying it may be distributed under the terms of
|
||||
this Lesser General Public License (also called "this License").
|
||||
Each licensee is addressed as "you".
|
||||
|
||||
A "library" means a collection of software functions and/or data
|
||||
prepared so as to be conveniently linked with application programs
|
||||
(which use some of those functions and data) to form executables.
|
||||
|
||||
The "Library", below, refers to any such software library or work
|
||||
which has been distributed under these terms. A "work based on the
|
||||
Library" means either the Library or any derivative work under
|
||||
copyright law: that is to say, a work containing the Library or a
|
||||
portion of it, either verbatim or with modifications and/or translated
|
||||
straightforwardly into another language. (Hereinafter, translation is
|
||||
included without limitation in the term "modification".)
|
||||
|
||||
"Source code" for a work means the preferred form of the work for
|
||||
making modifications to it. For a library, complete source code means
|
||||
all the source code for all modules it contains, plus any associated
|
||||
interface definition files, plus the scripts used to control compilation
|
||||
and installation of the library.
|
||||
|
||||
Activities other than copying, distribution and modification are not
|
||||
covered by this License; they are outside its scope. The act of
|
||||
running a program using the Library is not restricted, and output from
|
||||
such a program is covered only if its contents constitute a work based
|
||||
on the Library (independent of the use of the Library in a tool for
|
||||
writing it). Whether that is true depends on what the Library does
|
||||
and what the program that uses the Library does.
|
||||
|
||||
1. You may copy and distribute verbatim copies of the Library's
|
||||
complete source code as you receive it, in any medium, provided that
|
||||
you conspicuously and appropriately publish on each copy an
|
||||
appropriate copyright notice and disclaimer of warranty; keep intact
|
||||
all the notices that refer to this License and to the absence of any
|
||||
warranty; and distribute a copy of this License along with the
|
||||
Library.
|
||||
|
||||
You may charge a fee for the physical act of transferring a copy,
|
||||
and you may at your option offer warranty protection in exchange for a
|
||||
fee.
|
||||
|
||||
2. You may modify your copy or copies of the Library or any portion
|
||||
of it, thus forming a work based on the Library, and copy and
|
||||
distribute such modifications or work under the terms of Section 1
|
||||
above, provided that you also meet all of these conditions:
|
||||
|
||||
a) The modified work must itself be a software library.
|
||||
|
||||
b) You must cause the files modified to carry prominent notices
|
||||
stating that you changed the files and the date of any change.
|
||||
|
||||
c) You must cause the whole of the work to be licensed at no
|
||||
charge to all third parties under the terms of this License.
|
||||
|
||||
d) If a facility in the modified Library refers to a function or a
|
||||
table of data to be supplied by an application program that uses
|
||||
the facility, other than as an argument passed when the facility
|
||||
is invoked, then you must make a good faith effort to ensure that,
|
||||
in the event an application does not supply such function or
|
||||
table, the facility still operates, and performs whatever part of
|
||||
its purpose remains meaningful.
|
||||
|
||||
(For example, a function in a library to compute square roots has
|
||||
a purpose that is entirely well-defined independent of the
|
||||
application. Therefore, Subsection 2d requires that any
|
||||
application-supplied function or table used by this function must
|
||||
be optional: if the application does not supply it, the square
|
||||
root function must still compute square roots.)
|
||||
|
||||
These requirements apply to the modified work as a whole. If
|
||||
identifiable sections of that work are not derived from the Library,
|
||||
and can be reasonably considered independent and separate works in
|
||||
themselves, then this License, and its terms, do not apply to those
|
||||
sections when you distribute them as separate works. But when you
|
||||
distribute the same sections as part of a whole which is a work based
|
||||
on the Library, the distribution of the whole must be on the terms of
|
||||
this License, whose permissions for other licensees extend to the
|
||||
entire whole, and thus to each and every part regardless of who wrote
|
||||
it.
|
||||
|
||||
Thus, it is not the intent of this section to claim rights or contest
|
||||
your rights to work written entirely by you; rather, the intent is to
|
||||
exercise the right to control the distribution of derivative or
|
||||
collective works based on the Library.
|
||||
|
||||
In addition, mere aggregation of another work not based on the Library
|
||||
with the Library (or with a work based on the Library) on a volume of
|
||||
a storage or distribution medium does not bring the other work under
|
||||
the scope of this License.
|
||||
|
||||
3. You may opt to apply the terms of the ordinary GNU General Public
|
||||
License instead of this License to a given copy of the Library. To do
|
||||
this, you must alter all the notices that refer to this License, so
|
||||
that they refer to the ordinary GNU General Public License, version 2,
|
||||
instead of to this License. (If a newer version than version 2 of the
|
||||
ordinary GNU General Public License has appeared, then you can specify
|
||||
that version instead if you wish.) Do not make any other change in
|
||||
these notices.
|
||||
|
||||
Once this change is made in a given copy, it is irreversible for
|
||||
that copy, so the ordinary GNU General Public License applies to all
|
||||
subsequent copies and derivative works made from that copy.
|
||||
|
||||
This option is useful when you wish to copy part of the code of
|
||||
the Library into a program that is not a library.
|
||||
|
||||
4. You may copy and distribute the Library (or a portion or
|
||||
derivative of it, under Section 2) in object code or executable form
|
||||
under the terms of Sections 1 and 2 above provided that you accompany
|
||||
it with the complete corresponding machine-readable source code, which
|
||||
must be distributed under the terms of Sections 1 and 2 above on a
|
||||
medium customarily used for software interchange.
|
||||
|
||||
If distribution of object code is made by offering access to copy
|
||||
from a designated place, then offering equivalent access to copy the
|
||||
source code from the same place satisfies the requirement to
|
||||
distribute the source code, even though third parties are not
|
||||
compelled to copy the source along with the object code.
|
||||
|
||||
5. A program that contains no derivative of any portion of the
|
||||
Library, but is designed to work with the Library by being compiled or
|
||||
linked with it, is called a "work that uses the Library". Such a
|
||||
work, in isolation, is not a derivative work of the Library, and
|
||||
therefore falls outside the scope of this License.
|
||||
|
||||
However, linking a "work that uses the Library" with the Library
|
||||
creates an executable that is a derivative of the Library (because it
|
||||
contains portions of the Library), rather than a "work that uses the
|
||||
library". The executable is therefore covered by this License.
|
||||
Section 6 states terms for distribution of such executables.
|
||||
|
||||
When a "work that uses the Library" uses material from a header file
|
||||
that is part of the Library, the object code for the work may be a
|
||||
derivative work of the Library even though the source code is not.
|
||||
Whether this is true is especially significant if the work can be
|
||||
linked without the Library, or if the work is itself a library. The
|
||||
threshold for this to be true is not precisely defined by law.
|
||||
|
||||
If such an object file uses only numerical parameters, data
|
||||
structure layouts and accessors, and small macros and small inline
|
||||
functions (ten lines or less in length), then the use of the object
|
||||
file is unrestricted, regardless of whether it is legally a derivative
|
||||
work. (Executables containing this object code plus portions of the
|
||||
Library will still fall under Section 6.)
|
||||
|
||||
Otherwise, if the work is a derivative of the Library, you may
|
||||
distribute the object code for the work under the terms of Section 6.
|
||||
Any executables containing that work also fall under Section 6,
|
||||
whether or not they are linked directly with the Library itself.
|
||||
|
||||
6. As an exception to the Sections above, you may also combine or
|
||||
link a "work that uses the Library" with the Library to produce a
|
||||
work containing portions of the Library, and distribute that work
|
||||
under terms of your choice, provided that the terms permit
|
||||
modification of the work for the customer's own use and reverse
|
||||
engineering for debugging such modifications.
|
||||
|
||||
You must give prominent notice with each copy of the work that the
|
||||
Library is used in it and that the Library and its use are covered by
|
||||
this License. You must supply a copy of this License. If the work
|
||||
during execution displays copyright notices, you must include the
|
||||
copyright notice for the Library among them, as well as a reference
|
||||
directing the user to the copy of this License. Also, you must do one
|
||||
of these things:
|
||||
|
||||
a) Accompany the work with the complete corresponding
|
||||
machine-readable source code for the Library including whatever
|
||||
changes were used in the work (which must be distributed under
|
||||
Sections 1 and 2 above); and, if the work is an executable linked
|
||||
with the Library, with the complete machine-readable "work that
|
||||
uses the Library", as object code and/or source code, so that the
|
||||
user can modify the Library and then relink to produce a modified
|
||||
executable containing the modified Library. (It is understood
|
||||
that the user who changes the contents of definitions files in the
|
||||
Library will not necessarily be able to recompile the application
|
||||
to use the modified definitions.)
|
||||
|
||||
b) Use a suitable shared library mechanism for linking with the
|
||||
Library. A suitable mechanism is one that (1) uses at run time a
|
||||
copy of the library already present on the user's computer system,
|
||||
rather than copying library functions into the executable, and (2)
|
||||
will operate properly with a modified version of the library, if
|
||||
the user installs one, as long as the modified version is
|
||||
interface-compatible with the version that the work was made with.
|
||||
|
||||
c) Accompany the work with a written offer, valid for at
|
||||
least three years, to give the same user the materials
|
||||
specified in Subsection 6a, above, for a charge no more
|
||||
than the cost of performing this distribution.
|
||||
|
||||
d) If distribution of the work is made by offering access to copy
|
||||
from a designated place, offer equivalent access to copy the above
|
||||
specified materials from the same place.
|
||||
|
||||
e) Verify that the user has already received a copy of these
|
||||
materials or that you have already sent this user a copy.
|
||||
|
||||
For an executable, the required form of the "work that uses the
|
||||
Library" must include any data and utility programs needed for
|
||||
reproducing the executable from it. However, as a special exception,
|
||||
the materials to be distributed need not include anything that is
|
||||
normally distributed (in either source or binary form) with the major
|
||||
components (compiler, kernel, and so on) of the operating system on
|
||||
which the executable runs, unless that component itself accompanies
|
||||
the executable.
|
||||
|
||||
It may happen that this requirement contradicts the license
|
||||
restrictions of other proprietary libraries that do not normally
|
||||
accompany the operating system. Such a contradiction means you cannot
|
||||
use both them and the Library together in an executable that you
|
||||
distribute.
|
||||
|
||||
7. You may place library facilities that are a work based on the
|
||||
Library side-by-side in a single library together with other library
|
||||
facilities not covered by this License, and distribute such a combined
|
||||
library, provided that the separate distribution of the work based on
|
||||
the Library and of the other library facilities is otherwise
|
||||
permitted, and provided that you do these two things:
|
||||
|
||||
a) Accompany the combined library with a copy of the same work
|
||||
based on the Library, uncombined with any other library
|
||||
facilities. This must be distributed under the terms of the
|
||||
Sections above.
|
||||
|
||||
b) Give prominent notice with the combined library of the fact
|
||||
that part of it is a work based on the Library, and explaining
|
||||
where to find the accompanying uncombined form of the same work.
|
||||
|
||||
8. You may not copy, modify, sublicense, link with, or distribute
|
||||
the Library except as expressly provided under this License. Any
|
||||
attempt otherwise to copy, modify, sublicense, link with, or
|
||||
distribute the Library is void, and will automatically terminate your
|
||||
rights under this License. However, parties who have received copies,
|
||||
or rights, from you under this License will not have their licenses
|
||||
terminated so long as such parties remain in full compliance.
|
||||
|
||||
9. You are not required to accept this License, since you have not
|
||||
signed it. However, nothing else grants you permission to modify or
|
||||
distribute the Library or its derivative works. These actions are
|
||||
prohibited by law if you do not accept this License. Therefore, by
|
||||
modifying or distributing the Library (or any work based on the
|
||||
Library), you indicate your acceptance of this License to do so, and
|
||||
all its terms and conditions for copying, distributing or modifying
|
||||
the Library or works based on it.
|
||||
|
||||
10. Each time you redistribute the Library (or any work based on the
|
||||
Library), the recipient automatically receives a license from the
|
||||
original licensor to copy, distribute, link with or modify the Library
|
||||
subject to these terms and conditions. You may not impose any further
|
||||
restrictions on the recipients' exercise of the rights granted herein.
|
||||
You are not responsible for enforcing compliance by third parties with
|
||||
this License.
|
||||
|
||||
11. If, as a consequence of a court judgment or allegation of patent
|
||||
infringement or for any other reason (not limited to patent issues),
|
||||
conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot
|
||||
distribute so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you
|
||||
may not distribute the Library at all. For example, if a patent
|
||||
license would not permit royalty-free redistribution of the Library by
|
||||
all those who receive copies directly or indirectly through you, then
|
||||
the only way you could satisfy both it and this License would be to
|
||||
refrain entirely from distribution of the Library.
|
||||
|
||||
If any portion of this section is held invalid or unenforceable under any
|
||||
particular circumstance, the balance of the section is intended to apply,
|
||||
and the section as a whole is intended to apply in other circumstances.
|
||||
|
||||
It is not the purpose of this section to induce you to infringe any
|
||||
patents or other property right claims or to contest validity of any
|
||||
such claims; this section has the sole purpose of protecting the
|
||||
integrity of the free software distribution system which is
|
||||
implemented by public license practices. Many people have made
|
||||
generous contributions to the wide range of software distributed
|
||||
through that system in reliance on consistent application of that
|
||||
system; it is up to the author/donor to decide if he or she is willing
|
||||
to distribute software through any other system and a licensee cannot
|
||||
impose that choice.
|
||||
|
||||
This section is intended to make thoroughly clear what is believed to
|
||||
be a consequence of the rest of this License.
|
||||
|
||||
12. If the distribution and/or use of the Library is restricted in
|
||||
certain countries either by patents or by copyrighted interfaces, the
|
||||
original copyright holder who places the Library under this License may add
|
||||
an explicit geographical distribution limitation excluding those countries,
|
||||
so that distribution is permitted only in or among countries not thus
|
||||
excluded. In such case, this License incorporates the limitation as if
|
||||
written in the body of this License.
|
||||
|
||||
13. The Free Software Foundation may publish revised and/or new
|
||||
versions of the Lesser General Public License from time to time.
|
||||
Such new versions will be similar in spirit to the present version,
|
||||
but may differ in detail to address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the Library
|
||||
specifies a version number of this License which applies to it and
|
||||
"any later version", you have the option of following the terms and
|
||||
conditions either of that version or of any later version published by
|
||||
the Free Software Foundation. If the Library does not specify a
|
||||
license version number, you may choose any version ever published by
|
||||
the Free Software Foundation.
|
||||
|
||||
14. If you wish to incorporate parts of the Library into other free
|
||||
programs whose distribution conditions are incompatible with these,
|
||||
write to the author to ask for permission. For software which is
|
||||
copyrighted by the Free Software Foundation, write to the Free
|
||||
Software Foundation; we sometimes make exceptions for this. Our
|
||||
decision will be guided by the two goals of preserving the free status
|
||||
of all derivatives of our free software and of promoting the sharing
|
||||
and reuse of software generally.
|
||||
|
||||
NO WARRANTY
|
||||
|
||||
15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
|
||||
WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
|
||||
EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
|
||||
OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
|
||||
KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
|
||||
LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
|
||||
THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
|
||||
WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
|
||||
AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
|
||||
FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
|
||||
CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
|
||||
LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
|
||||
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
|
||||
FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
|
||||
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
|
||||
DAMAGES.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Libraries
|
||||
|
||||
If you develop a new library, and you want it to be of the greatest
|
||||
possible use to the public, we recommend making it free software that
|
||||
everyone can redistribute and change. You can do so by permitting
|
||||
redistribution under these terms (or, alternatively, under the terms of the
|
||||
ordinary General Public License).
|
||||
|
||||
To apply these terms, attach the following notices to the library. It is
|
||||
safest to attach them to the start of each source file to most effectively
|
||||
convey the exclusion of warranty; and each file should have at least the
|
||||
"copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the library's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or your
|
||||
school, if any, to sign a "copyright disclaimer" for the library, if
|
||||
necessary. Here is a sample; alter the names:
|
||||
|
||||
Yoyodyne, Inc., hereby disclaims all copyright interest in the
|
||||
library `Frob' (a library for tweaking knobs) written by James Random Hacker.
|
||||
|
||||
<signature of Ty Coon>, 1 April 1990
|
||||
Ty Coon, President of Vice
|
||||
|
||||
That's all there is to it!
|
||||
|
||||
|
21
bin/text_cat/Copyright
Normal file
21
bin/text_cat/Copyright
Normal file
|
@ -0,0 +1,21 @@
|
|||
Copyright (c) 1994, 1995, 1996, 1997 by Gertjan van Noord.
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the
|
||||
Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston,
|
||||
MA 02110-1301 USA
|
||||
|
||||
cf. the file COPYING
|
||||
|
||||
|
400
bin/text_cat/LM/english.lm
Normal file
400
bin/text_cat/LM/english.lm
Normal file
|
@ -0,0 +1,400 @@
|
|||
_ 20326
|
||||
e 6617
|
||||
t 4843
|
||||
o 3834
|
||||
n 3653
|
||||
i 3602
|
||||
a 3433
|
||||
s 2945
|
||||
r 2921
|
||||
h 2507
|
||||
e_ 2000
|
||||
d 1816
|
||||
_t 1785
|
||||
c 1639
|
||||
l 1635
|
||||
th 1535
|
||||
he 1351
|
||||
_th 1333
|
||||
u 1309
|
||||
f 1253
|
||||
m 1175
|
||||
p 1151
|
||||
_a 1145
|
||||
the 1142
|
||||
_the 1060
|
||||
s_ 978
|
||||
er 968
|
||||
_o 967
|
||||
he_ 928
|
||||
d_ 888
|
||||
t_ 885
|
||||
the_ 844
|
||||
_the_ 843
|
||||
on 842
|
||||
in 817
|
||||
y 783
|
||||
n_ 773
|
||||
b 761
|
||||
re 754
|
||||
, 734
|
||||
,_ 732
|
||||
an 732
|
||||
g 728
|
||||
w 718
|
||||
_i 707
|
||||
en 676
|
||||
f_ 599
|
||||
y_ 595
|
||||
of 594
|
||||
_of 592
|
||||
es 589
|
||||
ti 587
|
||||
v 580
|
||||
_of_ 575
|
||||
of_ 575
|
||||
nd 568
|
||||
at 549
|
||||
r_ 540
|
||||
_w 534
|
||||
it 522
|
||||
ed 496
|
||||
_p 494
|
||||
nt 485
|
||||
_c 462
|
||||
o_ 457
|
||||
io 450
|
||||
_an 439
|
||||
te 432
|
||||
or 425
|
||||
_b 418
|
||||
nd_ 407
|
||||
to 406
|
||||
st 402
|
||||
is 401
|
||||
_s 396
|
||||
_in 389
|
||||
ion 385
|
||||
and 385
|
||||
de 384
|
||||
ve 382
|
||||
ha 375
|
||||
ar 366
|
||||
_m 361
|
||||
and_ 360
|
||||
_and 360
|
||||
_and_ 358
|
||||
se 353
|
||||
_to 347
|
||||
me 346
|
||||
to_ 344
|
||||
ed_ 339
|
||||
. 330
|
||||
be 329
|
||||
_f 329
|
||||
._ 329
|
||||
_to_ 320
|
||||
co 317
|
||||
ic 316
|
||||
ns 308
|
||||
al 307
|
||||
le 304
|
||||
ou 304
|
||||
ce 293
|
||||
ent 279
|
||||
l_ 278
|
||||
_co 277
|
||||
tio 275
|
||||
on_ 274
|
||||
_d 274
|
||||
tion 268
|
||||
ri 266
|
||||
_e 264
|
||||
ng 253
|
||||
hi 251
|
||||
er_ 249
|
||||
ea 246
|
||||
as 245
|
||||
_be 242
|
||||
pe 242
|
||||
h_ 234
|
||||
_r 232
|
||||
ec 227
|
||||
ch 223
|
||||
ro 222
|
||||
ct 220
|
||||
_h 219
|
||||
pr 217
|
||||
in_ 217
|
||||
ne 214
|
||||
ll 214
|
||||
rt 213
|
||||
s,_ 210
|
||||
s, 210
|
||||
li 209
|
||||
ra 208
|
||||
T 207
|
||||
wh 204
|
||||
a_ 203
|
||||
ac 201
|
||||
_wh 199
|
||||
_n 196
|
||||
ts 196
|
||||
di 196
|
||||
es_ 195
|
||||
si 194
|
||||
re_ 193
|
||||
at_ 192
|
||||
nc 192
|
||||
ie 190
|
||||
_a_ 188
|
||||
_in_ 185
|
||||
ing 184
|
||||
us 182
|
||||
_re 182
|
||||
g_ 179
|
||||
ng_ 178
|
||||
op 178
|
||||
con 177
|
||||
tha 175
|
||||
_l 174
|
||||
_tha 174
|
||||
ver 173
|
||||
ma 173
|
||||
ion_ 171
|
||||
_con 171
|
||||
ci 170
|
||||
ons 170
|
||||
_it 170
|
||||
po 169
|
||||
ere 168
|
||||
is_ 167
|
||||
ta 167
|
||||
la 166
|
||||
_pr 165
|
||||
fo 164
|
||||
ho 164
|
||||
ir 162
|
||||
ss 161
|
||||
men 160
|
||||
be_ 160
|
||||
un 159
|
||||
ty 159
|
||||
_be_ 158
|
||||
ing_ 157
|
||||
om 156
|
||||
ot 156
|
||||
hat 155
|
||||
ly 155
|
||||
_g 155
|
||||
em 153
|
||||
_T 151
|
||||
rs 150
|
||||
mo 148
|
||||
ch_ 148
|
||||
wi 147
|
||||
we 147
|
||||
ad 147
|
||||
ts_ 145
|
||||
res 143
|
||||
_wi 143
|
||||
I 143
|
||||
hat_ 142
|
||||
ei 141
|
||||
ly_ 141
|
||||
ni 140
|
||||
os 140
|
||||
ca 139
|
||||
ur 139
|
||||
A 138
|
||||
ut 138
|
||||
that 138
|
||||
_that 137
|
||||
ati 137
|
||||
_fo 137
|
||||
st_ 137
|
||||
il 136
|
||||
or_ 136
|
||||
for 136
|
||||
pa 136
|
||||
ul 135
|
||||
ate 135
|
||||
ter 134
|
||||
it_ 134
|
||||
nt_ 133
|
||||
that_ 132
|
||||
_ha 129
|
||||
al_ 128
|
||||
el 128
|
||||
as_ 127
|
||||
ll_ 127
|
||||
_ma 125
|
||||
no 124
|
||||
ment 124
|
||||
an_ 124
|
||||
tion_ 122
|
||||
su 122
|
||||
bl 122
|
||||
_de 122
|
||||
nce 120
|
||||
pl 120
|
||||
fe 119
|
||||
tr 118
|
||||
so 118
|
||||
int 115
|
||||
ov 114
|
||||
e, 114
|
||||
e,_ 114
|
||||
_u 113
|
||||
ent_ 113
|
||||
Th 113
|
||||
her 113
|
||||
j 112
|
||||
atio 112
|
||||
ation 112
|
||||
_Th 111
|
||||
le_ 110
|
||||
ai 110
|
||||
_it_ 110
|
||||
_on 110
|
||||
_for 109
|
||||
ect 109
|
||||
k 109
|
||||
hic 108
|
||||
est 108
|
||||
der 107
|
||||
tu 107
|
||||
na 106
|
||||
_by_ 106
|
||||
by_ 106
|
||||
E 106
|
||||
by 106
|
||||
_by 106
|
||||
ve_ 106
|
||||
_di 106
|
||||
en_ 104
|
||||
vi 104
|
||||
m_ 103
|
||||
_whi 102
|
||||
iv 102
|
||||
whi 102
|
||||
ns_ 102
|
||||
_A 101
|
||||
ich 100
|
||||
ge 100
|
||||
pro 99
|
||||
ess 99
|
||||
_whic 99
|
||||
ers 99
|
||||
hich 99
|
||||
ce_ 99
|
||||
which 99
|
||||
whic 99
|
||||
all 98
|
||||
ove 98
|
||||
_is 98
|
||||
ich_ 97
|
||||
ee 97
|
||||
hich_ 97
|
||||
n,_ 96
|
||||
n, 96
|
||||
im 95
|
||||
ir_ 94
|
||||
hei 94
|
||||
ions 94
|
||||
sti 94
|
||||
se_ 94
|
||||
per 93
|
||||
The 93
|
||||
_pa 93
|
||||
heir 93
|
||||
id 93
|
||||
eir 93
|
||||
eir_ 93
|
||||
ig 93
|
||||
heir_ 93
|
||||
_no 93
|
||||
ev 93
|
||||
era 92
|
||||
_int 92
|
||||
ted 91
|
||||
_The 91
|
||||
ies 91
|
||||
art 91
|
||||
thei 90
|
||||
_ar 90
|
||||
_thei 90
|
||||
their 90
|
||||
_pro 90
|
||||
et 89
|
||||
_pe 88
|
||||
_mo 88
|
||||
ther 88
|
||||
x 87
|
||||
gh 87
|
||||
S 87
|
||||
_is_ 87
|
||||
ol 87
|
||||
ty_ 87
|
||||
_I 86
|
||||
nde 86
|
||||
am 86
|
||||
rn 86
|
||||
nte 86
|
||||
mp 85
|
||||
_su 84
|
||||
_we 84
|
||||
par 84
|
||||
_v 84
|
||||
pu 82
|
||||
his 82
|
||||
ow 82
|
||||
mi 82
|
||||
go 81
|
||||
N 81
|
||||
ue 81
|
||||
ple 81
|
||||
ep 80
|
||||
ab 80
|
||||
;_ 80
|
||||
; 80
|
||||
ex 80
|
||||
ain 80
|
||||
over 80
|
||||
_un 79
|
||||
q 79
|
||||
qu 79
|
||||
pp 79
|
||||
ith 79
|
||||
ry 79
|
||||
_as 79
|
||||
ber 79
|
||||
ub 78
|
||||
av 78
|
||||
uc 78
|
||||
s._ 77
|
||||
s. 77
|
||||
enc 77
|
||||
are 77
|
||||
iti 77
|
||||
gr 76
|
||||
his_ 76
|
||||
ua 76
|
||||
part 76
|
||||
ff 75
|
||||
eve 75
|
||||
O 75
|
||||
rea 74
|
||||
ous 74
|
||||
ia 74
|
||||
The_ 73
|
||||
ag 73
|
||||
mb 73
|
||||
_go 73
|
||||
fa 72
|
||||
on,_ 72
|
||||
ern 72
|
||||
t,_ 72
|
||||
on, 72
|
||||
t, 72
|
||||
_me 71
|
400
bin/text_cat/LM/german.lm
Normal file
400
bin/text_cat/LM/german.lm
Normal file
|
@ -0,0 +1,400 @@
|
|||
_ 31586
|
||||
e 15008
|
||||
n 9058
|
||||
i 7299
|
||||
r 6830
|
||||
t 5662
|
||||
s 5348
|
||||
a 4618
|
||||
h 4176
|
||||
d 4011
|
||||
er 3415
|
||||
en 3412
|
||||
u 3341
|
||||
l 3266
|
||||
n_ 2848
|
||||
c 2636
|
||||
ch 2460
|
||||
g 2407
|
||||
o 2376
|
||||
e_ 2208
|
||||
r_ 2128
|
||||
m 2077
|
||||
_d 1948
|
||||
de 1831
|
||||
en_ 1786
|
||||
ei 1718
|
||||
er_ 1570
|
||||
in 1568
|
||||
te 1505
|
||||
ie 1505
|
||||
b 1458
|
||||
t_ 1425
|
||||
f 1306
|
||||
k 1176
|
||||
ge 1144
|
||||
s_ 1137
|
||||
un 1113
|
||||
, 1104
|
||||
,_ 1099
|
||||
w 1099
|
||||
z 1060
|
||||
nd 1039
|
||||
he 1004
|
||||
st 989
|
||||
_s 952
|
||||
_de 949
|
||||
. 909
|
||||
_e 906
|
||||
ne 906
|
||||
der 880
|
||||
._ 847
|
||||
be 841
|
||||
es 829
|
||||
ic 796
|
||||
_a 791
|
||||
ie_ 779
|
||||
is 769
|
||||
ich 763
|
||||
an 755
|
||||
re 749
|
||||
di 732
|
||||
ein 730
|
||||
se 730
|
||||
" 720
|
||||
ng 709
|
||||
_i 706
|
||||
sc 683
|
||||
sch 681
|
||||
it 673
|
||||
der_ 652
|
||||
h_ 651
|
||||
ch_ 642
|
||||
S 630
|
||||
le 609
|
||||
p 609
|
||||
ä 607
|
||||
ü 603
|
||||
au 603
|
||||
v 602
|
||||
che 599
|
||||
_w 596
|
||||
d_ 585
|
||||
die 576
|
||||
_di 572
|
||||
m_ 562
|
||||
_die 559
|
||||
el 548
|
||||
_S 540
|
||||
_der 529
|
||||
li 527
|
||||
_der_ 523
|
||||
si 515
|
||||
al 514
|
||||
ns 507
|
||||
on 501
|
||||
or 495
|
||||
ti 490
|
||||
ten 487
|
||||
ht 486
|
||||
die_ 485
|
||||
_die_ 483
|
||||
D 479
|
||||
rt 478
|
||||
nd_ 476
|
||||
_u 470
|
||||
nt 468
|
||||
A 466
|
||||
in_ 464
|
||||
den 461
|
||||
cht 447
|
||||
und 443
|
||||
me 440
|
||||
_z 429
|
||||
ung 426
|
||||
ll 423
|
||||
_un 421
|
||||
_ei 419
|
||||
_n 415
|
||||
hr 412
|
||||
ine 412
|
||||
_A 408
|
||||
_ein 405
|
||||
ar 404
|
||||
ra 403
|
||||
_v 400
|
||||
_g 400
|
||||
as 395
|
||||
zu 392
|
||||
et 389
|
||||
em 385
|
||||
_D 380
|
||||
eine 376
|
||||
gen 376
|
||||
g_ 376
|
||||
da 368
|
||||
we 366
|
||||
K 365
|
||||
lt 360
|
||||
B 354
|
||||
_" 353
|
||||
nde 349
|
||||
ni 347
|
||||
und_ 345
|
||||
E 345
|
||||
ur 345
|
||||
_m 342
|
||||
ri 341
|
||||
ha 340
|
||||
eh 339
|
||||
ten_ 338
|
||||
es_ 336
|
||||
_K 336
|
||||
_und 335
|
||||
ig 335
|
||||
_b 335
|
||||
hen 334
|
||||
_und_ 332
|
||||
_au 329
|
||||
_B 327
|
||||
_da 325
|
||||
_zu 324
|
||||
_in 322
|
||||
at 321
|
||||
us 318
|
||||
wi 307
|
||||
n, 305
|
||||
n,_ 304
|
||||
nn 304
|
||||
te_ 301
|
||||
eit 301
|
||||
_h 300
|
||||
ter 299
|
||||
M 298
|
||||
n. 295
|
||||
ß 294
|
||||
ng_ 289
|
||||
sche 289
|
||||
- 283
|
||||
rs 282
|
||||
den_ 282
|
||||
_si 280
|
||||
G 280
|
||||
im 278
|
||||
_ge 277
|
||||
chen 276
|
||||
rd 273
|
||||
_E 273
|
||||
n._ 270
|
||||
icht 270
|
||||
rn 268
|
||||
uf 267
|
||||
isch 264
|
||||
isc 264
|
||||
nen 263
|
||||
_in_ 262
|
||||
_M 260
|
||||
_er 257
|
||||
ich_ 255
|
||||
ac 253
|
||||
lic 252
|
||||
_G 252
|
||||
ber 252
|
||||
la 251
|
||||
vo 251
|
||||
eb 250
|
||||
ke 249
|
||||
F 248
|
||||
as_ 248
|
||||
hen_ 248
|
||||
ach 245
|
||||
en, 244
|
||||
ung_ 243
|
||||
lich 243
|
||||
ste 243
|
||||
en,_ 243
|
||||
_k 241
|
||||
ben 241
|
||||
_f 241
|
||||
en. 241
|
||||
_be 239
|
||||
it_ 239
|
||||
L 238
|
||||
_se 237
|
||||
mi 236
|
||||
ve 236
|
||||
na 236
|
||||
on_ 236
|
||||
P 235
|
||||
ss 234
|
||||
ist 234
|
||||
ö 234
|
||||
ht_ 233
|
||||
ru 233
|
||||
st_ 229
|
||||
_F 229
|
||||
ts 227
|
||||
ab 226
|
||||
W 226
|
||||
ol 225
|
||||
_eine 225
|
||||
hi 225
|
||||
so 224
|
||||
em_ 223
|
||||
"_ 223
|
||||
ren 222
|
||||
en._ 221
|
||||
chen_ 221
|
||||
R 221
|
||||
ta 221
|
||||
ere 220
|
||||
ische 219
|
||||
ers 218
|
||||
ert 217
|
||||
_P 217
|
||||
tr 217
|
||||
ed 215
|
||||
ze 215
|
||||
eg 215
|
||||
ens 215
|
||||
ür 213
|
||||
ah 212
|
||||
_vo 212
|
||||
ne_ 211
|
||||
cht_ 210
|
||||
uc 209
|
||||
_wi 209
|
||||
nge 208
|
||||
lle 208
|
||||
fe 207
|
||||
_L 207
|
||||
ver 206
|
||||
hl 205
|
||||
V 204
|
||||
ma 203
|
||||
wa 203
|
||||
auf 201
|
||||
H 198
|
||||
_W 195
|
||||
T 195
|
||||
nte 193
|
||||
uch 193
|
||||
l_ 192
|
||||
sei 192
|
||||
nen_ 190
|
||||
u_ 189
|
||||
_den 189
|
||||
_al 189
|
||||
_V 188
|
||||
t. 188
|
||||
lte 187
|
||||
ut 186
|
||||
ent 184
|
||||
sich 183
|
||||
sic 183
|
||||
il 183
|
||||
ier 182
|
||||
am 181
|
||||
gen_ 180
|
||||
sen 179
|
||||
fü 178
|
||||
um 178
|
||||
t._ 177
|
||||
f_ 174
|
||||
he_ 174
|
||||
ner 174
|
||||
nst 174
|
||||
ls 174
|
||||
_sei 173
|
||||
ro 173
|
||||
ir 173
|
||||
ebe 173
|
||||
mm 173
|
||||
ag 172
|
||||
ern 169
|
||||
t,_ 169
|
||||
t, 169
|
||||
eu 169
|
||||
ft 168
|
||||
icht_ 167
|
||||
hre 167
|
||||
Be 166
|
||||
nz 165
|
||||
nder 165
|
||||
_T 164
|
||||
_den_ 164
|
||||
iche 163
|
||||
tt 163
|
||||
zu_ 162
|
||||
and 162
|
||||
J 161
|
||||
rde 160
|
||||
rei 160
|
||||
_we 159
|
||||
_H 159
|
||||
ige 159
|
||||
_Be 158
|
||||
rte 157
|
||||
hei 156
|
||||
das 155
|
||||
aus 155
|
||||
che_ 154
|
||||
_das 154
|
||||
_zu_ 154
|
||||
tz 154
|
||||
_ni 153
|
||||
das_ 153
|
||||
_R 153
|
||||
N 153
|
||||
des 153
|
||||
_ve 153
|
||||
_J 152
|
||||
I 152
|
||||
_das_ 152
|
||||
men 151
|
||||
_so 151
|
||||
_ver 151
|
||||
_auf 150
|
||||
ine_ 150
|
||||
_ha 150
|
||||
rg 149
|
||||
ind 148
|
||||
eben 148
|
||||
kt 147
|
||||
mit 147
|
||||
_an 147
|
||||
her 146
|
||||
Ge 146
|
||||
Sc 145
|
||||
_sich 145
|
||||
U 145
|
||||
Sch 145
|
||||
_sic 145
|
||||
end 145
|
||||
Di 144
|
||||
abe 143
|
||||
ck 143
|
||||
sse 142
|
||||
ür_ 142
|
||||
ell 142
|
||||
ik 141
|
||||
o_ 141
|
||||
nic 141
|
||||
nich 141
|
||||
sa 141
|
||||
_fü 140
|
||||
hn 140
|
||||
zi 140
|
||||
no 140
|
||||
nicht 140
|
||||
im_ 139
|
||||
von_ 139
|
||||
von 139
|
||||
_nic 139
|
||||
_nich 139
|
||||
eine_ 139
|
||||
oc 138
|
||||
wei 138
|
||||
io 138
|
||||
schen 138
|
||||
gt 138
|
229
bin/text_cat/text_cat
Executable file
229
bin/text_cat/text_cat
Executable file
|
@ -0,0 +1,229 @@
|
|||
#!/usr/bin/perl -w
|
||||
# © Gertjan van Noord, 1997.
|
||||
# mailto:vannoord@let.rug.nl
|
||||
|
||||
use strict;
|
||||
use vars qw($opt_d $opt_f $opt_h $opt_i $opt_l $opt_n $opt_s $opt_t $opt_v $opt_u $opt_a);
|
||||
use Getopt::Std;
|
||||
use Benchmark;
|
||||
|
||||
# Characters that separate words.  The string is interpolated into a regex
# character class, so digits and whitespace never become part of an n-gram.
my $non_word_characters='0-9\s';

# OPTIONS
# getopts stores each switch in the matching $opt_X package variable.
getopts('a:d:f:hi:lnst:u:v');

# FindBin tells us where this script lives, so the language models shipped
# next to it (directory LM) can be found without passing -d.
use FindBin ();

# defaults: set $opt_X unless already given on the command line.
# Note that `||=' also replaces an explicit 0 or empty string by the default.
$opt_a ||= 10;
$opt_f ||= 0;
$opt_t ||= 400;
$opt_u ||= 1.05;

# The historical default for -d only exists on the original author's machine.
# It is still preferred when present (backward compatible); otherwise fall
# back to the LM directory that sits beside the script.
unless ($opt_d) {
    my $legacy_dir = '/users1/vannoord/Perl/TextCat/LM';
    $opt_d = -d $legacy_dir ? $legacy_dir : "$FindBin::Bin/LM";
}
|
||||
|
||||
# Print the usage message with a single print of a here-document (no
# filehandle is named, so it goes to the currently selected output handle).
# The here-document is unquoted, so $0 and the current values of $opt_a,
# $opt_d, $opt_f and $opt_u are interpolated: the "Default:" values shown are
# whatever those variables hold at the moment help() is called.
# Takes no arguments.
sub help {
|
||||
print <<HELP
|
||||
Text Categorization. Typically used to determine the language of a
|
||||
given document.
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
* print help message:
|
||||
|
||||
$0 -h
|
||||
|
||||
* for guessing:
|
||||
|
||||
$0 [-a Int] [-d Dir] [-f Int] [-i N] [-l] [-t Int] [-u Int] [-v]
|
||||
|
||||
-a the program returns the best-scoring language together
|
||||
with all languages which are $opt_u times worse (cf option -u).
|
||||
If the number of languages to be printed is larger than the value
|
||||
of this option (default: $opt_a) then no language is returned, but
|
||||
instead a message that the input is of an unknown language is
|
||||
printed. Default: $opt_a.
|
||||
-d indicates in which directory the language models are
|
||||
located (files ending in .lm). Currently only a single
|
||||
directory is supported. Default: $opt_d.
|
||||
-f Before sorting is performed the Ngrams which occur this number
|
||||
of times or less are removed. This can be used to speed up
|
||||
the program for longer inputs. For short inputs you should use
|
||||
-f 0.
|
||||
Default: $opt_f.
|
||||
-i N only read first N lines
|
||||
-l indicates that input is given as an argument on the command line,
|
||||
e.g. text_cat -l "this is english text"
|
||||
Cannot be used in combination with -n.
|
||||
-s Determine language of each line of input. Not very efficient yet,
|
||||
because language models are re-loaded after each line.
|
||||
-t indicates the topmost number of ngrams that should be used.
|
||||
If used in combination with -n this determines the size of the
|
||||
output. If used with categorization this determines
|
||||
the number of ngrams that are compared with each of the language
|
||||
models (but each of those models is used completely).
|
||||
-u determines how much worse result must be in order not to be
|
||||
mentioned as an alternative. Typical value: 1.05 or 1.1.
|
||||
Default: $opt_u.
|
||||
-v verbose. Continuation messages are written to standard error.
|
||||
|
||||
* for creating new language model, based on text read from standard input:
|
||||
|
||||
$0 -n [-v]
|
||||
|
||||
-v verbose. Continuation messages are written to standard error.
|
||||
|
||||
|
||||
HELP
|
||||
}
|
||||
|
||||
# MAIN: choose the mode of operation from the command-line switches.
if ($opt_h) {
    help();
    exit 0;
}

if ($opt_n) {
    # -n: build a new language model from the input and print it as
    # "<ngram>\t <count>" lines in the order returned by create_lm.
    my %ngram = ();
    my @result = create_lm(input(), \%ngram);
    print join("\n", map { "$_\t $ngram{$_}" } @result), "\n";
}
elsif ($opt_l) {
    # -l: the text to classify is the first command-line argument.  Without
    # it every language model would score 0 and an arbitrary language would
    # be reported, so refuse to guess (same policy as input() for an empty
    # file).
    unless (defined $ARGV[0] && length $ARGV[0]) {
        die "option -l needs the text to classify as a command-line argument\n";
    }
    classify($ARGV[0]);
}
elsif ($opt_s) {
    # -s: one verdict per input line.  The line is kept in a lexical because
    # the routines called below overwrite the global $_.
    while (defined(my $line = <>)) {
        chomp $line;
        classify($line);
    }
}
else {
    # default: classify the whole input as one text.
    classify(input());
}
|
||||
|
||||
# CLASSIFICATION
|
||||
# Guess the language of a text and print the verdict on standard output.
#
# Argument:
#   $input - the text to classify.
#
# Every readable file "<name>.lm" in directory $opt_d is a language model
# holding one n-gram per line; the position of an n-gram in the file is its
# rank.  The ranked n-grams of $input (see create_lm) are compared with each
# model: an n-gram found in the model adds |model rank - input position| to
# the model's score, an n-gram missing from the model adds $opt_t.  The
# lowest score wins; every model scoring below $opt_u times the best score
# is reported as an alternative.
#
# Prints the winning language names joined by " or ", or an "I don't know"
# message when more than $opt_a languages qualify.  With -v the score table
# is printed to standard output as well and timing goes to standard error.
# Dies when $opt_d cannot be opened, contains no readable model, or a model
# cannot be opened.
sub classify {
    my ($input) = @_;
    my %results = ();
    my $maxp = $opt_t;

    # open directory to find which languages are supported; only names that
    # really END in .lm count as language models.
    opendir(my $dir, $opt_d) or die "directory $opt_d: $!\n";
    my @languages = sort
                    grep { -r "$opt_d/$_.lm" }
                    map  { /^(.+)\.lm\z/s ? $1 : () }
                    readdir($dir);
    closedir $dir;
    @languages or die "sorry, can't read any language models from $opt_d\n" .
        "language models must reside in files with .lm ending\n";

    # ranked n-grams of the input.  The counts themselves are not needed
    # here, so no hash is passed to create_lm.
    my @unknown = create_lm($input);

    # load model and compute the score for each language.
    my $t1 = Benchmark->new;
    foreach my $language (@languages) {
        # n-gram => rank within this model (first line has rank 1)
        my %rank_of = ();
        my $rank = 1;
        open(my $lm, '<', "$opt_d/$language.lm")
            or die "cannot open $language.lm: $!\n";
        while (defined(my $line = <$lm>)) {
            chomp $line;
            # only use lines starting with appropriate character. Others are
            # ignored.  The n-gram is the leading run of word characters;
            # the count that follows it on the line is not used.
            if ($line =~ /^([^$non_word_characters]+)/o) {
                $rank_of{$1} = $rank++;
            }
        }
        close($lm);

        # compare the language model with the input n-gram list.
        # NOTE(review): model ranks start at 1 while input positions start
        # at 0; left untouched because changing it would alter every score.
        my $p = 0;
        for my $i (0 .. $#unknown) {
            my $model_rank = $rank_of{$unknown[$i]};
            if ($model_rank) {
                $p += abs($model_rank - $i);
            } else {
                $p += $maxp;
            }
        }

        $results{$language} = $p;
    }
    print STDERR "read language models done (" .
        timestr(timediff(Benchmark->new, $t1)) .
        ").\n" if $opt_v;

    # best (lowest) score first
    my @ranked = sort { $results{$a} <=> $results{$b} } keys %results;

    print join("\n", map { "$_\t $results{$_}" } @ranked), "\n" if $opt_v;
    my $best_score = $results{$ranked[0]};

    # the winner plus everything that is less than $opt_u times worse
    my @answers = (shift @ranked);
    while (@ranked && $results{$ranked[0]} < ($opt_u * $best_score)) {
        push @answers, shift @ranked;
    }
    if (@answers > $opt_a) {
        print "I don't know; " .
            "Perhaps this is a language I haven't seen before?\n";
    } else {
        print join(" or ", @answers), "\n";
    }
}
|
||||
|
||||
# Read the text to work on via the <> operator, i.e. from the files named
# in @ARGV or from standard input when there are none.
#
# Takes no arguments.  With -i N ($opt_i) reading stops after the first N
# lines; otherwise the input is slurped with a single read.
#
# Returns the text as one string.  Dies when nothing (or only a false value
# such as the empty string) was read, because there is nothing to classify.
sub input {
    my $read = "";
    if ($opt_i) {
        my $lines_read = 0;
        while (defined(my $line = <>)) {
            $read .= $line;
            last if ++$lines_read == $opt_i;
        }
    } else {
        local $/;       # slurp mode; restored when this block is left
        $read = <>;     # swallow input.
    }
    # same policy with and without -i: refuse to guess on empty input
    $read or die "determining the language of an empty file is hard...\n";
    return $read;
}
|
||||
|
||||
|
||||
# Count the character n-grams (length 1 to 5) of a text and rank them.
#
# Arguments:
#   $text  - the text to analyse.
#   $ngram - optional reference to a hash.  It is filled with
#            n-gram => count; entries whose count is $opt_f or less are
#            removed again before ranking.  Callers that only need the
#            ranking may omit it.
#
# The text is split into words on runs of $non_word_characters; every word
# is wrapped as "_word_" so that word boundaries show up in the n-grams.
#
# Returns the list of n-grams sorted by decreasing count, truncated to the
# $opt_t most frequent.  The order among equally frequent n-grams is not
# defined.  With -v timing information is written to standard error.
sub create_lm {
    my ($text, $ngram) = @_;
    $text = '' unless defined $text;
    $ngram ||= {};
    my $t1 = Benchmark->new;

    # add the n-grams found in each word to the hash
    foreach my $word (split(/[$non_word_characters]+/, $text)) {
        my $padded = "_" . $word . "_";
        my $len = length($padded);
        for my $start (0 .. $len - 1) {
            # never read past the end of the word: at most 5 characters,
            # fewer when less than 5 remain
            my $remaining = $len - $start;
            my $longest = $remaining < 5 ? $remaining : 5;
            for my $n (1 .. $longest) {
                $$ngram{substr($padded, $start, $n)}++;
            }
        }
    }
    ###print "@{[%$ngram]}";
    my $t2 = Benchmark->new;
    print STDERR "count_ngrams done (".
        timestr(timediff($t2, $t1)) .").\n" if $opt_v;

    # as suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
    # sorting by removing rare n-grams (count <= $opt_f).
    # however I have very bad results for short inputs, this way
    for my $key (keys %$ngram) {
        delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
    }

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if (@sorted > $opt_t);
    print STDERR "sorting done (" .
        timestr(timediff(Benchmark->new, $t2)) .
        ").\n" if $opt_v;
    return @sorted;
}
|
2
bin/text_cat/version
Normal file
2
bin/text_cat/version
Normal file
|
@ -0,0 +1,2 @@
|
|||
1.10
|
||||
|
Loading…
Reference in a new issue