summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichiel Buddingh <michiel@michielbuddingh.net>2014-05-03 16:48:32 (GMT)
committerMichiel Buddingh <michiel@michielbuddingh.net>2014-05-03 16:48:32 (GMT)
commit40d02b28ae949a188a90327f087fd50da2b796c2 (patch)
tree461a0bc1862988bfe22b66eadaa3365acbec5d9f
Initial commitHEADv0.1.0master
-rw-r--r--LICENSE202
-rw-r--r--cmeclax/20480.testbin0 -> 20480 bytes
-rw-r--r--cmeclax/COPYING674
-rw-r--r--cmeclax/cluster.c98
-rw-r--r--cmeclax/cluster.h18
-rw-r--r--cmeclax/cmeclax.go75
-rw-r--r--cmeclax/cmeclax_test.go39
-rw-r--r--cmeclax/main.c630
-rw-r--r--cmeclax/main.h9
-rw-r--r--cmeclax/nilsimsa.h52
-rw-r--r--cmeclax/rules.c124
-rw-r--r--cmeclax/rules.h22
-rw-r--r--nilsimsa.go351
-rw-r--r--nilsimsa_test.go335
-rw-r--r--randreader_test.go85
15 files changed, 2714 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/cmeclax/20480.test b/cmeclax/20480.test
new file mode 100644
index 0000000..a5ce50b
--- /dev/null
+++ b/cmeclax/20480.test
Binary files differ
diff --git a/cmeclax/COPYING b/cmeclax/COPYING
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/cmeclax/COPYING
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/cmeclax/cluster.c b/cmeclax/cluster.c
new file mode 100644
index 0000000..78e233d
--- /dev/null
+++ b/cmeclax/cluster.c
@@ -0,0 +1,98 @@
+/***************************************************************************
+ cluster.c - finds clusters
+ -------------------
+ begin : Mon May 14 2001
+ copyright : (C) 2001 by cmeclax
+ email : cmeclax@ixazon.dynip.com
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#include <math.h>
+#include "nilsimsa.h"
+#include "cluster.h"
+
+extern struct nsrecord *selkarbi,terkarbi,*rules,gunma;
+extern int nilselkarbi,nrules,comparethreshold,minclustersize,exhaustiveflag;
+extern int debugflag;
+
+int findclosepair(int n)
+/* Finds a close pair and returns one of them, unless there
+ are no pairs, in which case it returns -129. */
+{int i,nlogn,nsmax,ns,posmax,offset;
+ double slope;
+ if (n>1)
+ if (exhaustiveflag)
+ nlogn=(n*n/2);
+ else
+ nlogn=n*log(n);
+ else
+ return -129;
+ slope=((double)n/2+!exhaustiveflag)/nlogn;
+ offset=n?(unsigned)time(NULL)%n:0;
+ for (i=0,nsmax=posmax=-129;i<nlogn;i++)
+ {/*if (debugflag)
+ {printf("%4d %4d ",(i+offset)%n,(int)(i+offset+1+slope*i)%n);
+ if ((i&7)==7)
+ printf("\n");
+ }*/
+ if ((ns=nilsimsa(selkarbi+((i+offset)%n),
+ selkarbi+((int)(i+offset+1+slope*i)%n)))>nsmax)
+ {nsmax=ns;
+ posmax=(i+offset)%n;
+ }
+ }
+ return posmax;
+ }
+
+int smimau(const struct nsrecord *a,const struct nsrecord *b)
+{return b->nilsmi-a->nilsmi;
+ }
+
+void simsasort(int n)
+{int i;
+ for (i=0;i<n;i++)
+ selkarbi[i].nilsmi=nilsimsa(selkarbi+i,&terkarbi);
+ qsort(selkarbi,n,sizeof(struct nsrecord),(__compar_fn_t)smimau);
+ }
+
+int clustersize(int n)
+{int i,gappos=0,gapsize;
+ for (i=gapsize=0;i<n-1;i++)
+ if (selkarbi[i].nilsmi-selkarbi[i+1].nilsmi>gapsize)
+ {gapsize=selkarbi[i].nilsmi-selkarbi[i+1].nilsmi;
+ gappos=i+1;
+ }
+ return gappos;
+ }
+
+int findcluster(int n)
+/* Returns the size of a cluster if there is one, else 0.
+ The average of the cluster is in gunma. */
+{int csize,ccenter;
+ ccenter=findclosepair(n);
+ csize=0;
+ if (comparethreshold<-128)
+ comparethreshold=24;
+ if (ccenter>=0)
+ {terkarbi=selkarbi[ccenter];
+ simsasort(n);
+ aggregate(clustersize(n));
+ terkarbi=gunma;
+ simsasort(n);
+ csize=clustersize(n);
+ aggregate(csize);
+ gunma.nilsmi=(selkarbi[csize].nilsmi+selkarbi[csize-1].nilsmi+1)/2;
+ if (gunma.nilsmi<comparethreshold || csize<minclustersize) /* 24 is 3*sigma, and a set with <2 elements is not a cluster */
+ csize=0;
+ }
+ gunma.filepos=-1 /* make writerule put it at the end */;
+ return csize;
+ }
diff --git a/cmeclax/cluster.h b/cmeclax/cluster.h
new file mode 100644
index 0000000..6173466
--- /dev/null
+++ b/cmeclax/cluster.h
@@ -0,0 +1,18 @@
+/***************************************************************************
+ cluster.h - finds clusters
+ -------------------
+ begin : Mon May 14 2001
+ copyright : (C) 2001 by cmeclax
+ email : cmeclax@ixazon.dynip.com
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+int findcluster(int n);
diff --git a/cmeclax/cmeclax.go b/cmeclax/cmeclax.go
new file mode 100644
index 0000000..e6f7cf6
--- /dev/null
+++ b/cmeclax/cmeclax.go
@@ -0,0 +1,75 @@
+// This file is part of nilsimsa/cmeclax, a Go package.
+//
+// nilsimsa/cmeclax is free software: you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// nilsimsa/cmeclax is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with nilsimsa/cmeclax. If not, see
+// <http://www.gnu.org/licenses/>
+
+// Package cmeclax is a simple interface to the 0.2.4 C
+// implementation of nilsimsa by cmeclax.
+package cmeclax
+
+// #define VERSION "0.2.4"
+// #cgo LDFLAGS: -lpopt -lm
+// #include "main.h"
+// #include <stdio.h>
+// #include <stdlib.h>
+import "C"
+import "unsafe"
+
+func init() {
+ C.filltran()
+}
+
+// Klani is a nilsisma frequency table
+type Klani struct {
+ nsrecord C.struct_nsrecord
+}
+
+const (
+ codeLength = 32
+)
+
+var (
+ readPermission = &([]C.char{'r', 0})[0]
+)
+
+// Buckets returns a copy of the raw nilsimsa frequency table
+// of a Klani.
+func (k Klani) Buckets() (buckets []int) {
+ buckets = make([]int, 256)
+ for i := 0; i < 256; i++ {
+ buckets[i] = int(k.nsrecord.acc[i])
+ }
+ return
+}
+
+// Accumulate takes a byte slice and returns a nilsimsa frequency
+// table.
+func Accumulate(buf []byte) Klani {
+ var terkarbi Klani
+
+ file := C.fmemopen(unsafe.Pointer(&buf[0]), C.size_t(len(buf)), readPermission)
+ defer C.fclose(file)
+
+ C.accfile(file, &(terkarbi.nsrecord), C.int(0))
+ return terkarbi
+}
+
+// String returns a 64-character hexadecimal representation of a
+// nilsimsa code.
+func (k Klani) String() string {
+ C.makecode(&k.nsrecord)
+ charBuffer := &(make([]C.char, (2*codeLength)+1))[0]
+ C.codetostr(&k.nsrecord, charBuffer)
+ return C.GoString(charBuffer)
+}
diff --git a/cmeclax/cmeclax_test.go b/cmeclax/cmeclax_test.go
new file mode 100644
index 0000000..13cee50
--- /dev/null
+++ b/cmeclax/cmeclax_test.go
@@ -0,0 +1,39 @@
+// This file is part of nilsimsa/cmeclax, a Go package.
+//
+// nilsimsa/cmeclax is free software: you can redistribute it and/or
+// modify it under the terms of the GNU General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// nilsimsa/cmeclax is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with nilsimsa/cmeclax. If not, see
+// <http://www.gnu.org/licenses/>
+
+package cmeclax
+
+import (
+ "bytes"
+ "io"
+ "os"
+ "testing"
+)
+
+func TestAccumulate(t *testing.T) {
+ f, err := os.Open("20480.test")
+ if err != nil {
+ t.Fatalf("Could not open file 20480.test, %s", err)
+ }
+ var buf bytes.Buffer
+ io.Copy(&buf, f)
+ f.Close()
+ acc := Accumulate(buf.Bytes())
+
+ if acc.String() != "ff0391a13788fe959469ec70df488e5a269bf54fad4de9614e6ae30196b5110e" {
+ t.Errorf("Unexpected summary hash: %s\n", acc)
+ }
+}
diff --git a/cmeclax/main.c b/cmeclax/main.c
new file mode 100644
index 0000000..2d936f8
--- /dev/null
+++ b/cmeclax/main.c
@@ -0,0 +1,630 @@
+/***************************************************************************
+ main.c - nilsimsa
+ -------------------
+ begin : Fri Mar 16 01:41:08 EST 2001
+ copyright : (C) 2001-2002 by cmeclax
+ email : cmeclax@ixazon.dynip.com
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+/* Computes a nilsimsa code for a file fed to stdin. Files with
+ similar content often have the same nilsimsa code.
+ Options eventually to be implemented:
+ --long-code Instead of a 256-bit vector, write the entire histogram as a code.
+ This will allow more accurate detection of certain kinds of viruses and worms.
+ --code-list Read a list of codes from a file.
+ Anywhere a code is called for, a file may be given; if a code is
+ intended and such a file exists, prepend a few zeros to the code.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <dirent.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "nilsimsa.h"
+#include "cluster.h"
+#include "rules.h"
+
+unsigned int acc[256],threshold;
+unsigned char tran[256],popcount[256];
+struct nsrecord *selkarbi,terkarbi,*rules,gunma;
+int nilselkarbi,nrules;
+char *comparestr,*rulefile,*checkrulefile;
+int comparethreshold,minclustersize;
+int helpflag,versionflag,aggregateflag,noheaderflag,verboseflag,catflag,mboxflag;
+int exhaustiveflag,debugflag,recursiveflag;
+int popterror;
+int exitcode;
+
+void clear(struct nsrecord *a)
+{a->total=a->threshold=0;
+ memset(a->acc,0,sizeof(a->acc));
+ memset(a->code,0,sizeof(a->code));
+ }
+
+void filltran()
+{int i,j,k;
+ for (i=j=0;i<256;i++)
+ {j=(j*53+1)&255;
+ j+=j;
+ if (j>255)
+ j-=255;
+ for (k=0;k<i;k++)
+ if (j==tran[k])
+ {j=(j+1)&255;
+ k=0;
+ }
+ tran[i]=j;
+ }
+ }
+
+void dumptran()
+{int i;
+ for (i=0;i<256;i++)
+ {printf("%02x ",tran[i]);
+ if ((i&15)==15)
+ putchar('\n');
+ }
+ }
+
+void fillpopcount()
+{int i,j;
+ memset(popcount,0,256);
+ for (i=0;i<256;i++)
+ for (j=0;j<8;j++)
+ popcount[i]+=1&(i>>j);
+ }
+
+int defromulate(FILE *file)
+#define CANY 258
+/* apply this line to the character in any, instead of reading one */
+#define ANY 257
+/* read a character into any, or emit any */
+#define NONE 256
+/* don't read a character, or don't emit one */
+/* Reads a character from a mailbox file according to these rules:
+ Ignore all lines up to and including the first one that begins
+ with "From ".
+ Return -2 on encountering "\nFrom ". Ignore that and the rest
+ of the line.
+ Return -3 if the file does not contain a "From " line (usually
+ this means that the mailbox is empty).
+ On seeing "\n>From ", return "\nFrom ". Generally, if "\n>"
+ is followed by any number of greater-than signs and "From ",
+ drop the last greater-than sign.
+
+ State table:
+
+ 00 \n 0 01 F 0 03 r 0 06 o 0 10 m 0 15 bl 0 21 \n -2 01
+ * * * * * * *
+ * \n \n \n \n \n 0
+ 00 02 04 07 11 16 21
+ 0 0 0 0
+ F F F F
+ 05 08 12 17
+ 0 0 0
+ r r r
+ 09 13 18
+ 0 0
+ o o
+ 02,05,09,14,20 0 * 00 14 19
+ 0
+ m
+ 20
+
+ 01 > \n 22 F 0 23 r 0 24 o 0 25 m 0 26 bl F 27 0 r 28 0 o 29 0 m 30 0 bl 0
+ * * * * *
+ > > > > >
+ 02 04 07 11 16
+
+ 22 > > 22
+
+ * EOF EOF 31 F 0 32 r 0 33 o 0 34 m 0 35 bl 0 36 \n 0 01
+ * * * * * *
+ * * * * * *
+ 31 31 31 31 31 36
+ */
+{static const short statetable[][5][3]=
+ {{{'\n',NONE, 1},{EOF ,EOF ,31},{ANY ,ANY , 0},{0 ,0 , 0},{0 ,0 , 0}}, /* 0*/
+ {{'F' ,NONE, 3},{EOF ,EOF ,31},{'>' ,'\n',22},{'\n','\n',01},{ANY ,'\n',02}},
+ {{NONE,ANY ,38},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0}},
+ {{'r' ,NONE, 6},{EOF ,EOF ,31},{ANY ,'\n', 4},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'F' , 5},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,ANY ,38},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0}}, /* 5*/
+ {{'o' ,NONE,10},{EOF ,EOF ,31},{ANY ,'\n', 7},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'F' , 8},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'r' , 9},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,ANY ,38},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0}},
+ {{'m' ,NONE,15},{EOF ,EOF ,31},{ANY ,'\n',11},{0 ,0 , 0},{0 ,0 , 0}}, /*10*/
+ {{NONE,'F' ,12},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'r' ,13},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'o' ,14},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,ANY ,38},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0}},
+ {{' ' ,NONE,21},{EOF ,EOF ,31},{ANY ,'\n',16},{0 ,0 , 0},{0 ,0 , 0}}, /*15*/
+ {{NONE,'F' ,17},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'r' ,18},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'o' ,19},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'m' ,20},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,ANY ,38},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0},{NONE,ANY , 0}}, /*20*/
+ {{'\n',-2 ,37},{EOF ,EOF ,31},{ANY ,NONE,21},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'>' ,'>' ,22},{'F' ,NONE,23},{ANY,'>' , 2},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'r' ,NONE,24},{ANY,'>' , 4},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'o' ,NONE,25},{ANY,'>' , 7},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'m' ,NONE,26},{ANY,'>' ,11},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}}, /*25*/
+ {{' ' ,'F' ,27},{ANY,'>' ,16},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'r' ,28},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'o' ,29},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,'m' ,30},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{NONE,' ' , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0},{0 ,0 , 0}}, /*30*/
+ {{'F' ,NONE,32},{EOF ,-3 ,31},{ANY ,NONE,31},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'r' ,NONE,33},{EOF ,-3 ,31},{ANY ,NONE,31},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'o' ,NONE,34},{EOF ,-3 ,31},{ANY ,NONE,31},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'m' ,NONE,35},{EOF ,-3 ,31},{ANY ,NONE,31},{0 ,0 , 0},{0 ,0 , 0}},
+ {{' ' ,NONE,36},{EOF ,-3 ,31},{ANY ,NONE,31},{0 ,0 , 0},{0 ,0 , 0}}, /*35*/
+ {{'\n',NONE,37},{EOF ,-3 ,31},{ANY ,NONE,36},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'F' ,NONE,40},{EOF ,EOF ,31},{'>' ,NONE,22},{ANY ,NONE, 2},{ANY ,NONE, 2}},
+ {{CANY,NONE, 0},{'\n',NONE,39},{ANY ,NONE, 0},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'F' ,NONE, 3},{EOF ,EOF ,31},{'>' ,NONE,22},{'\n',NONE,01},{ANY ,NONE,02}},
+ {{'r' ,NONE,41},{EOF ,EOF ,31},{ANY ,NONE, 4},{0 ,0 , 0},{0 ,0 , 0}}, /*40*/
+ {{'o' ,NONE,42},{EOF ,EOF ,31},{ANY ,NONE, 7},{0 ,0 , 0},{0 ,0 , 0}},
+ {{'m' ,NONE,43},{EOF ,EOF ,31},{ANY ,NONE,11},{0 ,0 , 0},{0 ,0 , 0}},
+ {{' ' ,NONE,21},{EOF ,EOF ,31},{ANY ,NONE,16},{0 ,0 , 0},{0 ,0 , 0}}};
+ static int any,state=31,ch,i;
+ do {for (i=0,ch=NONE;;i++)
+ {if (statetable[state][i][0]==NONE)
+ break;
+ if (statetable[state][i][0]==CANY)
+ {ch=any;
+ continue;
+ }
+ if (i==0)
+ ch=getc(file);
+ if (statetable[state][i][0]==ANY)
+ any=ch;
+ if (statetable[state][i][0]==ANY || statetable[state][i][0]==ch)
+ break;
+ }
+ /*if (ch==NONE)
+ printf(" ");
+ else
+ printf("%3x",ch&0xfff);*/
+ ch=statetable[state][i][1];
+ if (ch==ANY)
+ ch=any;
+ state=statetable[state][i][2];
+ /*printf("%3d%3x %c\n",state,ch&0xfff,isprint(ch)?ch:' ');*/
+ }
+ while (ch==NONE);
+ return ch;
+ }
+
+int accfile(FILE *file,struct nsrecord *a,int mboxflag)
+/* Returns -1 on reaching end of file, -2 on reaching end of message,
+ -3 if the file contains no messages. */
+{unsigned int chcount;
+ int ch,lastch[4],hflag;
+ lastch[0]=lastch[1]=lastch[2]=lastch[3]=-1;
+ chcount=0;
+ hflag=noheaderflag;
+ do {ch=mboxflag?defromulate(file):getc(file);
+ if (ch>=0)
+ if (hflag)
+ if ((lastch[0]=='\n' && lastch[1]=='\n')
+ ||(lastch[0]=='\r' && lastch[1]=='\r')
+ ||(lastch[0]=='\n' && lastch[1]=='\r' && lastch[2]=='\n' && lastch[3]=='\r'))
+ {hflag=0;
+ lastch[0]=lastch[1]=lastch[2]=lastch[3]=-1;
+ }
+ else
+ ;
+ else
+ ;
+ if (!hflag && ch>=0)
+ {chcount++;
+ if (catflag)
+ putchar(ch);
+ if (lastch[1]>=0)
+ a->acc[tran3(ch,lastch[0],lastch[1],0)]++;
+ if (lastch[2]>=0)
+ {a->acc[tran3(ch,lastch[0],lastch[2],1)]++;
+ a->acc[tran3(ch,lastch[1],lastch[2],2)]++;
+ }
+ if (lastch[3]>=0)
+ {a->acc[tran3(ch,lastch[0],lastch[3],3)]++;
+ a->acc[tran3(ch,lastch[1],lastch[3],4)]++;
+ a->acc[tran3(ch,lastch[2],lastch[3],5)]++;
+ a->acc[tran3(lastch[3],lastch[0],ch,6)]++;
+ a->acc[tran3(lastch[3],lastch[2],ch,7)]++;
+ }
+ }
+ lastch[3]=lastch[2];
+ lastch[2]=lastch[1];
+ lastch[1]=lastch[0];
+ lastch[0]=ch;
+ }
+ while (ch>=0);
+ switch (chcount)
+ {case 0: ;
+ case 1: ;
+ case 2: break;
+ case 3: a->total++;
+ break;
+ case 4: a->total+=4;
+ break;
+ default:a->total+=(8*chcount)-28;
+ }
+ a->threshold=(a->total)/256; /* round down because criterion is >threshold */
+ return ch;
+ }
+
+void makecode(struct nsrecord *a)
+{int i;
+ memset(a->code,0,32);
+ for (i=0;i<256;i++)
+ {a->code[i>>3]+=((a->acc[i]>a->threshold)<<(i&7));
+ }
+ }
+
+int nilsimsa(struct nsrecord *a,struct nsrecord *b)
+{int i,bits;
+ for (i=bits=0;i<32;i++)
+ bits+=popcount[255&(a->code[i]^b->code[i])];
+ return 128-bits;
+ }
+
+void codetostr(struct nsrecord *a,char *str)
+{int i;
+ for (i=0;i<32;i++)
+ sprintf(str+i+i,"%02x",0xff&(a->code[31-i]));
+ }
+
+int strtocode(const char *str,struct nsrecord *a)
+/* Returns 0 if error, or 1 if valid code. */
+{unsigned int i,len,valid,byte;
+ len=strlen(str);
+ valid=(len>63) && (isxdigit(*str));
+ a->total=0;
+ if (len&1)
+ str++;
+ for (;*str;str+=2)
+ {memmove(a->code+1,a->code,31);
+ if (!isxdigit(str[0]) || !isxdigit(str[1]))
+ valid=0;
+ sscanf(str,"%2x",&byte);
+ a->code[0]=byte;
+ memmove(a->acc+8,a->acc,248*sizeof(int));
+ for (i=0;i<8;i++)
+ a->acc[i]=1&(byte>>i);
+ }
+ if (!valid)
+ clear(a);
+ for (i=0;i<256;i++)
+ a->total+=a->acc[i];
+ a->threshold=0;
+ return valid;
+ }
+
+int codeorfile(struct nsrecord *a,char *str,int mboxflag)
+/* Attempts to read the file named str. If that fails,
+ attempts to convert str to a code. Returns 2 if directory, 1 if success,
+ 0 if failure, -1 if mboxflag is set and there are more messages,
+ -2 if mbox file is empty (failure again, but different error message). */
+{static FILE *file;
+ struct stat statbuf;
+ static int msgnum=0;
+ int ret;
+ if (!strcmp(str,"-"))
+ {ret=accfile(stdin,a,mboxflag);
+ file=stdin;
+ a->name="";
+ if (mboxflag)
+ {a->name=malloc(24);
+ sprintf(a->name,"#%u",msgnum);
+ a->name=realloc(a->name,strlen(a->name)+1);
+ }
+ a->flag=FILECODE;
+ msgnum++;
+ if (ret!=-2)
+ msgnum=0;
+ }
+ else
+ {ret=stat(str,&statbuf);
+ if (ret==0 && (statbuf.st_mode&S_IFMT)==S_IFDIR)
+ return 2;
+ if (msgnum==0 || !mboxflag)
+ file=fopen(str,"rb");
+ a->name=str;
+ if (file)
+ {ret=accfile(file,a,mboxflag);
+ a->flag=FILECODE;
+ if (mboxflag)
+ {a->name=malloc(strlen(str)+24);
+ sprintf(a->name,"%s#%u",str,msgnum);
+ a->name=realloc(a->name,strlen(a->name)+1);
+ }
+ else
+ a->name=strdup(str);
+ msgnum++;
+ if (ret!=-2)
+ {fclose(file);
+ msgnum=0;
+ }
+ }
+ else
+ {ret=strtocode(str,a);
+ if (ret)
+ a->flag=LITERALCODE;
+ return ret;
+ }
+ }
+ makecode(a);
+ if (ret==-3)
+ a->flag=INVALID;
+ ret+=!++ret;
+ return ret;
+ }
+
+int fileordirectory(char *str,int sofar)
+/* Process one file or directory. Recursive if directory and -r. */
+{int ret;
+ DIR * dir;
+ struct dirent *ent;
+ char *file;
+ file=NULL;
+ ret=-1;
+ while (ret==-1)
+ {selkarbi=realloc(selkarbi,sizeof(terkarbi)*(sofar+1));
+ clear(selkarbi+sofar);
+ ret=codeorfile(selkarbi+sofar,str,mboxflag);
+ if (debugflag)
+ printf("%2d %s\n",ret,str);
+ switch (ret)
+ {case 0: /* no such file, and not a valid code */
+ fprintf(stderr,"nilsimsa: %s: invalid code or file name\n",str);
+ break;
+ case 1: /* valid code or regular file */
+ sofar++;
+ break;
+ case -2: /* empty mbox file */
+ fprintf(stderr,"nilsimsa: %s: empty mbox file\n",str);
+ break;
+ case -1: /* message in mbox file */
+ ++sofar;
+ break;
+ case 2: /* directory */
+ if (recursiveflag)
+ {dir=opendir(str);
+ if (dir)
+ for (ent=readdir(dir);ent;ent=readdir(dir))
+ {
+ file=realloc(file,strlen(str)+strlen(ent->d_name)+2);
+ strcpy(file,str);
+ strcat(file,"/");
+ strcat(file,ent->d_name);
+ if (strcmp(ent->d_name,".") && strcmp(ent->d_name,".."))
+ sofar=fileordirectory(file,sofar);
+ }
+ }
+ else
+ fprintf(stderr,"nilsimsa: %s: is a directory\n",str);
+ break;
+ }
+ }
+ free(file);
+ return sofar;
+ }
+
+void aggregate(int n)
+/* Aggregate the first n codes in selkarbi into gunma. */
+{int i,j;
+ clear(&gunma);
+ for (i=0;i<n;i++)
+ {gunma.total+=selkarbi[i].total;
+ for (j=0;j<256;j++)
+ gunma.acc[j]+=selkarbi[i].acc[j];
+ }
+ gunma.threshold=gunma.total/256;
+ makecode(&gunma);
+ }
+
+void dump1code(struct nsrecord *code)
+{char str[65];
+ codetostr(code,str);
+ printf("%s %4d %c %d \n",str,code->nilsmi,"ILFAD"[code->flag],code->nilsmi);
+ }
+
+void dumpcodes(struct nsrecord *code,int n)
+{int i;
+ for (i=0;i<n;i++)
+ dump1code(code+i);
+ }
+
+void version()
+{printf("nilsimsa %s \nŠ 2001-2002 cmeclax\nDistributed under GNU GPL\n","0.2.4");
+ }
+
+void help()
+{puts("\
+nilsimsa [options] [files]\n\
+--help\n\
+ Prints this message.\n\
+--version\n\
+ Prints the version number.\n\
+file1 file2 ...\n\
+ Compute the nilsimsa codes of the files.\n\
+-r --recursive\n\
+ Process all files inside listed directories.\n\
+-c code file1 file2 --compare-to\n\
+ Compare files to code.\n\
+-c code -t 48 file --threshold\n\
+ Return true if file is more than 48 bits similar to code.\n\
+-s --skip-headers\n\
+ file is a mail message; ignore the headers.\n\
+--mbox\n\
+ files are mailbox folders; take the code of each message.\n\
+-a file1 file2 ... --aggregate\n\
+ Compute the aggregate nilsimsa code of the files.\n\
+-C clusterlist file1 file2 ... --find-clusters\n\
+ Find a cluster of similar files.\n\
+-H clusterlist file1 file2 ... --check-clusters\n\
+ Compare files to all priority 0 clusters in clusterlist,\n\
+ then all priority 1 clusters. If the first match is an\n\
+ accept cluster, return 0. If it is a deny cluster, return\n\
+ 1. If there is no match, return 0.\n\
+-m n --min-cluster-size\n\
+ Set the minimum cluster size (default 3).\n\
+-v --verbose\n\
+ In -C, lists all the codes, the ones which are in the\n\
+ cluster first; in -H, lists which cluster matches.\n\
+-x --exhaustive\n\
+ In -C, compare all pairs of codes, not just n*log(n) pairs.\n\
+\n\
+Any time a file is called for, a code may be given, and vice versa,\n\
+except after -C and -H.");
+ }
+
+static struct poptOption options[]=
+{{"compare-to", 'c',POPT_ARG_STRING,&comparestr,0},
+ {"threshold", 't',POPT_ARG_INT, &comparethreshold,0},
+ {"min-cluster-size", 'm',POPT_ARG_INT, &minclustersize,0},
+ {"help", 'h',POPT_ARG_NONE, &helpflag,0},
+ {"version", 0, POPT_ARG_NONE, &versionflag,0},
+ {"verbose", 'v',POPT_ARG_NONE, &verboseflag,0},
+ {"cat", 0, POPT_ARG_NONE, &catflag,0},
+ {"debug", 0, POPT_ARG_NONE, &debugflag,0},
+ {"mbox", 0, POPT_ARG_NONE, &mboxflag,0},
+ {"aggregate", 'a',POPT_ARG_NONE, &aggregateflag,0},
+ {"skip-headers", 's',POPT_ARG_NONE, &noheaderflag,0},
+ {"find-clusters", 'C',POPT_ARG_STRING,&rulefile,0},
+ {"check-clusters", 'H',POPT_ARG_STRING,&checkrulefile,0},
+ {"exhaustive", 'x',POPT_ARG_NONE, &exhaustiveflag,0},
+ {"recursive", 'r',POPT_ARG_NONE, &recursiveflag,0},
+ {NULL, 0, POPT_ARG_NONE, NULL,0}};
+
+
+/* int omain(int argc, char *argv[]) */
+/* {int i,j,ret,matched; */
+/* char str[65]; */
+/* const char **poptargv; */
+/* int poptargc; */
+/* poptContext context; */
+/* filltran(); */
+/* /\*dumptran();*\/ */
+/* comparestr=exitcode=helpflag=versionflag=verboseflag= */
+/* noheaderflag=catflag=mboxflag=NULL; */
+/* comparethreshold=-999; */
+/* minclustersize=3; */
+/* context=poptGetContext("nilsimsa",argc,argv,options,0); */
+/* popterror=poptGetNextOpt(context); */
+/* poptargv=poptGetArgs(context); */
+/* for (poptargc=0;poptargv && poptargv[poptargc];poptargc++); */
+/* //if (popterror<-1) */
+/* // die(poptStrerror(popterror),3); */
+/* fillpopcount(); */
+/* if (versionflag) */
+/* version(); */
+/* if (helpflag) */
+/* help(); */
+/* if (comparestr) */
+/* codeorfile(&terkarbi,comparestr,0); */
+/* if (aggregateflag && rulefile) */
+/* {fprintf(stderr,"nilsimsa: aggregate and find-clusters are incompatible"); */
+/* aggregateflag=0; */
+/* rulefile=NULL; */
+/* } */
+/* nilselkarbi=poptargc; */
+/* if (!nilselkarbi) /\* if there are no arguments, there is one file, stdin *\/ */
+/* {nilselkarbi++; */
+/* poptargv=realloc(poptargv,sizeof(char *)); */
+/* poptargv[0]="-"; */
+/* } */
+/* selkarbi=malloc(sizeof(terkarbi)*nilselkarbi); */
+/* if (helpflag || versionflag) */
+/* nilselkarbi=0; */
+/* for (i=j=0;i<nilselkarbi;i++) { */
+/* /\* j is the number of messages hashed; i is the number of arguments processed *\/ */
+/* j=fileordirectory(poptargv[i],j); */
+
+/* } */
+/* nilselkarbi=j; */
+/* aggregate(nilselkarbi); */
+/* if (rulefile) */
+/* {readrules(rulefile); */
+/* nilselkarbi=removematches(nilselkarbi); */
+/* if (verboseflag) */
+/* printf("%d left\n",nilselkarbi); */
+/* i=findcluster(nilselkarbi); */
+/* if (i) */
+/* {gunma.flag=DENYRULE; */
+/* writerule(&gunma,rulefile); */
+/* writerule(&gunma,NULL); /\* writes to stdout *\/ */
+/* if (verboseflag) */
+/* {printf("%d %d %d\n",poptargc,nilselkarbi,i); */
+/* dumpcodes(selkarbi,poptargc); */
+/* } */
+/* } */
+/* } */
+/* if (checkrulefile) */
+/* {readrules(checkrulefile); */
+/* if (aggregateflag) */
+/* {exitcode=(matched=matchany(&gunma,&terkarbi))==DENYRULE; */
+/* codetostr(&gunma,str); */
+/* printf("%d %s",exitcode,str); */
+/* if (matched&&verboseflag) */
+/* {putchar(' '); */
+/* writerule(&terkarbi,NULL); */
+/* } */
+/* printf("\n"); */
+/* } */
+/* else */
+/* for (i=0;i<nilselkarbi;i++) */
+/* if (selkarbi[i].flag) */
+/* {exitcode=(matched=matchany(selkarbi+i,&terkarbi))==DENYRULE; */
+/* codetostr(selkarbi+i,str); */
+/* printf("%d %s",exitcode,selkarbi[i].name); */
+/* if (matched&&verboseflag) */
+/* {putchar(' '); */
+/* writerule(&terkarbi,NULL); */
+/* } */
+/* printf("\n"); */
+/* } */
+/* } */
+/* if (aggregateflag && !checkrulefile) */
+/* if (comparestr) */
+/* if (comparethreshold>=-256) */
+/* printf("%d\n",exitcode=nilsimsa(&gunma,&terkarbi)>comparethreshold); */
+/* else */
+/* printf("%4d\n",nilsimsa(&gunma,&terkarbi)); */
+/* else */
+/* {codetostr(&gunma,str); */
+/* printf("%s\n",str); */
+/* } */
+/* else */
+/* if (!rulefile && !checkrulefile) */
+/* for (i=0;i<nilselkarbi;i++) */
+/* {if (selkarbi[i].flag) */
+/* {if (comparestr) */
+/* if (comparethreshold>=-256) */
+/* printf("%d %s\n",exitcode=nilsimsa(selkarbi+i,&terkarbi)>comparethreshold,selkarbi[i].name); */
+/* else */
+/* printf("%4d %s\n",nilsimsa(selkarbi+i,&terkarbi),selkarbi[i].name); */
+/* else */
+/* {codetostr(selkarbi+i,str); */
+/* printf("%s %s\n",str,selkarbi[i].name); */
+/* } */
+/* } */
+/* } */
+/* poptFreeContext(context); */
+/* return exitcode; */
+/* } */
diff --git a/cmeclax/main.h b/cmeclax/main.h
new file mode 100644
index 0000000..c106c32
--- /dev/null
+++ b/cmeclax/main.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include "nilsimsa.h"
+
+void filltran(void);
+void makecode(struct nsrecord *a);
+void codetostr(struct nsrecord *a,char *str);
+int codeorfile(struct nsrecord *a,char *str,int mboxflag);
+int accfile(FILE *file,struct nsrecord *a,int mboxflag);
diff --git a/cmeclax/nilsimsa.h b/cmeclax/nilsimsa.h
new file mode 100644
index 0000000..b9bba67
--- /dev/null
+++ b/cmeclax/nilsimsa.h
@@ -0,0 +1,52 @@
+/***************************************************************************
+ nilsimsa.h - global defines
+ -------------------
+ begin : Mon May 14 2001
+ copyright : (C) 2001 by cmeclax
+ email : cmeclax@ixazon.dynip.com
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#ifndef NILSIMSA_H
+#define NILSIMSA_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <popt.h>
+#define tran3(a,b,c,n) (((tran[((a)+(n))&255]^tran[(b)]*((n)+(n)+1))+tran[(c)^tran[n]])&255)
+#define INVALID 0
+#define LITERALCODE 1
+#define FILECODE 2
+#define ALLOWRULE 3
+#define DENYRULE 4
+
+
+typedef char nscode[32];
+struct nsrecord
+{int acc[256] /* counts each trigram's hash */;
+ int total /* total number of trigrams counted */;
+ int threshold /* mean of all numbers in acc */;
+ int nilsmi /* nilsimsa be ti bei le terkarbi; used as sort key */;
+ int flag /* is this a literal code, a file, an allow rule, or a deny rule? */;
+ int priority /* try all priority 1 rules after all priority 0 rules */;
+ long filepos /* position of this line in the rule file */;
+ char code[32] /* the nilsimsa code as a bit vector */;
+ char *name /* the filename or code from the command line */;
+ };
+
+int nilsimsa(struct nsrecord *a,struct nsrecord *b);
+void aggregate(int n);
+void codetostr(struct nsrecord *a,char *str);
+int strtocode(const char *str,struct nsrecord *a);
+
+#endif
diff --git a/cmeclax/rules.c b/cmeclax/rules.c
new file mode 100644
index 0000000..60d2fe1
--- /dev/null
+++ b/cmeclax/rules.c
@@ -0,0 +1,124 @@
+/***************************************************************************
+ rules.c - reads and writes rule file
+ -------------------
+ begin : Wed Jul 4 2001
+ copyright : (C) 2001 by cmeclax
+ email : cmeclax@ixazon.dynip.com
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#include "nilsimsa.h"
+#include "rules.h"
+
+extern struct nsrecord *selkarbi,terkarbi,*rules,gunma;
+extern int nilselkarbi,nrules;
+
+void readrules(char *rulefilename)
+/* Format for a rule:
+ 0123456789abcdef0369cf258be147ad05af49e38d27c16b07e5c3a18f6d4b29 101 A 0 comment
+ 0123... is the nilsimsa code at the center of the cluster.
+ 101 is the nilsimsa threshold of the cluster (radius is 27).
+ The extra space is for a minus sign.
+ A is A for allow or D for deny.
+ 0 is the priority. There are two priorities. All priority 0 rules are checked, then
+ all priority 1 rules. Rules added by nilsimsa are always priority 0.
+ Priority 1 rules are used for default actions. */
+{FILE *rulefile;
+ char line[256],*fgotten,*thresh,*allow;
+ long filepos;
+ rulefile=fopen(rulefilename,"r");
+ for (nrules=0,fgotten=line;rulefile && fgotten;nrules++)
+ {filepos=ftell(rulefile);
+ fgotten=fgets(line,256,rulefile);
+ if (fgotten)
+ {rules=realloc(rules,(nrules+1)*sizeof(struct nsrecord));
+ rules[nrules].filepos=filepos;
+ rules[nrules].flag=INVALID;
+ thresh=strchr(line,' ');
+ if (thresh)
+ {*thresh++=0;
+ while (isspace(*thresh))
+ thresh++;
+ allow=strchr(thresh,' ');
+ if (allow)
+ allow++;
+ }
+ }
+ if (fgotten && thresh && allow)
+ {if (*allow=='D')
+ rules[nrules].flag=DENYRULE;
+ if (*allow=='A')
+ rules[nrules].flag=ALLOWRULE;
+ rules[nrules].flag*=strtocode(line,rules+nrules);
+ rules[nrules].nilsmi=atoi(thresh);
+ rules[nrules].priority=atoi(allow+2);
+ }
+ }
+ if (rulefile)
+ {fclose(rulefile);
+ nrules--;
+ }
+ }
+
+void writerule(struct nsrecord *rule,char *rulefilename)
+/* Write the rule to the file. If rule->filepos<0, it is written
+ at the end of the file, and followed with a line feed; otherwise
+ it is written at filepos and not followed with a linefeed. */
+{FILE *rulefile;
+ char str[65];
+ rulefile=NULL;
+ if (rulefilename)
+ rulefile=fopen(rulefilename,"a+");
+ if (rulefile)
+ if (rule->filepos<0)
+ fseek(rulefile,0,SEEK_END);
+ else
+ fseek(rulefile,rule->filepos,SEEK_SET);
+ codetostr(rule,str);
+ fprintf(rulefile?rulefile:stdout,"%s %4d %c %d ",str,rule->nilsmi,"ILFAD"[rule->flag],rule->priority);
+ if (rule->filepos<0)
+ fprintf(rulefile?rulefile:stdout,"\n");
+ if (rulefile)
+ fclose(rulefile);
+ }
+
+/* matches() is a macro in rules.h */
+
+int matchany(struct nsrecord *a,struct nsrecord *rule)
+/* Checks a against all rules, first the priority 0 rules,
+ then the priority 1 rules. Returns ALLOWRULE or DENYRULE
+ according to the type of the first matched rule.
+ Copies the matched rule into rule, unless it's NULL. */
+{int i,j,result;
+ result=0;
+ for (i=0;i<2;i++)
+ for (j=0;j<nrules && result==0;j++)
+ if ((rules[j].priority==i) && (matches(a,rules+j)))
+ {result=rules[j].flag;
+ if (rule)
+ *rule=rules[j];
+ }
+ return result;
+ }
+
+int removematches(int n)
+/* Puts all codes in selkarbi that match rules or are invalid at the end,
+ and returns the number of codes that don't. */
+{int i;
+ struct nsrecord temp;
+ for (i=0;i<n;i++)
+ if (selkarbi[i].flag == INVALID || matchany(selkarbi+i,NULL))
+ {temp=selkarbi[i];
+ selkarbi[i--]=selkarbi[--n];
+ selkarbi[n]=temp;
+ }
+ return n;
+ }
diff --git a/cmeclax/rules.h b/cmeclax/rules.h
new file mode 100644
index 0000000..c33c6df
--- /dev/null
+++ b/cmeclax/rules.h
@@ -0,0 +1,22 @@
+/***************************************************************************
+ rules.h - reads and writes rule file
+ -------------------
+ begin : Wed Jul 4 2001
+ copyright : (C) 2001 by cmeclax
+ email : cmeclax@ixazon.dynip.com
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+void writerule(struct nsrecord *rule,char *rulefilename);
+void readrules(char *rulefilename);
+#define matches(a,rule) (nilsimsa((a),(rule))>(rule)->nilsmi)
+int removematches(int n);
+int matchany(struct nsrecord *a,struct nsrecord *rule);
diff --git a/nilsimsa.go b/nilsimsa.go
new file mode 100644
index 0000000..e439b53
--- /dev/null
+++ b/nilsimsa.go
@@ -0,0 +1,351 @@
+// Copyright 2013, Michiel Buddingh, All rights reserved. Use of this
+// code is governed by version 2.0 or later of the Apache License,
+// available at http://www.apache.org/licenses/LICENSE-2.0
+
+// Package nilsimsa implements the nilsimsa fuzzy hash by cmeclax.
+//
+// About Nilsimsa
+//
+// In summary, nilsimsa is a trigram frequency table, with a bit depth
+// of 1 bit. Table positions are zero if the frequency of a specific
+// hash value is lower than average, and 1 if it is higher than
+// average.
+//
+// Nilsimsa codes of two texts can be compared; similar texts will
+// have very similar frequency distributions.
+package nilsimsa
+
+import (
+ "errors"
+ "fmt"
+ "runtime"
+ "strconv"
+)
+
+const (
+ segmentMinimum = 5
+ segmentSize = 4096 // seems to be optimal om amd64 Core i3
+ segmentLimit = 536870912
+)
+
+func init() {
+ filltran(&tran)
+ fillpopcount(&popcount)
+}
+
+var tran [256]byte
+var popcount [256]byte
+
+// filltran initializes the 256-byte permutation
+// table
+func filltran(buf *[256]byte) {
+ j := 0
+ for i := 0; i < 256; i++ {
+ j = (j*53 + 1) & 255
+ j = j + j
+ if j > 255 {
+ j -= 255
+ }
+ switch i {
+ case 98:
+ j += 4
+ case 236:
+ j += 37
+ case 248:
+ j += 12
+ case 255:
+ j += 23
+ default:
+ j += 0
+ }
+ j &= 255
+ buf[i] = byte(j)
+ }
+ return
+}
+
+// popcount initializes a population count lookup table.
+func fillpopcount(buf *[256]byte) {
+ for i := 0; i < len(buf); i++ {
+ count := 0
+ dup := i
+ for dup != 0 {
+ count++
+ dup &= (dup - 1)
+ }
+ buf[i] = byte(count)
+ }
+}
+
+// tran3 produces a pseudo-random byte value based on four byte
+// parameters
+func tran3(a, b, c, n byte) byte {
+ return (((tran[((a)+(n))&255] ^ tran[(b)]*((n)+(n)+1)) + tran[(c)^tran[n]]) & 255)
+}
+
+// Code is a 256-bit nilsimsa code. It is a type alias for [32]byte.
+type Code [32]byte
+
+var (
+ // ErrScanIncorrectLength is returned by Code.Scan when an
+ // attempt is made to scan a go code that contains less or
+ // more than 64 hex characters.
+ ErrScanIncorrectLength = errors.New("nilsimsa Code.Scan: incorrect length of hex string")
+)
+
+// Scan implements the fmt.Scanner interface
+func (c *Code) Scan(state fmt.ScanState, verb rune) error {
+ var accept func(rune) bool
+
+ switch verb {
+ case 'x', 'X':
+ accept = func(r rune) bool {
+ return ('0' <= r && r <= '9') ||
+ ('a' <= r && r <= 'f') ||
+ ('A' <= r && r <= 'F')
+ }
+ default:
+ return errors.New("nilsimsa Code.Scan: only hexadecimal codes are supported")
+ }
+
+ byteRange, err := state.Token(false, accept)
+ if err != nil {
+ return err
+ }
+
+ if len(byteRange) != 64 {
+ return ErrScanIncorrectLength
+ }
+
+ for i := 0; i < 32; i++ {
+ bits, err := strconv.ParseUint(string(byteRange[i*2:(i+1)*2]), 16, 8)
+ if err != nil {
+ return err
+ }
+
+ c[(31 - i)] = uint8(bits)
+ }
+
+ return nil
+}
+
+// String returns a 64-character hexadecimal representation of the
+// nilsimsa code.
+func (code Code) String() string {
+ var result string
+
+ for i := 0; i < len(code); i++ {
+ result += fmt.Sprintf("%02x", code[len(code)-(1+i)])
+ }
+ return result
+}
+
+// Distance is the nilsimsa distance, ranging from -127 (as different
+// as two codes can be) to 128 (identical codes)
+func (a Code) Distance(b Code) int {
+ return 128 - hammingDistance([32]byte(a), [32]byte(b))
+}
+
+// hammingDistance is the Metric Hamming distance between two 32-byte slices.
+func hammingDistance(a, b [32]byte) (count int) {
+ for i := 0; i < 32; i++ {
+ difference := a[i] ^ b[i]
+
+ count += int(popcount[difference])
+ }
+ return
+}
+
+// Writer computes frequency tables for the data written to it, and returns
+// a Code for this data.
+type Writer struct {
+ count uint64
+ buckets [256]int64
+ tail []byte
+}
+
+// Reset returns a nilsimsa.Writer to its zero state.
+func (n *Writer) Reset() {
+ n.count = 0
+ for i := range n.buckets {
+ n.buckets[i] = 0
+ }
+ n.tail = n.tail[0:0]
+}
+
+// Size is the length of a nilsimsa code, in bytes. The length is 256
+// bits, or 32 bytes
+func (n Writer) Size() int {
+ return 256 / 8
+}
+
+// BlockSize returns the optimal block size to use for writing to a writer.
+func (n Writer) BlockSize() int {
+ return runtime.NumCPU() * segmentSize
+}
+
+// Sum takes a byte slice, adds it to the current nilsimsa state, and
+// returns a 32-byte nilsimsa hash of the total data.
+func (sim Writer) Sum(b []byte) []byte {
+ c := sim
+ c.Write(b)
+ array := [32]byte(c.Code())
+ return array[:]
+}
+
+// Code returns the nilsimsa code for the data written to the Writer
+// thusfar.
+func (sim Writer) Code() Code {
+ var threshold int64
+ if sim.count < 29 {
+ threshold = 0
+ } else {
+ threshold = int64(((sim.count * 8) - 28) / 256)
+ }
+
+ var result [32]byte
+ for i := 0; i < 256; i++ {
+ if sim.buckets[i] > threshold {
+
+ result[i>>3] += 1 << uint(i&7)
+ }
+ }
+ return Code(result)
+}
+
+type buckets struct {
+ count uint32
+ buckets [256]uint32
+}
+
+func (n *Writer) aggregate(pipe <-chan *buckets, done chan<- int) {
+ for buckets := range pipe {
+ n.count += uint64(buckets.count)
+ for i := 0; i < 256; i++ {
+ n.buckets[i] += int64(buckets.buckets[i])
+ }
+ }
+ done <- 1
+}
+
+type slice struct {
+ tail []byte
+ rest []byte
+}
+
+func eater(slices <-chan slice, pipe chan<- *buckets, done chan<- int) {
+ for slice := range slices {
+ pipe <- block(slice.tail, slice.rest)
+ }
+ done <- 1
+}
+
+// Write takes a slice of bytes and passes them through the nilsimsa
+// frequency table algorithm.
+func (sim *Writer) Write(buf []byte) (n int, err error) {
+ pipe := make(chan *buckets)
+ done := make(chan int)
+ length := len(buf)
+
+ slices := make(chan slice, runtime.NumCPU())
+
+ go sim.aggregate(pipe, done)
+
+ for c := 0; c < runtime.NumCPU(); c++ {
+ go eater(slices, pipe, done)
+ }
+
+ index := 0
+
+ if len(sim.tail) > 0 {
+ slices <- slice{
+ sim.tail,
+ buf[index:min(index+segmentSize, length)]}
+ index += segmentSize
+ }
+
+ for {
+ if index > length {
+ break
+ }
+
+ slices <- slice{
+ buf[max(0, index-segmentMinimum):index],
+ buf[index:min(index+segmentSize, length)]}
+
+ index += segmentSize
+ }
+
+ close(slices)
+
+ tail := append(sim.tail, buf[max(0, length-segmentMinimum):]...)
+ sim.tail = tail[max(0, len(tail)-segmentMinimum):]
+
+ for c := 0; c < runtime.NumCPU(); c++ {
+ <-done
+ }
+
+ close(pipe)
+
+ <-done
+
+ return length, nil
+}
+
+func block(tail, rest []byte) *buckets {
+ b := new(buckets)
+ var lookback [5]byte
+
+ taillength := len(tail)
+
+ for i, bb := range tail {
+ lookback[taillength-(i+1)] = bb
+ }
+
+ for i := 0; i < len(rest); i++ {
+
+ lookback[4] = lookback[3]
+ lookback[3] = lookback[2]
+ lookback[2] = lookback[1]
+ lookback[1] = lookback[0]
+
+ lookback[0] = rest[i]
+
+ if i+taillength > 1 {
+ b.buckets[tran3(lookback[0], lookback[1], lookback[2], 0)]++
+ }
+ if i+taillength > 2 {
+ b.buckets[tran3(lookback[0], lookback[1], lookback[3], 1)]++
+ b.buckets[tran3(lookback[0], lookback[2], lookback[3], 2)]++
+ }
+ if i+taillength > 3 {
+ b.buckets[tran3(lookback[0], lookback[1], lookback[4], 3)]++
+ b.buckets[tran3(lookback[0], lookback[2], lookback[4], 4)]++
+ b.buckets[tran3(lookback[0], lookback[3], lookback[4], 5)]++
+ b.buckets[tran3(lookback[4], lookback[1], lookback[0], 6)]++
+ b.buckets[tran3(lookback[4], lookback[3], lookback[0], 7)]++
+ }
+ }
+ b.count = uint32(len(rest))
+ return b
+}
+
+func min(values ...int) (min int) {
+ min = 2147483647
+ for _, v := range values {
+ if v < min {
+ min = v
+ }
+ }
+ return
+}
+
+func max(values ...int) (max int) {
+ max = -2147483648
+ for _, v := range values {
+ if v > max {
+ max = v
+ }
+ }
+ return
+}
diff --git a/nilsimsa_test.go b/nilsimsa_test.go
new file mode 100644
index 0000000..019e6fe
--- /dev/null
+++ b/nilsimsa_test.go
@@ -0,0 +1,335 @@
+package nilsimsa
+
+import (
+ "bytes"
+ "fmt"
+ "io"
+ "testing"
+)
+
+var testCases = []struct {
+ seed, length int64
+ expectedCode string
+ expected [256]int64
+}{
+ {3029, 3,
+ "0000010000000000000000000000000000000000000000000000000000000000",
+ [256]int64{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ }},
+ {91710, 1024 * 20,
+ "594f74509a5d3d27c5f90412d5ee1f987c140273bdcf1d5a4681b095aada9982",
+ [256]int64{
+ 630, 662, 634, 622, 624, 626, 615, 651, 671, 635, 600, 676, 674, 608, 596, 660,
+ 627, 665, 621, 664, 682, 631, 644, 650, 638, 687, 633, 654, 633, 641, 614, 649,
+ 641, 626, 653, 617, 644, 633, 611, 646, 630, 609, 620, 637, 641, 666, 636, 646,
+ 677, 633, 632, 627, 634, 626, 601, 649, 634, 644, 647, 616, 618, 628, 655, 612,
+ 638, 647, 636, 642, 668, 635, 661, 613, 651, 637, 652, 642, 677, 606, 632, 578,
+ 700, 641, 657, 648, 632, 621, 663, 647, 647, 620, 723, 672, 646, 673, 629, 653,
+ 658, 651, 619, 615, 649, 645, 669, 612, 637, 641, 627, 582, 632, 618, 634, 607,
+ 624, 634, 665, 632, 660, 613, 616, 632, 626, 622, 658, 641, 650, 652, 661, 613,
+ 584, 635, 612, 669, 652, 599, 628, 663, 649, 647, 663, 674, 642, 607, 624, 627,
+ 621, 649, 661, 646, 618, 657, 675, 679, 647, 624, 673, 613, 679, 624, 642, 666,
+ 582, 688, 606, 629, 691, 609, 638, 617, 626, 621, 676, 611, 623, 628, 629, 611,
+ 641, 625, 633, 648, 660, 665, 674, 678, 645, 632, 658, 639, 625, 615, 652, 651,
+ 648, 648, 692, 623, 634, 658, 616, 622, 660, 618, 683, 652, 656, 695, 615, 620,
+ 640, 634, 648, 649, 650, 616, 655, 635, 632, 655, 628, 667, 690, 631, 635, 646,
+ 637, 631, 630, 609, 672, 639, 659, 617, 594, 619, 655, 627, 656, 640, 640, 626,
+ 648, 667, 643, 681, 621, 616, 681, 614, 678, 618, 637, 646, 654, 621, 657, 565,
+ }},
+ // // Longer test vectors commented out to save time.
+ // {91710, 1024 * 1024 * 20, "1dd8c51ad70c5e075541638493254f2d7f9b772b399171ccc7ef8571a67af395", [256]int64{
+ // 655698, 654998, 657191, 655291, 655622, 654656, 655048, 655734, 656161, 655930, 653602, 654693, 656215, 655394, 656332, 655885,
+ // 653433, 655485, 654454, 655737, 655705, 656801, 655690, 654601, 655336, 655792, 656326, 654850, 655316, 655364, 655028, 656820,
+ // 655683, 655168, 655023, 655344, 656308, 657138, 655970, 654551, 655785, 654650, 655719, 654933, 654805, 655335, 655188, 655958,
+ // 655445, 655428, 656313, 655670, 654805, 655978, 656466, 656821, 656450, 656107, 655372, 653697, 654446, 654871, 656021, 657469,
+ // 655176, 654408, 655467, 656262, 654531, 655262, 657245, 655469, 655972, 653811, 655102, 653840, 655826, 655535, 656348, 655032,
+ // 655989, 654646, 653992, 653405, 655754, 655262, 654575, 656561, 655798, 654656, 654225, 655465, 655457, 655684, 654621, 653765,
+ // 655493, 655421, 654188, 656067, 653043, 656098, 654152, 654777, 655416, 655786, 655890, 654493, 655585, 656101, 656027, 654469,
+ // 655749, 655723, 655182, 655461, 656331, 655133, 654686, 655996, 655399, 655705, 655789, 655489, 656082, 656754, 655526, 655281,
+ // 655598, 653899, 656520, 656196, 654355, 656561, 654599, 654671, 656255, 655713, 655561, 657179, 655250, 654542, 655382, 654337,
+ // 656822, 654976, 656325, 654887, 655296, 655603, 654890, 655139, 656048, 655829, 654852, 654983, 655746, 655037, 654169, 655662,
+ // 653847, 655084, 655539, 655133, 654359, 653605, 655250, 656112, 657085, 655817, 654998, 654846, 654134, 655898, 655972, 655243,
+ // 656216, 654116, 655281, 655151, 653506, 655151, 655525, 655244, 656592, 655196, 655668, 655017, 656132, 654046, 655431, 653728,
+ // 655698, 656824, 657290, 653780, 655052, 655118, 653981, 655026, 654818, 656101, 655859, 655416, 656219, 655144, 656409, 654429,
+ // 654482, 655014, 655371, 656330, 654724, 655052, 655302, 655046, 655617, 657147, 656950, 654113, 655727, 654606, 655506, 656476,
+ // 655061, 655734, 654898, 655866, 656344, 653833, 654307, 654268, 655847, 655020, 655817, 654404, 654210, 655122, 655593, 655405,
+ // 655266, 653339, 654951, 655661, 655471, 654473, 656068, 655936, 656394, 654766, 656081, 656329, 656700, 654703, 654115, 655181,
+ // }},
+ // {91709, 1024 * 1024 * 20, "53aaa649adb93541e0254b5d51ce5a5cf56e399f19e21bae9597ed915f773733", [256]int64{
+ // 655506, 655845, 655028, 654699, 656315, 655726, 653680, 653557, 655983, 655718, 655961, 654870, 655630, 656054, 655215, 654369,
+ // 656410, 656360, 658005, 653751, 655637, 655751, 656227, 653754, 655745, 655732, 655595, 657013, 655433, 654237, 656619, 654700,
+ // 655926, 655021, 654527, 654901, 655438, 654163, 654891, 655993, 655390, 654976, 655696, 655823, 654746, 656223, 656546, 655726,
+ // 656865, 655780, 656480, 654472, 656364, 655061, 654502, 655994, 655530, 653553, 655970, 654765, 655756, 654679, 654355, 656082,
+ // 655173, 656436, 656025, 655444, 654760, 656717, 654806, 655528, 655625, 656941, 655251, 655961, 656830, 655145, 654623, 654987,
+ // 655115, 656127, 655348, 655117, 654798, 655425, 655501, 656118, 656306, 654361, 654814, 655711, 655620, 655184, 655312, 655056,
+ // 656611, 656295, 655541, 655454, 655872, 655208, 655071, 656722, 655843, 654605, 654975, 655916, 657032, 655859, 655159, 654329,
+ // 654626, 655922, 655793, 656076, 655223, 655827, 656484, 654913, 655852, 655152, 656461, 654553, 656733, 656475, 656909, 655810,
+ // 654516, 654291, 655531, 655463, 655397, 654147, 655580, 653909, 654086, 656195, 655080, 655713, 656495, 655001, 655897, 655202,
+ // 654853, 655930, 655490, 656943, 655203, 654702, 656320, 655464, 656113, 654497, 653094, 655244, 655893, 655117, 655549, 654757,
+ // 656585, 654635, 656721, 655667, 656571, 654186, 656429, 654302, 656693, 656367, 654985, 655799, 654351, 653577, 655383, 654408,
+ // 655753, 654813, 655483, 654615, 654104, 655535, 653275, 653812, 655120, 654359, 655328, 654196, 654098, 655545, 655922, 655426,
+ // 656157, 655163, 653719, 654978, 655185, 653726, 656296, 653536, 655364, 653508, 655491, 654702, 656015, 656216, 654835, 655189,
+ // 655822, 654431, 654652, 656749, 655398, 656156, 653938, 656229, 656185, 654040, 655394, 655917, 654440, 656725, 654836, 655558,
+ // 655880, 654275, 654928, 656066, 655352, 654940, 655365, 654827, 654810, 656176, 656175, 654866, 654631, 655676, 655199, 655387,
+ // 654484, 655812, 655081, 656002, 653278, 655418, 655064, 655371, 655397, 656883, 654769, 654854, 655402, 654974, 655862, 653878,
+ // }},
+ {100, 1, "0000000000000000000000000000000000000000000000000000000000000000",
+ [256]int64{}},
+ {321, 200, "e169fdbf16c0c57d500ca022e54d9060e56cfd2818ca201dbae46735040200c9",
+ [256]int64{
+ 8, 4, 5, 7, 4, 4, 11, 8, 5, 5, 3, 5, 6, 6, 6, 3,
+ 3, 8, 5, 5, 6, 5, 4, 2, 4, 2, 7, 5, 3, 6, 6, 4,
+ 7, 4, 7, 3, 8, 7, 4, 5, 10, 8, 7, 6, 5, 8, 10, 4,
+ 4, 3, 9, 4, 5, 9, 9, 12, 3, 9, 3, 9, 7, 8, 4, 9,
+ 10, 5, 9, 9, 11, 6, 1, 6, 3, 5, 6, 6, 4, 7, 4, 5,
+ 5, 9, 2, 8, 4, 5, 9, 8, 5, 6, 6, 9, 11, 5, 3, 6,
+ 4, 4, 4, 8, 4, 13, 6, 6, 10, 3, 9, 8, 9, 7, 7, 8,
+ 6, 3, 7, 9, 5, 7, 7, 6, 10, 5, 7, 5, 6, 7, 8, 10,
+ 5, 5, 5, 4, 2, 10, 7, 4, 2, 5, 4, 5, 10, 4, 4, 7,
+ 8, 5, 10, 8, 2, 6, 10, 3, 7, 5, 9, 6, 1, 7, 8, 11,
+ 6, 7, 4, 3, 5, 8, 1, 5, 5, 3, 1, 4, 4, 7, 4, 9,
+ 4, 4, 9, 8, 2, 5, 2, 4, 5, 5, 6, 3, 14, 3, 7, 5,
+ 11, 4, 7, 7, 9, 9, 7, 2, 10, 4, 7, 3, 4, 5, 7, 8,
+ 5, 3, 6, 4, 6, 6, 8, 10, 3, 10, 7, 5, 9, 3, 6, 3,
+ 12, 8, 10, 7, 9, 8, 6, 11, 8, 5, 11, 7, 7, 9, 8, 7,
+ 7, 6, 5, 7, 4, 9, 7, 4, 9, 6, 3, 4, 5, 7, 9, 7,
+ }},
+}
+
+func TestFilltran(t *testing.T) {
+ var buf [256]byte
+ filltran(&buf)
+ reference := []byte{
+ 0x02, 0xd6, 0x9e, 0x6f, 0xf9, 0x1d, 0x04, 0xab,
+ 0xd0, 0x22, 0x16, 0x1f, 0xd8, 0x73, 0xa1, 0xac,
+ 0x3b, 0x70, 0x62, 0x96, 0x1e, 0x6e, 0x8f, 0x39,
+ 0x9d, 0x05, 0x14, 0x4a, 0xa6, 0xbe, 0xae, 0x0e,
+ 0xcf, 0xb9, 0x9c, 0x9a, 0xc7, 0x68, 0x13, 0xe1,
+ 0x2d, 0xa4, 0xeb, 0x51, 0x8d, 0x64, 0x6b, 0x50,
+ 0x23, 0x80, 0x03, 0x41, 0xec, 0xbb, 0x71, 0xcc,
+ 0x7a, 0x86, 0x7f, 0x98, 0xf2, 0x36, 0x5e, 0xee,
+ 0x8e, 0xce, 0x4f, 0xb8, 0x32, 0xb6, 0x5f, 0x59,
+ 0xdc, 0x1b, 0x31, 0x4c, 0x7b, 0xf0, 0x63, 0x01,
+ 0x6c, 0xba, 0x07, 0xe8, 0x12, 0x77, 0x49, 0x3c,
+ 0xda, 0x46, 0xfe, 0x2f, 0x79, 0x1c, 0x9b, 0x30,
+ 0xe3, 0x00, 0x06, 0x7e, 0x2e, 0x0f, 0x38, 0x33,
+ 0x21, 0xad, 0xa5, 0x54, 0xca, 0xa7, 0x29, 0xfc,
+ 0x5a, 0x47, 0x69, 0x7d, 0xc5, 0x95, 0xb5, 0xf4,
+ 0x0b, 0x90, 0xa3, 0x81, 0x6d, 0x25, 0x55, 0x35,
+ 0xf5, 0x75, 0x74, 0x0a, 0x26, 0xbf, 0x19, 0x5c,
+ 0x1a, 0xc6, 0xff, 0x99, 0x5d, 0x84, 0xaa, 0x66,
+ 0x3e, 0xaf, 0x78, 0xb3, 0x20, 0x43, 0xc1, 0xed,
+ 0x24, 0xea, 0xe6, 0x3f, 0x18, 0xf3, 0xa0, 0x42,
+ 0x57, 0x08, 0x53, 0x60, 0xc3, 0xc0, 0x83, 0x40,
+ 0x82, 0xd7, 0x09, 0xbd, 0x44, 0x2a, 0x67, 0xa8,
+ 0x93, 0xe0, 0xc2, 0x56, 0x9f, 0xd9, 0xdd, 0x85,
+ 0x15, 0xb4, 0x8a, 0x27, 0x28, 0x92, 0x76, 0xde,
+ 0xef, 0xf8, 0xb2, 0xb7, 0xc9, 0x3d, 0x45, 0x94,
+ 0x4b, 0x11, 0x0d, 0x65, 0xd5, 0x34, 0x8b, 0x91,
+ 0x0c, 0xfa, 0x87, 0xe9, 0x7c, 0x5b, 0xb1, 0x4d,
+ 0xe5, 0xd4, 0xcb, 0x10, 0xa2, 0x17, 0x89, 0xbc,
+ 0xdb, 0xb0, 0xe2, 0x97, 0x88, 0x52, 0xf7, 0x48,
+ 0xd3, 0x61, 0x2c, 0x3a, 0x2b, 0xd1, 0x8c, 0xfb,
+ 0xf1, 0xcd, 0xe4, 0x6a, 0xe7, 0xa9, 0xfd, 0xc4,
+ 0x37, 0xc8, 0xd2, 0xf6, 0xdf, 0x58, 0x72, 0x4e,
+ }
+
+ if bytes.Compare(buf[:], reference) != 0 {
+ t.Errorf("Buffers differ:\n\n %v\n\n %v", buf, reference)
+ }
+
+}
+
+func TestTran(t *testing.T) {
+
+ reference := []byte{
+ 0x39, 0x7d, 0x56, 0xb1, 0x7b, 0x24, 0x3d, 0x91,
+ 0x5e, 0x28, 0x30, 0xe1, 0x2b, 0x3e, 0xa0, 0x31,
+ 0x30, 0x7f, 0x1f, 0x4e, 0xd9, 0xaa, 0xcb, 0x61,
+ 0xcd, 0x21, 0xba, 0x99, 0xcf, 0xda, 0xe3, 0xbc,
+ }
+
+ l := 231
+ for i := 0; i < 32; i++ {
+ l = ((21 * l) + 221) % 256
+ if tran3(byte(i), byte(l), byte((l+i)%256), 4) != reference[i] {
+ t.Errorf("Expected %d at position %d, generated %d\n", reference[i], i, tran3(byte(i), 2, 3, 4))
+ }
+ }
+}
+
+func TestWrite(t *testing.T) {
+ for j, test := range testCases {
+ source := io.LimitReader(newRandReader(test.seed), test.length)
+
+ sim := new(Writer)
+ io.Copy(sim, source)
+
+ for i := 0; i < 256; i++ {
+ if sim.buckets[i] != test.expected[i] {
+ t.Errorf("Expected %d in test %d at position %d, was %d",
+ test.expected[i],
+ j,
+ i,
+ sim.buckets[i])
+ }
+ }
+
+ actualCode := sim.Code().String()
+ if actualCode != test.expectedCode {
+ t.Errorf("expected code %s, was %s\n", test.expectedCode, actualCode)
+ }
+ }
+}
+
+func TestWriteSmallChunks(t *testing.T) {
+ testCase := testCases[1]
+
+ sim := new(Writer)
+ for writeSize := 1; writeSize < 2*sim.BlockSize(); writeSize = (writeSize * 3) - 1 {
+ source := io.LimitReader(newRandReader(testCase.seed), testCase.length)
+ sim.Reset()
+ buf := make([]byte, writeSize)
+ written := int64(0)
+ for written < int64(testCase.length) {
+ length, _ := source.Read(buf)
+ sim.Write(buf[:length])
+ written += int64(length)
+ }
+
+ for i := 0; i < 256; i++ {
+ if sim.buckets[i] != testCase.expected[i] {
+ t.Errorf("Expected %d at position %d when writing %d-byte blocks, was %d",
+ testCase.expected[i],
+ i,
+ writeSize,
+ sim.buckets[i])
+ break
+ }
+ }
+ }
+}
+
+func TestHammingDistance(t *testing.T) {
+ bitvectors := []struct {
+ a, b string
+ expected int
+ }{
+ {
+ "0000000000000000000000000000000000000000000000000000000000000000",
+ "0000000000000000000000000000000000000000000000000000000000000000",
+ 0,
+ },
+ {
+ "1111111111111111111111111111111111111111111111111111111111111111",
+ "0000000000000000000000000000000000000000000000000000000000000000",
+ 64,
+ },
+ {
+ "A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5",
+ "5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A5A",
+ 256,
+ },
+ }
+
+ for _, bitvector := range bitvectors {
+ var abits, bbits [32]byte
+ for i := 0; i < len(abits); i++ {
+ fmt.Sscanf(bitvector.a[2*i:], "%02x", &abits[i])
+ fmt.Sscanf(bitvector.b[2*i:], "%02x", &bbits[i])
+ }
+
+ distance := hammingDistance(abits, bbits)
+
+ if distance != bitvector.expected {
+ t.Errorf("Expected %d, was %d\n", bitvector.expected, distance)
+ }
+ }
+}
+
+func TestScan(t *testing.T) {
+ for _, testCase := range testCases {
+ var c Code
+ fmt.Sscanf(testCase.expectedCode, "%x", &c)
+ if c.String() != testCase.expectedCode {
+ t.Errorf("expected %s, was %s\n", c.String(), testCase.expectedCode)
+ }
+ }
+}
+
+// func TestAgainstC(t *testing.T) {
+// for j, test := range testCases {
+// c_source := io.LimitReader(newRandReader(test.seed), test.length)
+// go_source := io.LimitReader(newRandReader(test.seed), test.length)
+
+// var buffer bytes.Buffer
+// io.Copy(&buffer, c_source)
+
+// c_buckets := cmeclax.Accumulate(buffer.Bytes()).Buckets()
+// sim := new(Writer)
+// io.Copy(sim, go_source)
+
+// for i := 0; i < 256; i++ {
+// if sim.buckets[i] != int64(c_buckets[i]) {
+// t.Errorf("Expected %d in test %d at position %d, was %d",
+// c_buckets[i],
+// j,
+// i,
+// sim.buckets[i])
+// }
+// }
+// }
+// }
+
+// func BenchmarkBlockGo(b *testing.B) {
+// for n := 0; n < b.N; n++ {
+// tests := []struct {
+// seed, length int64
+// }{
+// {91709, 3298000},
+// }
+
+// for _, test := range tests {
+// go_source := io.LimitReader(newRandReader(test.seed), test.length)
+// // buf := make([]byte, test.length)
+// // go_source.Read(buf)
+// sim := new(Writer)
+// io.Copy(sim, go_source)
+// // block([]byte{}, buf)
+// }
+// }
+
+// }
+
+// func BenchmarkBlockC(b *testing.B) {
+// for n := 0; n < b.N; n++ {
+// tests := []struct {
+// seed, length int64
+// }{
+// {91709, 3298000},
+// }
+
+// for _, test := range tests {
+// c_source := io.LimitReader(newRandReader(test.seed), test.length)
+// var buffer bytes.Buffer
+// io.Copy(&buffer, c_source)
+// cmeclax.Accumulate(buffer.Bytes())
+// }
+// }
+// }
diff --git a/randreader_test.go b/randreader_test.go
new file mode 100644
index 0000000..b9c9b70
--- /dev/null
+++ b/randreader_test.go
@@ -0,0 +1,85 @@
+package nilsimsa
+
+import (
+ "bytes"
+ "encoding/binary"
+ "math/rand"
+ "testing"
+)
+
+type randReader struct {
+ rand *rand.Rand
+ tail []byte
+}
+
+func newRandReader(seed int64) (r *randReader) {
+ r = new(randReader)
+ r.rand = rand.New(rand.NewSource(seed))
+ return
+}
+
+func (r *randReader) Read(p []byte) (n int, err error) {
+ i := 0
+ if len(r.tail) > 0 {
+ for i < len(p) && i < len(r.tail) {
+ p[i] = r.tail[i]
+ i++
+ }
+ r.tail = r.tail[i:]
+ }
+ for i+4 < len(p) {
+ binary.BigEndian.PutUint32(p[i:], r.rand.Uint32())
+ i += 4
+ }
+ if i < len(p) {
+ var tail [4]byte
+ binary.BigEndian.PutUint32(tail[:], r.rand.Uint32())
+ j := i
+ for i < len(p) {
+ p[i] = tail[i-j]
+ i++
+ }
+ r.tail = tail[i-j:]
+ }
+ return i, nil
+}
+
+func TestRandReader(t *testing.T) {
+ expected := []byte{
+ 217, 70, 104, 150, 165, 37, 204, 191, 188, 251, 50, 224,
+ 80, 178, 100, 184, 14, 151, 174, 178, 247, 157, 93, 39,
+ 157, 44, 25, 189, 3, 218, 247, 85, 34, 52, 230, 106,
+ 131, 7, 107, 76, 10, 97, 203, 169, 90, 180, 219, 210,
+ 203, 59, 19, 34, 149, 138, 159, 129, 128, 182, 201, 57,
+ 74, 145, 48, 165, 122, 55, 213, 227, 109, 44, 221, 171,
+ 62, 145, 2, 131, 232, 131, 50, 163, 204, 33, 210, 9,
+ 27, 21, 43, 22, 67, 40, 98, 73, 142, 246, 58, 35,
+ 177, 141, 104, 76, 170, 14, 109, 84, 100, 145, 132, 249,
+ 62, 81, 144, 240, 117, 209, 44, 25, 165, 152, 31, 165,
+ }
+
+ for bufSize := 1; bufSize < 28; bufSize *= 3 {
+ var results bytes.Buffer
+ var source = newRandReader(12345)
+
+ buf := make([]byte, bufSize)
+ written := 0
+ for written < len(expected) {
+ length, _ := source.Read(buf)
+ results.Write(buf[:length])
+ written += length
+ }
+
+ bytes := results.Bytes()
+
+ for i := 0; i < len(expected); i++ {
+ if expected[i] != bytes[i] {
+ t.Errorf("Expected %d, was %d at position %d",
+ expected[i],
+ bytes[i],
+ i)
+ break
+ }
+ }
+ }
+}