From defb1392867512265a9e4d1cc899771618ff0915 Mon Sep 17 00:00:00 2001
From: Oscar Andersson
Date: Thu, 14 Jan 2021 17:24:22 +0100
Subject: [PATCH] Init, move from internal git server at Karlstad University to
 GitHub.

---
 LICENSE                 | 674 +++++++++++++++++++++++++++++++
 README.md               |  55 +++
 bin/queue.sh            |  28 ++
 bin/run.sh              |  21 +
 bin/watch.sh            |  17 +
 exitmaprc               |   7 +
 makefile                |  18 +
 src/aggregator.py       | 267 +++++++++++++
 src/analyzer.py         | 307 ++++++++++++++
 src/cross_validation.py | 859 ++++++++++++++++++++++++++++++++++++++++
 src/timeddns.py         | 168 ++++++++
 src/visualize.py        | 124 ++++++
 theoracle.conf          |  14 +
 13 files changed, 2559 insertions(+)
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 bin/queue.sh
 create mode 100644 bin/run.sh
 create mode 100644 bin/watch.sh
 create mode 100644 exitmaprc
 create mode 100644 makefile
 create mode 100644 src/aggregator.py
 create mode 100644 src/analyzer.py
 create mode 100644 src/cross_validation.py
 create mode 100644 src/timeddns.py
 create mode 100644 src/visualize.py
 create mode 100644 theoracle.conf

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..20d40b6
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.
For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. 
+ + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
+ + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. 
+ + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. 
+ + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. 
If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). 
To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. 
+ + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. 
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/philosophy/why-not-lgpl.html>.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..ad46994
--- /dev/null
+++ b/README.md
@@ -0,0 +1,55 @@
+# The Oracle
+A website oracle operating on Tor exit relays.
+* Author: Oscar Andersson [oscaande.se](https://www.oscaande.se)
+* Organization: Karlstad University [kau.se](https://www.kau.se)
+* Course: Degree project (Examensarbete) DVGC25
+* Term: Autumn 2020 (HT2020)
+* License: GPL-3.0, see the LICENSE file or [gpl-3.0 on gnu.org](https://www.gnu.org/licenses/gpl-3.0.en.html).
+
+## NOTICE
+Do not use this on exit nodes and relays that you do not own!
+
+## Overview
+This is a tool that exploits the DNS cache of Tor exit nodes.
+
+## Requirements
+The makefile assumes a UNIX environment; on Windows, manual building is required.
+This repository requires Python 3, and its dependencies require Python 2.
+The versions of the tools used are noted below (as of 2020-10-28) in case future versions break backwards compatibility.
+### Command line tools
+Install using `pacman -S python python2 pip git tor` on Arch based systems and `apt-get install python python2 python-pip git tor` on Debian based systems.
+* python2 (2.7.18)
+* python3 (3.8.5)
+* pip (20.1.1)
+* git (2.28.0)
+* autoconf (2.69)
+* automake (1.16.2)
+* libtool (2.4.6.42-b88ce-dirty)
+* gcc (10.2.0)
+* tor (0.3.5.12)
+### Python3 packages
+Install using `python3 -m pip install stem pysocks seaborn`.
+* stem
+* pysocks
+* seaborn
+### Python2 packages
+Install using `python2 -m pip install stem`.
+* stem
+### You also need to build these tools
+These tools can be built using the makefile in this repository. More about this in the "Using the tool" chapter below.
+* [exitmap](https://github.com/NullHypothesis/exitmap) (2019.05.30) by [Philipp Winter](https://nymity.ch/)
+* [torsocks](https://git.torproject.org/torsocks.git) (2.3.0) by [The Tor Project](https://torproject.org)
+
+## Using the tool
+### Setup
+* All requirements can simply be acquired by running `make`. Alternatively, install [exitmap](https://github.com/NullHypothesis/exitmap) manually, then place the contents of `src/` in exitmap's modules folder and copy `theoracle.conf.example` to `theoracle.conf` in the same directory.
+* After running `make` or manually installing, configure the program in the `theoracle.conf` file.
+### Reset
+To reset the tool, run `make clean`. You will then have to redo the setup procedure, although note that the configuration file is persistent.
+### Run
+Run the modules with exitmap using `./exitmap/bin/exitmap A --first-hop B --exit C --config-file exitmaprc`, where A is a module, B is the fingerprint of the first-hop relay and C is the fingerprint of the targeted exit relay. The supplied exitmap configuration file is called `exitmaprc` and should be specified using `--config-file exitmaprc`. Read the [exitmap documentation](https://github.com/NullHypothesis/exitmap/blob/master/README.md) or run `./exitmap/bin/exitmap` for more commands, modules and information.

+## This would not have been possible without:
+* [Philipp Winter](https://nymity.ch/) for creating the wonderful tool [exitmap](https://github.com/NullHypothesis/exitmap).
+
+> And don't worry about the vase.
\ No newline at end of file
diff --git a/bin/queue.sh b/bin/queue.sh
new file mode 100644
index 0000000..6dd2733
--- /dev/null
+++ b/bin/queue.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Setup
+if [ -z "$1" ] || [ -z "$2" ]
+  then
+    echo "Script $0 takes two arguments. Provide the period between exitmap runs in hours (1-23) as an integer, and the number of runs as an integer."
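+    # Hypothetical example: "sh bin/queue.sh 6 4" would request one exitmap run
+    # every 6 hours, 4 runs in total (illustrative values, not defaults).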
+ echo "Syntax: sh $0 PERIOD RUNS" + exit 1 + fi + +echo "Period: $1" +echo "Runs: $2" + +# Variables and executables +WORKING_DIRECTORY=$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"/../" +EXITMAP_CMD="( cd $WORKING_DIRECTORY && sh bin/run.sh )" +#CLEAN_CMD="( crontab -l | grep "$WORKING_DIRECTORY" -v | crontab - )" + +# Temporary directory +CRON_TMP="/tmp/"$(tr -dc A-Za-z0-9 $CRON_TMP + +# Set cron jobs +echo $(date --date 'now + 1 minutes' +"%M")' */'$1' * * * '$EXITMAP_CMD >> $CRON_TMP +#echo $(date --date 'now + 5 minutes' +"%M")' */'$(($1*$2))' * * * '$CLEAN_CMD >> $CRON_TMP +eval crontab $CRON_TMP +eval rm -f $CRON_TMP +eval crontab -l \ No newline at end of file diff --git a/bin/run.sh b/bin/run.sh new file mode 100644 index 0000000..aaac6a6 --- /dev/null +++ b/bin/run.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Variables and direcories +ID=$(tr -dc A-Za-z0-9 /dev/null 2>&1 ; pwd -P )"/../" +CONFIG=$WORKING_DIRECTORY"exitmaprc" +CACHE_DIR=$WORKING_DIRECTORY"tor_cache/"$ID +OUTPUT_FILE=$WORKING_DIRECTORY"logs/"$ID".txt" + +touch $OUTPUT_FILE +# Run command and log +echo "Running exitmap with module timeddns. Id: $ID" +echo "$0 Started at $(date) by user $(whoami)." >> $OUTPUT_FILE +echo "Current working directory $WORKING_DIRECTORY" >> $OUTPUT_FILE +eval $WORKING_DIRECTORY"exitmap/bin/exitmap" timeddns -f $CONFIG -t $CACHE_DIR &>> $OUTPUT_FILE +echo "$0 Exitmap done at $(date) by user $(whoami)." >> $OUTPUT_FILE + +# Cleanup +echo "Remving $CACHE_DIR" >> $OUTPUT_FILE +eval rm -rf $CACHE_DIR +echo "$0 Cache cleared, script exiting at $(date) by user $(whoami)." >> $OUTPUT_FILE \ No newline at end of file diff --git a/bin/watch.sh b/bin/watch.sh new file mode 100644 index 0000000..cd140c7 --- /dev/null +++ b/bin/watch.sh @@ -0,0 +1,17 @@ +#!/bin/bash +WORKING_DIRECTORY=$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"/.." +watch " + echo 'Watching the oracle output and exitmap running processes.'; + echo 'Results files:'; + du -hs $WORKING_DIRECTORY'/results'; + echo 'Exit nodes scanned:'; + ls -1q results/** | wc -l; + echo 'Exitmap processes:'; + ps wwuxa | grep 'exitmap' | grep -E 'grep|Watching the oracle output' -v -c; + echo 'Run script processes:'; + ps wwuxa | grep -E 'run.sh' | grep 'grep' -v -c; + echo 'Memory:'; + free -ht; + echo 'CPU usage top 5:'; + ps -eo pcpu,pid,user,args | sort -k 1 -r | head -5; + " \ No newline at end of file diff --git a/exitmaprc b/exitmaprc new file mode 100644 index 0000000..db0fc25 --- /dev/null +++ b/exitmaprc @@ -0,0 +1,7 @@ +[Defaults] +verbosity = info +build_delay = 2 +delay-noise = 2 +analysis_dir = exitmap_scans +first_hop = EDAF30C58D6CCF359EA062C668C7180A17076440 +country = \ No newline at end of file diff --git a/makefile b/makefile new file mode 100644 index 0000000..afac8b5 --- /dev/null +++ b/makefile @@ -0,0 +1,18 @@ +make: + [ -d "exitmap" ] || git clone git@github.com:NullHypothesis/exitmap.git + [ -d "exitmap" ] && python2 -m pip install -r exitmap/requirements.txt + [ -d "exitmap" ] && [ -f "exitmap/src/modules/timeddns.py" ] || ln src/timeddns.py exitmap/src/modules/timeddns.py + [ -d "exitmap" ] && [ -f "exitmap/src/modules/theoracle.conf" ] || ln theoracle.conf exitmap/src/modules/theoracle.conf + [ -d "logs" ] || mkdir logs + [ -d "results" ] || mkdir results + [ -d "tor_cache" ] || mkdir tor_cache + python3 -m pip install stem pysocks seaborn + python2 -m pip install stem + +clean: + @echo "THIS WILL REMOVE ALL RESULTS, LOGS AND CACHE IN 5 SECONDS! Press Ctrl + C to cancel." 
+	sleep 5
+	rm -rf exitmap_scans/*
+	rm -rf tor_cache/*
+	rm -f logs/*
+	rm -rf results/*
\ No newline at end of file
diff --git a/src/aggregator.py b/src/aggregator.py
new file mode 100644
index 0000000..f212857
--- /dev/null
+++ b/src/aggregator.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+# This is an aggregator of the results generated by theoracle.py and timeddns.py.
+# This program is standalone and does not require the files mentioned above.
+
+import sys
+import os
+import logging
+import csv
+import time
+import collections
+import multiprocessing
+
+#
+# LOGGER
+#
+
+log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" # From exitmap by Philipp Winter
+logging.basicConfig(format=log_format, level=logging.DEBUG)
+log = logging.getLogger("aggregator")
+
+#
+# LOAD AND SAVE DATA
+#
+
+def read_csv(path):
+    data = []
+    with open(path, mode='r') as csv_file:
+        for line in csv.DictReader(csv_file):
+            data.append(line)
+    return data
+
+def load_data_from_result_files(files):
+    log.info("Found " + str(len(files)) + " csv files. Loading datapoints, please wait...")
+    data = []
+    for f in files:
+        data.extend(read_csv(f))
+    return data
+
+def find_result_files(path):
+    files = []
+    for path in [f.path for f in os.scandir(path) if f.is_dir()]:
+        files.extend([f.path for f in os.scandir(path) if f.is_file()])
+    return files
+
+def write_data(data, filename):
+    with open(str(filename + ".csv"), mode='w') as F:
+        csv_writer = csv.DictWriter(F, fieldnames=['directory', 'fingerprint', 'domain', 'time', 'timestamp', 'cached'])
+        csv_writer.writeheader()
+        csv_writer = csv.writer(F)
+        for row in data:
+            csv_writer.writerow([
+                row['directory'],
+                row['fingerprint'],
+                row['domain'],
+                row['time'],
+                row['timestamp'],
+                row['cached']
+            ])
+
+#
+# MULTIPROCESSING
+#
+
+def mp_get_processors():
+    return int(multiprocessing.cpu_count())
+
+def mp_aggregator(data_by_fingerprint):
+    log.debug("Multiprocessing aggregator got " + str(len(data_by_fingerprint)) + " rows of data.")
+
+    return_dict = multiprocessing.Manager().dict()
+
+    # Setup processes
+    processes_waiting_list = list()
+    t = time.time()
+
+    try:
+        for dbf in sorted(data_by_fingerprint, key=lambda d: len(d), reverse=False): # Sorted ascending by size; pop() below therefore starts the largest workloads first
+            #log.debug(str(len(dbf)))
+            processes_waiting_list.append(multiprocessing.Process(target=agg_main, args=(list(dbf), len(processes_waiting_list), return_dict)))
+        log.debug("Added " + str(len(processes_waiting_list)) + " processes to waiting list.")
+    except Exception as ex:
+        log.error("Setup processes error " + str(ex))
+        exit(-1)
+
+    processes_running_list = list()
+    # While either list is not empty
+    while len(processes_waiting_list) > 0 or len(processes_running_list) > 0:
+        while len(processes_running_list) < mp_get_processors() and len(processes_waiting_list) > 0:
+            # The running list is not full and the waiting list is not empty:
+            # add more processes.
+            processes_running_list.append(processes_waiting_list.pop())
+            processes_running_list[len(processes_running_list) - 1].start() # Start the last process in the list
+
+        while len(processes_running_list) > 0:
+            # The running list is full or the waiting list is empty;
+            # the running list cannot be empty here.
+            # Wait for processes to join.
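+            # Note: this inner loop joins every running worker before the outer
+            # loop refills the pool, so the work proceeds in batches of up to
+            # mp_get_processors() processes.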
+            try:
+                p = processes_running_list.pop()
+                p.join()
+            except Exception as ex:
+                log.error("\u001b[31mCould not join process, error " + str(ex) + "\u001b[0m")
+            else:
+                log.debug("Process joined the main process.")
+            # Stats printout
+            if time.time() - t > 10:
+                t = time.time()
+                log.info("Aggregator running, " + str(len(processes_running_list) + len(processes_waiting_list)) + " processes left.")
+
+    # Handle output
+    output = []
+    [output.extend(value) for value in return_dict.values()]
+    log.debug("Multiprocessing aggregator returning " + str(len(output)) + " rows of data.")
+    return output
+
+#
+# AGGREGATOR
+#
+
+# Reformats the dataset according to the new format with more information
+def agg_format(datapoint, cached):
+    a = {}
+    a["directory"] = datapoint["directory"]
+    a["fingerprint"] = datapoint["fingerprint"]
+    a["domain"] = datapoint["domain"]
+    a["time"] = datapoint["time"]
+    a["timestamp"] = datapoint["timestamp"]
+    a["cached"] = str(cached)
+    return a
+
+# Is the datapoint in the data set cached?
+# Returns True or False
+def agg_is_cached(data, datapoint):
+    # The domain is not unique, i.e. it was resolved at least twice within one measurement
+    return len([x for x in data if x["domain"] == datapoint["domain"]]) > 1
+
+# Is the datapoint the last of its domain in the data set?
+# Presumes that the datapoint is not unique in the data set.
+# Returns True or False
+def agg_is_last(data, is_this_datapoint_last):
+    for current_datapoint in data:
+        # Find domain
+        if is_this_datapoint_last["domain"] == current_datapoint["domain"]:
+            # Compare timestamps
+            if float(is_this_datapoint_last["timestamp"]) > float(current_datapoint["timestamp"]):
+                return True
+    return False
+    # log.error("Aggregator is last operation performed on a unique datapoint, exiting.")
+    # exit(-1)
+
+# Main aggregator function
+# Capable of multiprocessing
+# Returns the format seen in agg_format(...)
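+#
+# Sketch of the selection rule, per directory (one exitmap run):
+#   - a domain resolved once is kept and marked cached=False;
+#   - a domain resolved twice was primed by its first lookup, so only the
+#     later resolve is kept and marked cached=True.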
+def agg_main(main_data, procnum=0, return_dict=None):
+    if return_dict is None:
+        return_dict = {} # Avoid a shared mutable default argument; a plain dict works for single-process use
+    log.info("\u001b[36mAggregator #" + str(procnum) + " starting with " + str(len(main_data)) + " rows of data, please wait...\u001b[0m")
+    output = [] # Where the resulting output is put, then returned when the function is finished
+    t0 = time.time() # Total running time
+
+    for data_by_directory in agg_divide_data_by_directory(main_data): # Divide by directory and go through
+        for datapoint in data_by_directory: # Go through every datapoint in the directory
+            is_cached = agg_is_cached(data_by_directory, datapoint)
+            if (not is_cached) or (is_cached and agg_is_last(data_by_directory, datapoint)):
+                output.append(agg_format(datapoint, is_cached))
+
+    log.info("\u001b[32mAggregator #" + str(procnum) + " finished in " + str(float(time.time() - t0))[0:5] + " seconds, returning " + str(len(output)) + " rows of data.\u001b[0m")
+    return_dict[procnum] = output # Return the resulting data to the main process
+    return True
+
+# Divides the data by exit node (fingerprint)
+# Returns a list of lists
+def agg_divide_data_by_fingerprint(data):
+    fingerprints = set([x["fingerprint"] for x in data])
+    data_by_fingerprint = []
+    for fingerprint in fingerprints:
+        data_by_fingerprint.append([x for x in data if x["fingerprint"] == fingerprint])
+    return data_by_fingerprint
+
+# Divides the data by directory
+# Returns a list of lists
+def agg_divide_data_by_directory(data):
+    directories = set([x["directory"] for x in data])
+    data_by_directory = []
+    for directory in directories:
+        data_by_directory.append([x for x in data if x["directory"] == directory])
+    return data_by_directory
+
+# Removes all datapoints where the keyword occurs in the domain
+def agg_remove_blacklist(data, keyword):
+    log.debug("Will remove rows by keyword " + str(keyword) + " from " + str(len(data)) + " rows of data.")
+    return [x for x in data if not str(keyword) in str(x["domain"])]
+
+#
+# BOOTSTRAP
+#
+
+if __name__ == "__main__":
+    try:
+        # Check input directory
+        input_directory = sys.argv[1]
+        log.debug("Input directory: " + str(input_directory))
+        if not os.path.isdir(input_directory):
+            log.error("Input directory not valid.")
+            exit(-1)
+        # Check blacklist keyword
+        blacklist_keyword = sys.argv[2]
+        log.debug("Domain blacklist keyword: " + str(blacklist_keyword))
+    except Exception as ex:
+        log.error("Command line argument error " + str(ex))
+        log.info("Syntax: " + str(sys.argv[0]) + " input-directory blacklisted-domain")
+        log.info("The input directory contains data generated by theoracle.py; the blacklisted domain removes all domains containing that string from the data set provided as input.")
+        exit(-1)
+    else:
+        try:
+            # Load data
+            t = time.time()
+            log.info("Loading data, please wait...")
+            data = load_data_from_result_files(find_result_files(input_directory))
+            log.info("Loaded " + str(len(data)) + " rows of data in " + str(float(time.time() - t))[0:5] + " seconds from " + str(input_directory) + " directory.")
+        except Exception as ex:
+            log.error("Load data from CSV error " + str(ex))
+
+        try:
+            # Remove blacklisted data
+            t = time.time()
+            log.info("Removing data according to blacklist, please wait...")
+            data = agg_remove_blacklist(data, blacklist_keyword)
+            log.info("Removed data according to blacklisted keyword " + str(blacklist_keyword) + " in " + str(float(time.time() - t))[0:5] + " seconds, resulting in " + str(len(data)) + " rows of data left.")
+        except Exception as ex:
+            log.error("Blacklist operation error " + str(ex))
+
+        try:
+            # Statistics
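+            # The ~ estimate below assumes roughly two thirds of the rows come in
+            # cache pairs; one row of each pair is dropped by the aggregator,
+            # leaving about len(data) - len(data) / 3 rows.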
log.info("Data containing " + str(len(set([x["directory"] for x in data]))) + " directories.") + log.info("Aggregator should result in ~" + str(int(len(data) - len(data) / 3)) + " rows of data.") + except Exception as ex: + log.error("Statistics error " + str(ex)) + + # Run main aggregator + t = time.time() + log.info("Aggregating, please wait...") + data = mp_aggreator(agg_devide_data_by_fingerprint(data)) + log.info("Aggregator done in " + str(float(time.time() - t))[0:5] + " seconds.") + + # Sorting data set + try: + data_tmp = sorted(data, key=lambda d: float(d["timestamp"]), reverse=False) + except Exception as ex: + log.error("Data soring error " + str(ex)) + log.info("Data sorting failed, will save unsorted data instead") + else: + log.debug(str(len(data_tmp)) + " rows of data sorted by timestamp sucessfully.") + data = data_tmp + + # Saving to disk + t = time.time() + try: + # Write data to disk + write_data(data, "theoracle_aggregated_" + str(time.time())) + except Exception as ex: + log.error("Write data error " + str(ex)) + else: + log.info("Wrote data to disk in " + str(float(time.time() - t))[0:5] + " seconds.") + + finally: + # Exit + log.info(str(sys.argv[0]) + " exited normally.") + exit(0) \ No newline at end of file diff --git a/src/analyzer.py b/src/analyzer.py new file mode 100644 index 0000000..e07ddcf --- /dev/null +++ b/src/analyzer.py @@ -0,0 +1,307 @@ +#!/usr/bin/env python3 +# This is a analyzer of the results generated by theoracle.py and timeddns.py +# This program i standalone and does not require the above mentioned files + +import sys +import os +import logging +import csv +import matplotlib.pyplot as plt +import time +import collections +import multiprocessing + +# +# LOGGER +# + +log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" # From exitmap by Philipp Winter +logging.basicConfig(format=log_format, level=logging.DEBUG) +log = logging.getLogger("analyzer") + +# +# LOAD DATA FROM SYSTEM +# + +def read_csv(path): + data = [] + with open(path, mode='r') as csv_file: + for line in csv.DictReader(csv_file): + data.append(line) + return data + +def load_data_from_result_files(files): + log.info("Found " + str(len(files)) + " csv files. 
+    data = []
+    for f in files:
+        data.extend(read_csv(f))
+    return data
+
+def find_result_files(path):
+    files = []
+    for path in [f.path for f in os.scandir(path) if f.is_dir()]:
+        files.extend([f.path for f in os.scandir(path) if f.is_file()])
+    return files
+
+#
+# STATS
+#
+
+def datapoints_count(data):
+    return int(len(data))
+
+def datapoints_successful(data):
+    return [x for x in data if float(x["time"]) > 0]
+
+def datapoints_successful_count(data):
+    return int(len(datapoints_successful(data)))
+
+def datapoints_failed(data):
+    return [x for x in data if float(x["time"]) <= 0]
+
+def datapoints_failed_count(data):
+    return int(len(datapoints_failed(data)))
+
+def directories(data):
+    return list(set([x["directory"] for x in data]))
+
+def directories_count(data):
+    return int(len(directories(data)))
+
+def exits(data):
+    return list(set([x["fingerprint"] for x in data]))
+
+def exits_count(data):
+    return int(len(exits(data)))
+
+#
+# LIST
+#
+
+def is_domain_unique(data, domain):
+    return is_unique([x["domain"] for x in data], domain)
+
+def is_unique(data, element):
+    return (len(matches(data, element)) == 1)
+
+def matches(data, element): # Named "matches" to avoid shadowing the builtin filter()
+    return [x for x in data if x == element]
+
+#
+# ANALYZE
+#
+
+# Reformats the dataset according to the new format with more information
+def analyze_format_dataset(datapoint, cached):
+    a = {}
+    a["directory"] = datapoint["directory"]
+    a["fingerprint"] = datapoint["fingerprint"]
+    a["domain"] = datapoint["domain"]
+    a["time"] = datapoint["time"]
+    a["timestamp"] = datapoint["timestamp"]
+    a["cached"] = str(cached)
+    return a
+
+# Selects/filters datapoints by directory
+def data_by_directory(data, directory):
+    return [x for x in data if x["directory"] == directory]
+
+# Selects/filters datapoints by multiple directories
+def data_by_directories(data, directories):
+    output = []
+    [output.extend(data_by_directory(data, d)) for d in directories]
+    return output
+
+# Provide a list dataset and a single datapoint.
+# Returns True if the datapoint is the last datapoint of that domain in the dataset,
+# else returns False.
+def is_last(dataset, datapoint_is_last):
+    for d in data_by_directory(dataset, datapoint_is_last["directory"]): # Filter by directory, separate scans should not interfere
+        if (d["domain"] == datapoint_is_last["domain"]): # Filter by domain
+            if (float(datapoint_is_last["timestamp"]) > float(d["timestamp"])): # Whether the datapoint came after another
+                return True
+    return False
+
+# Formats the dataset according to analyze_format_dataset() and removes cache-priming resolves.
+def analyze_select(data, procnum, return_dict):
+    output = []
+    l = str(len(data))
+    i = 0
+    for d in data:
+        # Process dataset
+        dbd = data_by_directory(data, d["directory"])
+        if is_domain_unique(dbd, d["domain"]):
+            # NON CACHED = Domain is unique
+            #
+            # Selects all non cached resolves from a greater list; appends the "cached" attribute as False to the result.
+            # Non cached are those which, in the scope of a directory and a fingerprint, are only resolved once.
+            output.append(analyze_format_dataset(d, False))
+            #log.debug("Found non cached." + str(d["domain"]) + " " + str(d["time"]))
+        elif is_last(dbd, d):
+            # PRE CACHED = Domain is not unique and is last in its directory
+            #
+            # Selects all pre cached resolves from a greater list; appends the "cached" attribute as True to the result.
+            # Cached are those results which, in the scope of a directory and a fingerprint, resolve the same domain twice.
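+            # (The second lookup should be answered from the exit relay's DNS
+            # cache; that timing difference is the signal the oracle measures.)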
+            # Out of the two resolves, only the second is selected since the first is only for caching in the exit node.
+            output.append(analyze_format_dataset(d, True))
+            #log.debug("Found last pre cached." + str(d["domain"]) + " " + str(d["time"]))
+        #else:
+            # This will trigger when the first resolve of a pre cache resolve is found.
+            #log.debug("Found nothing " + str(d["domain"]) + " " + str(d["time"]))
+
+        # Debug
+        i += 1
+        if (i % 100 == 0):
+            log.debug("Analyze pre/non cached " + str(i) + " out of " + l + " done in process num #" + str(procnum) + ".")
+    return_dict[procnum] = output
+    return True
+
+def analyze_multithread(data):
+    dirs = directories(data) # Directories
+    procs = [] # Processes
+    processes = int(multiprocessing.cpu_count() - 2) # One cpu left for system reserve and one for leftover data due to rounding error
+    if (processes > len(dirs)):
+        processes = len(dirs)
+    log.debug("Number of processes to use " + str(processes) + " ( + 1 for picking up slack).")
+
+    dirs_size_per_process = int(len(dirs) / processes)
+    log.debug("Directory size for each process " + str(dirs_size_per_process) + " out of " + str(len(dirs)) + " total directories")
+
+    # Multi process manager
+    return_dict = multiprocessing.Manager().dict()
+
+    process_data_list = []
+    for _ in range(processes):
+        log.debug("Adding data from " + str(len(dirs)) + " directories.")
+        for _ in range(dirs_size_per_process):
+            process_data_list.append(data_by_directory(data, dirs.pop()))
+    # Pick up slack
+    if len(dirs) == 0:
+        log.debug("No slack data left.")
+    else:
+        log.debug("Adding data from " + str(len(dirs)) + " directories (to slack process).")
+        # All leftover directories go to a single slack process.
+        process_data_list.append(data_by_directories(data, dirs))
+
+    # Allocate data to processes
+    for pd in process_data_list:
+        procs.append(multiprocessing.Process(target=analyze_select, args=(list(pd), len(procs), return_dict)))
+        log.debug("Process added with " + str(len(pd)) + " lines of data from " + str(directories_count(pd)) + " directories.")
+
+    # Start processes
+    for proc in procs:
+        proc.start()
+
+    # Join processes
+    output = []
+    for proc in procs:
+        proc.join()
+    # Combine process output
+    for value in return_dict.values():
+        log.debug("Process returned " + str(len(value)) + " values.")
+        output.extend(value)
+    log.info("Output currently has " + str(len(output)) + " values, " + str(len([x for x in output if str(x["cached"]) == str("True")])) + " cached and " + str(len([x for x in output if str(x["cached"]) == str("False")])) + " non cached.")
+    return output
+
+def write_data(data, filename):
+    with open(str(filename + ".csv"), mode='w') as F:
+        csv_writer = csv.DictWriter(F, fieldnames=['directory', 'fingerprint', 'domain', 'time', 'timestamp', 'cached'])
+        csv_writer.writeheader()
+        csv_writer = csv.writer(F)
+        for row in data:
+            csv_writer.writerow([
+                row['directory'],
+                row['fingerprint'],
+                row['domain'],
+                row['time'],
+                row['timestamp'],
+                row['cached']
+            ])
+
+def analyze(data):
+    log.info("Datapoints\t\t\t" + str(datapoints_count(data)))
+    log.info("Successful resolves\t\t" + str(datapoints_successful_count(data)))
+    log.info("Failed resolves\t\t\t" + str(datapoints_failed_count(data)))
+    log.info("Directories (exitmap runs)\t" + str(directories_count(data)))
+    log.info("Fingerprints (exits scanned)\t" + str(exits_count(data)))
+
+    # Aggregate and analyze data
+    t = time.time()
+    agg_data = analyze_multithread(data)
+    log.debug("Aggregated and analyzed in " + str(float(time.time() - t)) + " seconds.")
+
+    # Validate data
+    # If a domain in the original dataset does not exist in the aggregated data, log a warning.
+    log.info("Validating data, please wait...")
+    t = time.time()
+    agg_domains = set([x["domain"] for x in agg_data]) # Precomputed for fast membership tests
+    i = 0
+    for d in data:
+        if i % 1000 == 0:
+            log.debug("At " + str(i))
+        if not d["domain"] in agg_domains:
+            log.warning("Original domain " + str(d["domain"]) + " does not exist in aggregated data!")
+        i += 1
+    log.debug("Validated aggregated dataset in " + str(float(time.time() - t)) + " seconds.")
+
+    return agg_data
+
+#
+# BOOTSTRAP ANALYZER
+#
+
+def remove_blacklisted(data, keyword):
+    return [x for x in data if not str(keyword) in str(x)]
+
+if __name__ == "__main__":
+    # Check input directory
+    try:
+        input_directory = sys.argv[1]
+        log.debug("Input directory: " + str(input_directory))
+        if not os.path.isdir(input_directory):
+            log.error("Input directory not valid.")
+            exit(-1)
+    except Exception as ex:
+        log.error("Exception input directory " + str(ex))
+        log.error("Supply an input directory with results from exitmap. syntax: python program input-directory keyword output-directory")
+        exit(-1)
+
+    # Check blacklist keyword
+    try:
+        blacklist_keyword = sys.argv[2]
+        log.debug("Domain blacklist keyword: " + str(blacklist_keyword))
+    except Exception as ex:
+        log.error("Exception input " + str(ex))
+        log.error("Supply a keyword for blacklisting of domains, e.g. the keyword oscar will remove oscaradawdada.com. syntax: python program input-directory keyword output-directory")
+        exit(-1)
+
+    # Check output directory
+    try:
+        output_directory = sys.argv[3]
+        log.debug("Output directory: " + str(output_directory))
+        if not os.path.isdir(output_directory):
+            log.error("Output directory not valid.")
+            exit(-1)
+    except Exception as ex:
+        log.error("Exception output directory " + str(ex))
+        log.error("Supply a directory for the output file. 
syntax: python program input-directory keyword output-directory") + exit(-1) + + # Cleanup data + t = time.time() + data = load_data_from_result_files(find_result_files(input_directory)) + l = int(len(data)) + log.debug("Loaded " + str(l) + " datapoints in " + str(float(time.time() - t)) + " seconds.") + data = remove_blacklisted(data, blacklist_keyword) + log.debug("Removed according to blacklist keyword " + str(blacklist_keyword) + " " + str(len(data)) + " datapoints remaining, " + str(int(l - len(data))) + " removed.") + t = time.time() + + # Analyze and write dataset + data = analyze(data) + log.info("Writing data to disk, please wait...") + write_data(data, str(output_directory + "/theoracle_" + str(time.time()))) + log.debug("Analyzed data in " + str(float(time.time() - t)) + " seconds.") + + # Program quit successfully + log.info("Analyzer done.") + exit(0) diff --git a/src/cross_validation.py b/src/cross_validation.py new file mode 100644 index 0000000..f21eafa --- /dev/null +++ b/src/cross_validation.py @@ -0,0 +1,859 @@ +#!/usr/bin/env python3 +# This will run k-fold tests on the results generated by analyzer.py +# Provide a .csv file with test data + +import os +import sys +import logging +import csv +import time +import multiprocessing +import math +import statistics +import random + +# +# LOGGER +# + +log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" # From exitmap by Philipp Winter +logging.basicConfig(format=log_format, level=logging.DEBUG) +log = logging.getLogger("analyzer") + +# +# LOAD FROM CSV +# + +def read_csv(path): + data = [] + with open(path, mode='r') as csv_file: + for line in csv.DictReader(csv_file): + data.append(line) + return data + +def write_data(data, filename): + with open(str(filename + ".csv"), mode='w') as F: + csv_writer = csv.DictWriter(F, fieldnames=[ + 'method', + 'C', + 'TPR', + 'FPR', + 'TNR', + 'PPV', + 'NPV', + 'FNR', + 'FDR', + 'FOR', + 'PT', + 'TS', + 'ACC', + 'BA', + 'F1', + 'MCC', + 'FM', + 'BM', + 'MK' + ]) + csv_writer.writeheader() + csv_writer = csv.writer(F) + for row in data: + csv_writer.writerow([ + row['method'], + row['C'], + row['TPR'], + row['FPR'], + row['TNR'], + row['PPV'], + row['NPV'], + row['FNR'], + row['FDR'], + row['FOR'], + row['PT'], + row['TS'], + row['ACC'], + row['BA'], + row['F1'], + row['MCC'], + row['FM'], + row['BM'], + row['MK'], + ]) + +# +# RESULT LIST OPERATIOSN +# + +def get_directories(data): + return set([x["directory"] for x in data]) + +def get_data_by_directories(data, directories=list()): + t = time.time() + if len(directories) == 0: + directories = get_directories(data) + output = [] + # N^2 TODO improve + for directory in directories: + output.append([x for x in data if x["directory"] == directory]) + if time.time() - t > 10: + t = time.time() + log.debug("Divide data by directories. " + str(len(directories) - len(output)) + " left.") + return output + +def get_fingerprints(data): + return set([x["fingerprint"] for x in data]) + +def get_data_by_fingerprints(data, fingerprints=list()): + t = time.time() + if len(fingerprints) == 0: + fingerprints = get_fingerprints(data) + output = [] + # N^2 TODO improve + for fingerprint in fingerprints: + output.append([x for x in data if x["fingerprint"] == fingerprint]) + if time.time() - t > 10: + t = time.time() + log.debug("Divide data by fingerprints. 
" + str(len(fingerprints) - len(output)) + " left.") + return output + +def get_methods(data): + return set([x["method"] for x in data]) + +def get_data_by_method(data, methods=list()): + t = time.time() + if len(methods) == 0: + methods = get_methods(data) + output = [] + # N^2 TODO improve + for method in methods: + output.append([x for x in data if x["method"] == method]) + if time.time() - t > 10: + t = time.time() + log.debug("Divide data by method. " + str(len(methods) - len(output)) + " left.") + return output + +# +# STATISTICS +# Sensitivity and specificity calculations +# https://en.wikipedia.org/wiki/Sensitivity_and_specificity +# + +# sensitivity, recall, hit rate, or true positive rate (TPR) +def S_TPR(TP, P): + return float(TP / P) + +# specificity, selectivity or true negative rate (TNR) +def S_TNR(TN, N): + if N == 0: + return -1 + return float(TN / N) + +# precision or positive predictive value (PPV) +def S_PPV(TP, FP): + if (TP + FP) == 0: + return -1 + return float(TP / (TP + FP)) + +# negative predictive value (NPV) +def S_NPV(TN, FN): + if (TN + FN) == 0: + return -1 + return float(TN / (TN + FN)) + +# miss rate or false negative rate (FNR) +def S_FNR(FN, P): + if P == 0: + return -1 + return float(FN / P) + +# fall-out or false positive rate (FPR) +def S_FPR(FP, N): + return float(FP / N) + +# false discovery rate (FDR) +def S_FDR(FP, TP): + if (FP + TP) == 0: + return -1 + return float(FP / (FP + TP)) + +# false omission rate (FOR) +def S_FOR(FN, TN): + if (FN + TN) == 0: + return -1 + return float(FN / (FN + TN)) + +# Prevalence Threshold (PT) +def S_PT(TN, N, TP, P): + return -1 + # TODO, implement fully + # dt = float(math.sqrt(S_TPR(TP=TP, P=P) * (1 + S_TNR(TN=TN, N=N))) + S_TNR(TN=TN, N=N) - 1) + # ValueError: math domain error + dt = float(math.sqrt(S_TPR(TP=TP, P=P) * (1 + S_TNR(TN=TN, N=N))) + S_TNR(TN=TN, N=N) - 1) + dn = float(S_TPR(TP=TP, P=P) + S_TNR(TN=TN, N=N) - 1) + if dn == 0: + return -1 + return float(dt / dn) + +# Threat score (TS) or critical success index (CSI) +def S_TS(TP, FN, FP): + if (TP + FN + FP) == 0: + return -1 + return float(TP / (TP + FN + FP)) + +# accuracy (ACC) +def S_ACC(TP, TN, P, N): + if (P + N) == 0: + return -1 + return float((TP + TN) / (P + N)) + +# balanced accuracy (BA) +def S_BA(TP, P, TN, N): + return float((S_TPR(TP=TP, P=P) + S_TNR(TN=TN, N=N)) / 2) + +# F1 score +def S_F1(TP, FP, P): + dt = float(S_PPV(TP=TP, FP=FP) * S_TPR(TP=TP, P=P)) + dn = float(S_PPV(TP=TP, FP=FP) + S_TPR(TP=TP, P=P)) + if dn == 0: + return -1 + return float(dt / dn * 2) + +# Matthews correlation coefficient (MCC) +# # TODO https://en.wikipedia.org/wiki/Matthews_correlation_coefficient +def S_MCC(TP, TN, FP, FN): + # dt = float() + p0 = float(TP + FP) + p1 = float(TP + FN) + p2 = float(TN + FP) + p3 = float(TN + FN) + dn = float(math.sqrt(p0 * p1 * p2 * p3)) + # return float(dt / dn) + return float(-1) + +# Fowlkes–Mallows index (FM) +def S_FM(TP, FP, FN): + if (TP + FP) == 0 or (TP + FN) == 0: + return -1 + p0 = float(TP / (TP + FP)) + p1 = float(TP / (TP + FN)) + return float(math.sqrt(p0 * p1)) + +# informedness or bookmaker informedness (BM) +def S_BM(TP, P, TN, N): + return float(S_TPR(TP=TP, P=P) + S_TNR(TN=TN, N=N) - 1) + +# markedness (MK) or deltaP +def S_MK(TP, FP, TN, FN): + return float(S_PPV(TP=TP, FP=FP) + S_NPV(TN=TN, FN=FN) - 1) + +# +# THRESHOLD METHODS +# + +# Using the fastest time ever for a non cached resolve. 
+# Return threshold time as float
+def tm_fastest_non_cached_ever(training_data):
+    return float(min([float(y["time"]) for y in training_data if str(y["cached"]) == "False"]))
+
+# Using the median of the non cached resolves' response times.
+# Return threshold time as float
+def tm_median_non_cached(training_data):
+    return float(statistics.median(set([float(x["time"]) for x in training_data if str(x["cached"]) == "False"])))
+
+# Using the mean of the non cached resolves' response times.
+# Return threshold time as float
+def tm_mean_non_cached(training_data):
+    return float(statistics.mean(set([float(x["time"]) for x in training_data if str(x["cached"]) == "False"])))
+
+# Approximates the lower quartile (Q1) of the non cached response times as half the median.
+# Return threshold time as float
+def tm_lower_quartile_non_cached(training_data):
+    q2 = tm_median_non_cached(training_data)
+    return float(q2 / 2)
+
+# Approximates the upper quartile (Q3) of the non cached response times as 1.5 times the median.
+# Return threshold time as float
+def tm_upper_quartile_non_cached(training_data):
+    q2 = tm_median_non_cached(training_data)
+    return float(q2 / 2 * 3)
+
+# Randomly selects a value from the training data.
+# Return threshold time as float
+def tm_random_non_cached(training_data):
+    return float(random.choice(training_data)["time"])
+
+#
+# CROSS VALIDATION (K-FOLD)
+#
+
+# Calculate a new threshold from the original threshold and the C value.
+def threshold_c(threshold, C):
+    if C == 0:
+        return threshold
+    else:
+        return threshold + (threshold / C)
+
+# Performs cross validation with all threshold methods
+def cross_validate_all_threshold_methods(training_data, validation_data, k, C):
+    training_data_size = len(training_data)
+    results = list()
+
+    # Fastest non cached
+    threshold = tm_fastest_non_cached_ever(training_data=training_data)
+    threshold = threshold_c(threshold, C)
+    results.append(cross_validate(
+        training_data_size=training_data_size,
+        validation_data=validation_data,
+        threshold=threshold,
+        method_name="fastest non cached",
+        k=k,
+        C=C))
+    # Median non cached
+    threshold = tm_median_non_cached(training_data=training_data)
+    threshold = threshold_c(threshold, C)
+    results.append(cross_validate(
+        training_data_size=training_data_size,
+        validation_data=validation_data,
+        threshold=threshold,
+        method_name="median non cached",
+        k=k,
+        C=C))
+    # Mean non cached
+    threshold = tm_mean_non_cached(training_data=training_data)
+    threshold = threshold_c(threshold, C)
+    results.append(cross_validate(
+        training_data_size=training_data_size,
+        validation_data=validation_data,
+        threshold=threshold,
+        method_name="mean non cached",
+        k=k,
+        C=C))
+    # Lower quartile non cached
+    threshold = tm_lower_quartile_non_cached(training_data=training_data)
+    threshold = threshold_c(threshold, C)
+    results.append(cross_validate(
+        training_data_size=training_data_size,
+        validation_data=validation_data,
+        threshold=threshold,
+        method_name="lower quartile non cached",
+        k=k,
+        C=C))
+    # Upper quartile non cached
+    threshold = tm_upper_quartile_non_cached(training_data=training_data)
+    threshold = threshold_c(threshold, C)
+    results.append(cross_validate(
+        training_data_size=training_data_size,
+        validation_data=validation_data,
+        threshold=threshold,
+        method_name="upper quartile non cached",
+        k=k,
+        C=C))
+    # Random non cached
+    threshold = tm_random_non_cached(training_data=training_data)
+    threshold = threshold_c(threshold, C)
+    results.append(cross_validate(
training_data_size=training_data_size, + validation_data=validation_data, + threshold=threshold, + method_name="random non cached", + k=k, + C=C)) + + # Cleanup results if some are broken. + # Broken result datapoints are represented by a fingerprint set to none. + results = [x for x in results if not x["fingerprint"] == "none"] + return results + +# Format cross validation results +def cross_validate_result_format(fingerprint="none", training_data_size=-1, validation_data_size=-1, k=-1, threshold=-1, P=-1, N=-1, TP=-1, TN=-1, FP=-1, FN=-1, method_name="none", C=-1): + a = {} + # Basic + a["fingerprint"] = str(fingerprint) + a["training_data_size"] = str(training_data_size) + a["validation_data_size"] = str(validation_data_size) + a["k"] = str(k) + a["method"] = str(method_name) + a["C"] = str(C) + a["threshold"] = str(threshold) + a["P"] = str(P) + a["N"] = str(N) + # Statistics + a["TP"] = str(TP) + a["TN"] = str(TN) + a["FP"] = str(FP) + a["FN"] = str(FN) + # Derived statistics + a["TPR"] = str(S_TPR(TP=TP, P=P)) + a["FPR"] = str(S_FPR(FP=FP, N=N)) + a["TNR"] = str(S_TNR(TN=TN, N=N)) + a["PPV"] = str(S_PPV(TP=TP, FP=FP)) + a["NPV"] = str(S_NPV(TN=TN, FN=FN)) + a["FNR"] = str(S_FNR(FN=FN, P=P)) + a["FDR"] = str(S_FDR(FP=FP, TP=TP)) + a["FOR"] = str(S_FOR(FN=FN, TN=TN)) + a["PT"] = str(S_PT(TN=TN, N=N, TP=TP, P=P)) + a["TS"] = str(S_TS(TP=TP, FN=FN, FP=FP)) + a["ACC"] = str(S_ACC(TP=TP, TN=TN, P=P, N=N)) + a["BA"] = str(S_BA(TP=TP, P=P, TN=TN, N=N)) + a["F1"] = str(S_F1(TP=TP, FP=FP, P=P)) + a["MCC"] = str(S_MCC(TP=TP, TN=TN, FP=FP, FN=FN)) + a["FM"] = str(S_FM(TP=TP, FP=FP, FN=FN)) + a["BM"] = str(S_BM(TP=TP, P=P, TN=TN, N=N)) + a["MK"] = str(S_MK(TP=TP, FP=FP, TN=TN, FN=FN)) + return a + +# Perform a cross validation of trainig and validation data +# Return cross_validate_result_format(...) +def cross_validate(training_data_size, validation_data, threshold, method_name, k, C): + # https://en.wikipedia.org/wiki/Sensitivity_and_specificity + P = len([x for x in validation_data if str(x["cached"]) == "True"]) # Positives + N = len([x for x in validation_data if str(x["cached"]) == "False"]) # Negatives + TP = 0 # True positives (prediction = cached, validation = cached) (same) + FP = 0 # False positives (prediction = cached, validation = non cached) (not same) + TN = 0 # True negative (prediction = not cached, validation = non cached) (same) + FN = 0 # False negative (prediction = not cached, validation = cached) (not same) + fingerprint = str(validation_data[0]["fingerprint"]) # Fingerprint of exit node + + # Does this add up? + # P or N is zero, results will be discarded. + if P == 0 or N == 0: + #log.error("\u001b[31mCross validate, something does not add upp here! P=" + str(P) + " and N=" + str(N) + ". Will abandon and discard this data. Data belongs to fingerprint " + str(get_fingerprints(validation_data)) + " with training data size " + str(training_data_size) + " and validation data size " + str(len(validation_data)) + " from k=" + str(k) + ". Threshold was set at " + str(threshold) + " with C=" + str(C) + " and the method used was " + str(method_name) + ". 
Discarding data from results!\u001b[0m") + return cross_validate_result_format() # Return a empty datapoint + + # Start cross validation + for validation_datapoint in validation_data: + # Is datapoint cached according to training data + is_cached_predicted = bool(float(validation_datapoint["time"]) < threshold) + is_cached = validation_datapoint["cached"] == "True" + + # Set TP, FP, TN and FN + if is_cached_predicted == True: + # Is either TP or FP + if is_cached: + TP += 1 + else: + FP += 1 + else: + # Is either TN or FN + if is_cached: + FN += 1 + else: + TN += 1 + + return cross_validate_result_format( + fingerprint=fingerprint, + training_data_size=training_data_size, + validation_data_size=len(validation_data), + k=k, + threshold=threshold, + P=P, + N=N, + TP=TP, + TN=TN, + FP=FP, + FN=FN, + method_name=method_name, + C=C + ) + +# Perform cross validation of results using k-fold +# Provide data for one fingerprint only, fingerprint has to be unique +# Provide N times to split data in k-fold operation +# Provide process number for identification +# Provide a dictoionary to return output to main process +# Return True +def mp_k_fold(data, N, cs, procnum, return_dict): + # Time for stats + t = time.time() + # Print stats + log.info("\u001b[36mK-Fold #" + str(procnum) + " fingerprint=" + str(data[0]["fingerprint"]) + " N=" + str(N) + " c_range=" + str(len(cs)) + " data_size=" + str(len(data)) + "\u001b[0m") + # If data is insufficient, abort + if len(data) < N: + log.error("\u001b[31mData set too small!\u001b[0m") + exit(-1) + + # Output as list of cross_validate_result_format(...) + output = list() + # For N splits in k-fold, perform k-fold where k is validation data and (k)´ is training data + for k in range(N): + # List limits + # validate 0 + v0_lower = int(len(data) / N * k) # t0 upper + v0_upper = int(len(data) / N * (k + 1)) # t1 lower + # training 0 + t0_lower = 0 # Bottom of list + t0_upper = v0_lower + # validate 1 + t1_lower = v0_upper + t1_upper = int(len(data) + 1) # Top of list + + # Prepare data sets for traning + td0 = data[t0_lower:t0_upper] + td1 = data[t1_lower:t1_upper] + training_data = [] + training_data.extend(td0) + training_data.extend(td1) + + # Prepare data sets for validation + vd0 = data[v0_lower:v0_upper] + validation_data = [] + validation_data.extend(vd0) + + # Itterate over all C's in cs (list with C) + for C in cs: + # Perform cross validation with C + # Results is a list of data formatted as cross_validate_result_format(...) + cv_results = cross_validate_all_threshold_methods(training_data=training_data, validation_data=validation_data, k=k, C=C) + output.extend(cv_results) + + return_dict[procnum] = output + log.info("\u001b[32mK-Fold #" + str(procnum) + " done in " + str(float(time.time() - t))[0:5] + " seconds.\u001b[0m") + return True + +# Generate list of C values +def cross_validate_get_cs(c_min, c_max): + log.debug("C range from " + str(c_min) + " to " + str(c_max) + ".") + nums = [x for x in range(c_min, c_max)] + return nums + +# Perform cross validation as k-fold using multiple processes. +# Provide raw results data from aggregator.py. +# Provide N peices to devide data into. +# Provide number of cpus to use, 0 for all. +# Return result in list where each element is formatted according to cross_validate_result_format(...) 
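+# Note: one worker process is prepared per fingerprint, but at most cpu_limit of them run at any one time.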
+def mp_cross_validation_boostrap(raw_result_data, N=10, cpu_limit=0, c_min=0, c_max=1): + # Set allocated cpus + if cpu_limit < 1: + log.debug("No CPU limit was specified, will use all avaliable.") + cpu_limit = int(multiprocessing.cpu_count()) + log.info("Using " + str(cpu_limit) + " CPUs.") + + # Multi processing return dictionary + return_dict = multiprocessing.Manager().dict() + output = list() + processes_waiting_list = list() + + # Devide data by fingerprint + log.info("Dividing data by fingerprints, please wait...") + t = time.time() + data_by_fingerprint = get_data_by_fingerprints(raw_result_data) + log.info("Dividing data by fingerprints done. Took " + str(float(time.time() - t)) + " seconds.") + log.debug("Cross validation boostrapper will start " + str(len(data_by_fingerprint)) + " processes.") + t0 = time.time() + # Prepare cs (List of C) + cs = cross_validate_get_cs(c_min, c_max) + # Prepare processes, one for each fingerprint + for dbf in sorted(data_by_fingerprint, key=lambda x: len(x), reverse=False): + processes_waiting_list.append(multiprocessing.Process(target=mp_k_fold, args=(list(dbf), N, cs, len(processes_waiting_list), return_dict, ))) + log.debug("Process waiting list of size " + str(len(processes_waiting_list)) + ".") + # Start and run k-fold processes + processes_running_list = list() + # If either list is not empty + t = time.time() + while len(processes_waiting_list) > 0 or len(processes_running_list) > 0: + while len(processes_running_list) < cpu_limit and len(processes_waiting_list) > 0: + # Running list is not full and waiting list is not empty. + # Add more process. + p = processes_waiting_list.pop() + processes_running_list.append(p) + processes_running_list[len(processes_running_list) - 1].start() # Start last process in list + #log.debug("Add and start process " + str(p)) + + while len(processes_running_list) > 0: + # Running list is full or waiting list is empty + # Runnign list cannot be empty + # Wait for processes to join. + try: + p = processes_running_list.pop() + #log.debug("Join process " + str(p)) + p.join() + except Exception as ex: + log.error("\u001b[31mCould not join process, error " + str(ex) + "\u001b[0m") + + # Stats printout + if time.time() - t > 10: + t = time.time() + log.info("Cross validation running, " + str(len(processes_running_list) + len(processes_waiting_list)) + " processes left.") + # Handle output + output = [] + [output.extend(value) for value in return_dict.values()] + log.debug("Multiprocessing cross validation bootstrap, returning " + str(len(output)) + " rows of data. Took " + str(float(time.time() - t0)) + " seconds.") + return output + +# +# EVALUATE RESULTS +# + +# Perform using multiple processes. 
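+# Uses the same waiting-list/running-list worker pool pattern as the cross validation bootstrap above.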
+# Results per fingerprint +# Return list of results +def mp_eval_results_fingerprint_boostrap(results, cpu_limit=0): + # Set allocated cpus + if cpu_limit < 1: + log.debug("No CPU limit was specified, will use all avaliable.") + cpu_limit = int(multiprocessing.cpu_count()) + log.info("Using " + str(cpu_limit) + " CPUs.") + + # Multi processing return dictionary + return_dict = multiprocessing.Manager().dict() + output = list() + processes_waiting_list = list() + + t0 = time.time() + # Prepare processes, one for each fingerprint + for dbf in get_data_by_fingerprints(results): + processes_waiting_list.append(multiprocessing.Process(target=mp_eval_results_fingerprint, args=(list(dbf), len(processes_waiting_list), return_dict, ))) + log.debug("Process waiting list of size " + str(len(processes_waiting_list)) + ".") + + processes_running_list = list() + # If either list is not empty + t = time.time() + while len(processes_waiting_list) > 0 or len(processes_running_list) > 0: + while len(processes_running_list) < cpu_limit and len(processes_waiting_list) > 0: + # Running list is not full and waiting list is not empty. + # Add more process. + p = processes_waiting_list.pop() + processes_running_list.append(p) + processes_running_list[len(processes_running_list) - 1].start() # Start last process in list + #log.debug("Add and start process " + str(p)) + + while len(processes_running_list) > 0: + # Running list is full or waiting list is empty + # Runnign list cannot be empty + # Wait for processes to join. + try: + p = processes_running_list.pop() + #log.debug("Join process " + str(p)) + p.join() + except Exception as ex: + log.error("\u001b[31mCould not join process, error " + str(ex) + "\u001b[0m") + + # Stats printout + if time.time() - t > 10: + t = time.time() + log.info("Evaluation per C, Method and Fingerprint running, " + str(len(processes_running_list) + len(processes_waiting_list)) + " processes left.") + # Handle output + output = [] + [output.extend(value) for value in return_dict.values()] + log.debug("Evaluation per C, Method and Fingerprint bootstrap, returning " + str(len(output)) + " rows of data. 
Took " + str(float(time.time() - t0)) + " seconds.") + return output + +# Evaluates results per fingerpint, C and method +# Effectively removes K from data +# Return list +def mp_eval_results_fingerprint(results_by_fingerprint, procnum, return_dict): + t = time.time() + fingerprint = get_fingerprints(results_by_fingerprint).pop() + log.info("\u001b[36mEvaluation #" + str(procnum) + ".\u001b[0m") + output = list() + for results_by_method in get_data_by_method(results_by_fingerprint): + method = get_methods(results_by_method).pop() + for C in set([x["C"] for x in results_by_method]): + data = [x for x in results if str(x["fingerprint"]) == str(fingerprint) and str(x["method"]) == str(method) and str(x["C"]) == str(C)] + a = {} + a["fingerprint"] = str(fingerprint) + a["method"] = str(method) + a["C"] = str(C) + a["TPR"] = str(statistics.mean([float(x["TPR"]) for x in data])) + a["FPR"] = str(statistics.mean([float(x["FPR"]) for x in data])) + a["TNR"] = str(statistics.mean([float(x["TNR"]) for x in data])) + a["PPV"] = str(statistics.mean([float(x["PPV"]) for x in data])) + a["NPV"] = str(statistics.mean([float(x["NPV"]) for x in data])) + a["FNR"] = str(statistics.mean([float(x["FNR"]) for x in data])) + a["FDR"] = str(statistics.mean([float(x["FDR"]) for x in data])) + a["FOR"] = str(statistics.mean([float(x["FOR"]) for x in data])) + a["PT"] = str(statistics.mean([float(x["PT"]) for x in data])) + a["TS"] = str(statistics.mean([float(x["TS"]) for x in data])) + a["ACC"] = str(statistics.mean([float(x["ACC"]) for x in data])) + a["BA"] = str(statistics.mean([float(x["BA"]) for x in data])) + a["F1"] = str(statistics.mean([float(x["F1"]) for x in data])) + a["MCC"] = str(statistics.mean([float(x["MCC"]) for x in data])) + a["FM"] = str(statistics.mean([float(x["FM"]) for x in data])) + a["BM"] = str(statistics.mean([float(x["BM"]) for x in data])) + a["MK"] = str(statistics.mean([float(x["MK"]) for x in data])) + output.append(a) + return_dict[procnum] = output + log.info("\u001b[32mEvaluation #" + str(procnum) + " done in " + str(float(time.time() - t))[0:5] + " seconds.\u001b[0m") + return True + +# Perform using multiple processes. +# Results for whole oracle +# Return list of results +def mp_eval_results_overall_bootstrap(results, cpu_limit=0): + # Set allocated cpus + if cpu_limit < 1: + log.debug("No CPU limit was specified, will use all avaliable.") + cpu_limit = int(multiprocessing.cpu_count()) + log.info("Using " + str(cpu_limit) + " CPUs.") + + # Multi processing return dictionary + return_dict = multiprocessing.Manager().dict() + output = list() + processes_waiting_list = list() + + t0 = time.time() + # Prepare processes, one for each fingerprint + for dbm in get_data_by_method(results): + processes_waiting_list.append(multiprocessing.Process(target=mp_eval_results_overall, args=(list(dbm), len(processes_waiting_list), return_dict, ))) + log.debug("Process waiting list of size " + str(len(processes_waiting_list)) + ".") + + processes_running_list = list() + # If either list is not empty + t = time.time() + while len(processes_waiting_list) > 0 or len(processes_running_list) > 0: + while len(processes_running_list) < cpu_limit and len(processes_waiting_list) > 0: + # Running list is not full and waiting list is not empty. + # Add more process. 
+            p = processes_waiting_list.pop()
+            processes_running_list.append(p)
+            processes_running_list[len(processes_running_list) - 1].start() # Start last process in list
+            #log.debug("Add and start process " + str(p))
+
+        while len(processes_running_list) > 0:
+            # Running list is full or waiting list is empty
+            # Running list cannot be empty
+            # Wait for processes to join.
+            try:
+                p = processes_running_list.pop()
+                #log.debug("Join process " + str(p))
+                p.join()
+            except Exception as ex:
+                log.error("\u001b[31mCould not join process, error " + str(ex) + "\u001b[0m")
+
+        # Stats printout
+        if time.time() - t > 10:
+            t = time.time()
+            log.info("Evaluation per C, Method running, " + str(len(processes_running_list) + len(processes_waiting_list)) + " processes left.")
+    # Handle output
+    output = []
+    [output.extend(value) for value in return_dict.values()]
+    log.debug("Evaluation per C, Method bootstrap, returning " + str(len(output)) + " rows of data. Took " + str(float(time.time() - t0)) + " seconds.")
+    return output
+
+# Evaluates results per C and method
+# Effectively averages out k and the fingerprint from the data
+# Return list
+def mp_eval_results_overall(results_by_method, procnum, return_dict):
+    method = get_methods(results_by_method).pop()
+    t = time.time()
+    log.info("\u001b[36mEvaluation overall (per C and method) #" + str(procnum) + " with " + str(len(results_by_method)) + " rows of data.\u001b[0m")
+    output = list()
+    for C in set([x["C"] for x in results_by_method]):
+        # results_by_method already holds only this method, so filtering by C is enough
+        data = [x for x in results_by_method if str(x["C"]) == str(C)]
+        #log.debug("Evaluate results for method " + str(method) + " with C = " + str(C) + " found " + str(len(data)) + " datapoints.")
+        a = {}
+        a["method"] = str(method)
+        a["C"] = str(C)
+        a["TPR"] = str(statistics.mean([float(x["TPR"]) for x in data]))
+        a["FPR"] = str(statistics.mean([float(x["FPR"]) for x in data]))
+        a["TNR"] = str(statistics.mean([float(x["TNR"]) for x in data]))
+        a["PPV"] = str(statistics.mean([float(x["PPV"]) for x in data]))
+        a["NPV"] = str(statistics.mean([float(x["NPV"]) for x in data]))
+        a["FNR"] = str(statistics.mean([float(x["FNR"]) for x in data]))
+        a["FDR"] = str(statistics.mean([float(x["FDR"]) for x in data]))
+        a["FOR"] = str(statistics.mean([float(x["FOR"]) for x in data]))
+        a["PT"] = str(statistics.mean([float(x["PT"]) for x in data]))
+        a["TS"] = str(statistics.mean([float(x["TS"]) for x in data]))
+        a["ACC"] = str(statistics.mean([float(x["ACC"]) for x in data]))
+        a["BA"] = str(statistics.mean([float(x["BA"]) for x in data]))
+        a["F1"] = str(statistics.mean([float(x["F1"]) for x in data]))
+        a["MCC"] = str(statistics.mean([float(x["MCC"]) for x in data]))
+        a["FM"] = str(statistics.mean([float(x["FM"]) for x in data]))
+        a["BM"] = str(statistics.mean([float(x["BM"]) for x in data]))
+        a["MK"] = str(statistics.mean([float(x["MK"]) for x in data]))
+        output.append(a)
+    return_dict[procnum] = output
+    log.info("\u001b[32mEvaluation overall (per C and method) #" + str(procnum) + " done in " + str(float(time.time() - t))[0:5] + " seconds.\u001b[0m")
+    return True
+
+#
+# BOOTSTRAP ANALYZER
+#
+
+if __name__ == "__main__":
+    try:
+        if os.path.isfile(str(sys.argv[1])):
+            t = time.time()
+            log.info("Reading data from " + str(sys.argv[1]))
+            data = read_csv(str(sys.argv[1]))
+            log.info("Read data successfully, took " + str(float(time.time() - t))[0:5] + " seconds.")
+        else:
+            log.error("Provide a CSV file as first command line argument.")
+            exit(-1)
+    except Exception as ex:
+        log.error("\u001b[31mReading CSV file as input failed with error " + str(ex) + "\u001b[0m")
+        exit(-1)
+    else:
+        # GET C MIN AND MAX VALUES
+        try:
+            c_min = int(sys.argv[2])
+            c_max = int(sys.argv[3])
+            if c_max < c_min:
+                log.error("C max less than C min!")
+                exit(-1)
+        except Exception as ex:
+            log.error("C value from command line failed with error " + str(ex))
+            log.info("Provide a min and a max C value as the second and third command line arguments.")
+            exit(-1)
+        else:
+            log.info("C range set from " + str(c_min) + " to " + str(c_max) + ", will run " + str(len(range(c_min, c_max))) + " times with different C values.")
+
+        # PERFORM CROSS VALIDATION
+        t = time.time()
+        log.debug("Got " + str(len(data)) + " rows of data containing " + str(len(get_fingerprints(data))) + " fingerprints.")
+        results = mp_cross_validation_boostrap(data, N=10, cpu_limit=0, c_min=c_min, c_max=c_max)
+        log.info("Cross validation resulted in " + str(len(results)) + " rows of data.")
+
+        # WRITE RESULTS TO DISK
+        # try:
+        #     log.info("Sorting and writing data to disk, please wait...")
+        #     results = sorted(results, key=lambda x: x["method"], reverse=False)
+        #     write_data(results, "theoracle_cross_validation_all_" + str(time.time()))
+        # except Exception as ex:
+        #     log.error("\u001b[31mSorting and writing failed with error " + str(ex) + "\u001b[0m")
+        # else:
+        #     log.info("Sorted data and wrote to disk successfully.")
+
+        # EVALUATE RESULTS PER FINGERPRINT
+        try:
+            log.info("Evaluating results per fingerprint, calculating means.")
+            # EVALUATING RESULTS PER FINGERPRINT
+            results = mp_eval_results_fingerprint_boostrap(results, cpu_limit=0)
+            log.debug("Resulting in " + str(len(results)) + " rows of data.")
+        except Exception as ex:
+            log.error("\u001b[31mEvaluation failed with error " + str(ex) + "\u001b[0m")
+        else:
+            log.info("Evaluation done.")
+
+        # WRITE RESULTS TO DISK
+        # try:
+        #     log.info("Sorting and writing data to disk, please wait...")
+        #     results = sorted(results, key=lambda x: x["method"], reverse=False)
+        #     write_data(results, "theoracle_cross_validation_per_fingerprint_" + str(time.time()))
+        # except Exception as ex:
+        #     log.error("\u001b[31mSorting and writing failed with error " + str(ex) + "\u001b[0m")
+        # else:
+        #     log.info("Sorted data and wrote to disk successfully.")
+
+        # EVALUATE RESULTS PER METHOD AND C
+        try:
+            # EVALUATING RESULTS OVERALL (PER METHOD)
+            log.info("Evaluating results overall (per method), calculating means.")
+            results = mp_eval_results_overall_bootstrap(results, cpu_limit=0)
+            log.debug("Resulting in " + str(len(results)) + " rows of data.")
+        except Exception as ex:
+            log.error("\u001b[31mEvaluation failed with error " + str(ex) + "\u001b[0m")
+        else:
+            log.info("Evaluation done.")
+
+        # WRITE RESULTS TO DISK
+        try:
+            log.info("Sorting and writing data to disk, please wait...")
+            results = sorted(results, key=lambda x: x["method"], reverse=False)
+            write_data(results, "theoracle_cross_validation_final_" + str(time.time()))
+        except Exception as ex:
+            log.error("\u001b[31mSorting and writing failed with error " + str(ex) + "\u001b[0m")
+        else:
+            log.info("Sorted data and wrote to disk successfully.")
+    finally:
+        log.info("Cross validation done, exiting.")
+        exit(0)
\ No newline at end of file
diff --git a/src/timeddns.py b/src/timeddns.py
new file mode 100644
index 0000000..958df43
--- /dev/null
+++ b/src/timeddns.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+# This is an exitmap module
+
+import os
+import sys
+import logging
+import random
+import string
+import torsocks
+import time
+import configparser
+import csv
+import datetime
+
+#
+# LOGGER
+#
+
+log = 
logging.getLogger(__name__) + +# +# CONFIG LOADER +# + +config_path = "theoracle.conf" + +def get_config(section, variable): + try: + config = configparser.ConfigParser() + config.read(config_path) + except Exception as ex: + print("Failed to read config, exception: " + str(ex)) + exit(-1) + else: + #print("Read config successfully.") + return config.get(section, variable) + +def get_white_flag(): + return get_config("DOMAIN", "WhiteFlag") + +def get_subdomain_base(): + return get_config("DOMAIN", "SubDomainBase") + +def get_subdomain_length(): + return get_config("DOMAIN", "SubDomainLength") + +def get_resolves(): + return get_config("DOMAIN", "Resolves") + +def get_output_directory(): + return get_config("OUTPUT", "ResultDirectory") + +def get_tries(): + return get_config("DOMAIN", "Tries") + +# +# CSV AND SAVING RESULTS +# + +result_directory = str(get_output_directory() + "/" + datetime.datetime.now().isoformat()) + +def setup_result_directory(): + try: + if (os.path.isdir(str(get_output_directory())) == False): + os.mkdir(str(get_output_directory())) + if (os.path.isdir(result_directory) == False): + os.mkdir(result_directory) + except Exception as ex: + log.error("Result directory exception " + str(ex)) + +def save_results_as_csv(filename, data): + with open(str(result_directory + "/" + filename + ".csv"), mode='w') as F: + csv_writer = csv.DictWriter(F, fieldnames=['directory', 'fingerprint', 'domain', 'time', 'timestamp', 'status', 'tries']) + csv_writer.writeheader() + csv_writer = csv.writer(F) + for row in data: + csv_writer.writerow([ + row['directory'], + row['fingerprint'], + row['domain'], + row['time'], + row['timestamp'], + row['status'], + row['tries'] + ]) + +# +# GENERATE DOMAIN +# + +def get_random_subdomain(): + return ''.join(random.choice(string.ascii_lowercase) for i in range(int(get_subdomain_length()))) + "." + str(get_subdomain_base()) + +def get_set_of_random_domains(): + return [get_random_subdomain() for x in [" "]*int(get_resolves())] + +# +# TOR RESOLVE +# + +def resolve_domain(exit_desc, url, tries=0): + sock = torsocks.torsocket() + sock.settimeout(10) # 10 second timeout before socket closes the TCP connection + + try: + t = time.time() + ipv4 = sock.resolve(url) # Try to resolve the domain + except Exception as err: + log.debug("Error: " + str(err) + " " + str(url) + " on exit " + str(exit_desc.fingerprint)) + if (tries < int(get_tries())): + #time.sleep(0.01 * tries) + return resolve_domain(exit_desc, url, tries=tries + 1) + else: + log.warn(str(tries) + " tries, abandoning " + str(url) + " on exit " + str(exit_desc.fingerprint)) + return url, -1, tries + else: + t = time.time() - t + log.debug(str(url) + " resolved to " + str(ipv4)) + return url, t, tries + +def resolve_list(exit_desc, domains, status): + data = [] + for url, t, tries in [resolve_domain(exit_desc, url) for url in domains]: + a = {} + a['directory'] = str(result_directory) + a['fingerprint'] = str(exit_desc.fingerprint) + a['domain'] = str(url) + a['time'] = str(t) + a['timestamp'] = str(time.time()) + a['status'] = str(status) + a['tries'] = str(tries) + data.append(a) + return data + +def test_domains_and_save(exit_desc): + # Print how many domains will be resolved. + log.debug("Resolving " + str(int(get_resolves()) * 3) + " (2*" + str(get_resolves()) + "+" + str(get_resolves()) + ") domains on exit relay " + str(exit_desc.fingerprint) + ".") + + # Notify the exit relay that research is performed. 
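+    # The white flag domain (WhiteFlag in theoracle.conf) carries the researcher's contact details so relay operators can identify the measurement traffic.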
+    data = resolve_list(exit_desc, [get_white_flag()], "white flag")
+
+    # Resolve a set of random domains once
+    data.extend(resolve_list(exit_desc, get_set_of_random_domains(), "not cached"))
+
+    # This set will be resolved twice and measured once
+    cached_domains_twice = get_set_of_random_domains()
+    # First resolve will cache the domains in the exit relay
+    data.extend(resolve_list(exit_desc, cached_domains_twice, "now caching"))
+    # Second resolve will only be used to measure the cache speed
+    data.extend(resolve_list(exit_desc, cached_domains_twice, "pre cached"))
+
+    # Store data in a CSV file named after the current date and time
+    save_results_as_csv(str(exit_desc.fingerprint), data)
+
+    log.debug("Done on exit relay " + str(exit_desc.fingerprint) + ", collected " + str(len(data)) + " lines of data.")
+
+#
+# MODULE BOOTSTRAP
+#
+
+def probe(exit_desc, run_python_over_tor, run_cmd_over_tor, **kwargs):
+    setup_result_directory()
+    run_python_over_tor(test_domains_and_save, exit_desc)
+
+if __name__ == "__main__":
+    # When invoked over the command line
+    log.critical("Module can only be run over Tor, not stand-alone.")
+    exit(1)
\ No newline at end of file
diff --git a/src/visualize.py b/src/visualize.py
new file mode 100644
index 0000000..342e0b8
--- /dev/null
+++ b/src/visualize.py
@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+# This will visualize the cross validation results generated by cross_validation.py
+# Provide a .csv file with cross validation results
+
+import os
+import sys
+import time
+import logging
+import csv
+import matplotlib.pyplot as plt
+
+#
+# LOGGER
+#
+
+log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s" # From exitmap by Philipp Winter
+logging.basicConfig(format=log_format, level=logging.DEBUG)
+log = logging.getLogger("visualizer")
+
+#
+# LOAD FROM CSV
+#
+
+def read_csv(path):
+    data = []
+    with open(path, mode='r') as csv_file:
+        for line in csv.DictReader(csv_file):
+            data.append(line)
+    return data
+
+#
+# RESULT LIST OPERATIONS
+#
+
+def get_methods(data):
+    return set([x["method"] for x in data])
+
+def get_data_by_method(data, methods=list()):
+    t = time.time()
+    if len(methods) == 0:
+        methods = get_methods(data)
+    output = []
+    # N^2 TODO improve
+    for method in methods:
+        output.append([x for x in data if x["method"] == method])
+        if time.time() - t > 10:
+            t = time.time()
+            log.debug("Divide data by method. " + str(len(methods) - len(output)) + " left.")
+    return output
+
+def get_cs(data):
+    return set([x["C"] for x in data])
+
+def get_data_by_c(data, cs=list()):
+    t = time.time()
+    if len(cs) == 0:
+        cs = get_cs(data)
+    output = []
+    # N^2 TODO improve
+    for c in cs:
+        output.append([x for x in data if x["C"] == c])
+        if time.time() - t > 10:
+            t = time.time()
+            log.debug("Divide data by C. " + str(len(cs) - len(output)) + " left.")
" + str(len(cs) - len(output)) + " left.") + return output + +# +# VISUALIZER +# + +def visualizer(data): + log.debug("Visualizer got " + str(len(data)) + " rows of data.") + + plt.xlabel('False Positive Rate (FPR)') + plt.ylabel('True Positive Rate (TPR)') + plt.title("The Oracle - plotting " + str(len(get_methods(data))) + " methods, len(C)=" + str(len(get_cs(get_data_by_method(data).pop()))) + ".") + + lines = ["o", "v", "^", "<", ">", "s", "P", "*", "X", "D", "d"] + + i = 0 + for data_by_method in get_data_by_method(data): + method = get_methods(data_by_method).pop() + log.debug("Method " + str(method) + " with " + str(len(data_by_method)) + " rows of data.") + x = [float(z["FPR"]) for z in data_by_method] # X axis is FPR + y = [float(z["TPR"]) for z in data_by_method] # Y axis is TPR + plt.scatter(x=x, y=y, label=str(method), marker=str(lines[i])) + x = sorted(x) + y = sorted(y) + plt.plot(x, y) + i += 1 + + plt.scatter(x=[0], y=[1], color="0", label="Perfect Classifier") # Optimal point + plt.plot([0,1],[0,1], color="0", label="Random Classifier") # Random rate + + plt.autoscale() + plt.legend() + plt.show() + +# +# BOOSTRAP ANALYZER +# + +if __name__ == "__main__": + try: + if os.path.isfile(str(sys.argv[1])): + t = time.time() + log.info("Reading data from " + str(sys.argv[1])) + data = read_csv(str(sys.argv[1])) + log.info("Read data sucessfully, took " + str(float(time.time() - t))[0:5] + " seconds.") + else: + log.error("Provide a CSV file as first command line arguemnt.") + exit(-1) + except Exception as ex: + log.error("\u001b[31mReading CSV file as input failed with error " + str(ex) + "\u001b[0m") + exit(-1) + else: + log.info("Visualizing " + str(len(data)) + " rows of data.") + try: + visualizer(data) + except Exception as ex: + log.error("Visualizer error " + str(ex)) + finally: + log.info("Visualization done, exiting.") + exit(0) \ No newline at end of file diff --git a/theoracle.conf b/theoracle.conf new file mode 100644 index 0000000..580ea5a --- /dev/null +++ b/theoracle.conf @@ -0,0 +1,14 @@ +[DOMAIN] +# Message to exit relay operators, wave a white flag to show that you're not an intruder, this should be contact information to identify you as a researcher. Has to be a valid domain. +WhiteFlag = oscardotanderssondot200atgmaildotcom.pulls.name +# A domain you own that accepts all subdomains +SubDomainBase = pulls.name +# Length of the subdomain, should be a high number, preferably 32 to 48. Subdomain cannot be longer than 63. +SubDomainLength = 48 +# Number of domains to resolve at each exit relay, this is multiplied by 3 +Resolves = 128 +# Amount of times to resolve a domain after a failed atempt. Higher numbers will put more stress on the network but will results in more resolves. Recommended 25-75 times. +Tries = 16 + +[OUTPUT] +ResultDirectory = results