diff --git a/python/magika/__init__.py b/python/magika/__init__.py index 047a2b71..45299ba0 100644 --- a/python/magika/__init__.py +++ b/python/magika/__init__.py @@ -15,7 +15,8 @@ import dotenv -from magika import magika, prediction_mode +from magika import magika +from magika.types import prediction_mode Magika = magika.Magika MagikaError = magika.MagikaError diff --git a/python/magika/cli/magika.py b/python/magika/cli/magika_python_module_tester.py similarity index 57% rename from python/magika/cli/magika.py rename to python/magika/cli/magika_python_module_tester.py index a7112534..5b85c87a 100755 --- a/python/magika/cli/magika.py +++ b/python/magika/cli/magika_python_module_tester.py @@ -14,25 +14,23 @@ # limitations under the License. -import copy import dataclasses -import hashlib +import importlib.metadata import json import logging import os import sys from pathlib import Path -from typing import List, Optional +from typing import List, Optional, Tuple import click -from tabulate import tabulate from magika import Magika, MagikaError, PredictionMode, colors -from magika.content_types import ContentTypesManager from magika.logger import get_logger -from magika.types import FeedbackReport, MagikaResult +from magika.types import ContentTypeLabel, MagikaResult, Status, StatusOr -VERSION = "0.5.2-dev" +# TODO: the version should be migrated to the magika module, or somewhere else in python/ +VERSION = importlib.metadata.version("magika") CONTACT_EMAIL = "magika-dev@google.com" @@ -115,12 +113,6 @@ ) @click.option("-v", "--verbose", is_flag=True, help="Enable more verbose output.") @click.option("-vv", "--debug", is_flag=True, help="Enable debug logging.") -@click.option( - "--generate-report", - "generate_report_flag", - is_flag=True, - help="Generate report useful when reporting feedback.", -) @click.option( "--dump-performance-stats", "dump_performance_stats_flag", @@ -130,12 +122,6 @@ @click.option( "--version", "output_version", is_flag=True, help="Print the version and exit." ) -@click.option( - "--list-output-content-types", - "list_output_content_types", - is_flag=True, - help="Show a list of supported content types.", -) @click.option( "--model-dir", type=click.Path( @@ -158,10 +144,8 @@ def main( with_colors: bool, verbose: bool, debug: bool, - generate_report_flag: bool, dump_performance_stats_flag: bool, output_version: bool, - list_output_content_types: bool, model_dir: Optional[Path], ) -> None: """ @@ -178,6 +162,9 @@ def main( with_colors = False _l = get_logger(use_colors=with_colors) + _l.warning( + "This CLI is deprecated and only used for testing the python module! Use the Rust CLI instead." + ) if verbose: _l.setLevel(logging.INFO) @@ -189,14 +176,6 @@ def main( _l.raw_print_to_stdout(f"Default model: {Magika.get_default_model_name()}") sys.exit(0) - # check CLI arguments and options - if list_output_content_types: - if len(files_paths) > 0: - _l.error("You cannot pass any path when using the -l / --list option.") - sys.exit(1) - print_output_content_types_list() - sys.exit(0) - if len(files_paths) == 0: _l.error("You need to pass at least one path, or - to read from stdin.") sys.exit(1) @@ -244,7 +223,7 @@ def main( _l.error(f'File or directory "{str(p)}" does not exist.') sys.exit(1) # the resulting list may still include some directories; thus, we filter them out. - files_paths = list(filter(lambda x: not x.is_dir(), expanded_paths)) + files_paths: List[Path] = list(filter(lambda x: not x.is_dir(), expanded_paths)) # type: ignore[no-redef] _l.info(f"Considering {len(files_paths)} files") _l.debug(f"Files: {files_paths}") @@ -284,84 +263,85 @@ def main( } # updated only when we need to output in JSON format - all_predictions: List[MagikaResult] = [] - - # used only when the user decides to generate a feedback report - report_entries: List[FeedbackReport] = [] + all_predictions: List[Tuple[Path, StatusOr[MagikaResult]]] = [] batches_num = len(files_paths) // batch_size if len(files_paths) % batch_size != 0: batches_num += 1 for batch_idx in range(batches_num): - files_ = files_paths[batch_idx * batch_size : (batch_idx + 1) * batch_size] + batch_files_paths = files_paths[ + batch_idx * batch_size : (batch_idx + 1) * batch_size + ] if should_read_from_stdin(files_paths): batch_predictions = [get_magika_result_from_stdin(magika)] else: - batch_predictions = magika.identify_paths(files_) + batch_predictions = magika.identify_paths(batch_files_paths) if json_output: # we do not stream the output for JSON output - all_predictions.extend(batch_predictions) + all_predictions.extend(zip(batch_files_paths, batch_predictions)) elif jsonl_output: - for magika_result in batch_predictions: - _l.raw_print_to_stdout(json.dumps(dataclasses.asdict(magika_result))) + for file_path, result in zip(batch_files_paths, batch_predictions): + _l.raw_print_to_stdout( + json.dumps(path_and_result_to_dict(file_path, result)) + ) else: - for magika_result in batch_predictions: - path = magika_result.path - output_ct_label = magika_result.output.ct_label - output_ct_description = magika_result.output.description - output_ct_group = magika_result.output.group - - if mime_output: - # If the user requested the MIME type, we use the mime type - # regardless of the compatibility mode. - output = magika_result.output.mime_type - elif label_output: - output = magika_result.output.ct_label - elif magic_compatibility_mode: - output = magika_result.output.magic - else: # human-readable description - dl_ct_label = magika_result.dl.ct_label - - output = f"{output_ct_description} ({output_ct_group})" - - if dl_ct_label is not None and dl_ct_label != output_ct_label: - # it seems that we had a too-low confidence prediction - # from the model. Let's warn the user about our best - # bet. - assert magika_result.dl.score is not None - dl_description = magika_result.dl.description - dl_group = magika_result.dl.group - dl_score = int(magika_result.dl.score * 100) - output += f" [Low-confidence model best-guess: {dl_description} ({dl_group}), score={dl_score}]" - - if with_colors: - start_color = color_by_group.get(output_ct_group, colors.WHITE) - end_color = colors.RESET - - if output_score: - score = int(magika_result.output.score * 100) + for file_path, result in zip(batch_files_paths, batch_predictions): + if result.ok: + if mime_output: + # If the user requested the MIME type, we use the mime type + # regardless of the compatibility mode. + output = result.value.output.mime_type + elif label_output: + output = str(result.value.output.label) + else: # human-readable description + output = f"{result.value.output.description} ({result.value.output.group})" + + if ( + result.value.dl.label != ContentTypeLabel.UNDEFINED + and result.value.dl.label != result.value.output.label + ): + # it seems that we had a too-low confidence prediction + # from the model. Let's warn the user about our best + # bet. + output += ( + " [Low-confidence model best-guess: " + f"{result.value.dl.description} ({result.value.dl.group}), " + f"score={result.value.score}]" + ) + + if with_colors: + start_color = color_by_group.get( + result.value.output.group, colors.WHITE + ) + end_color = colors.RESET + else: + output = result.status + start_color = "" + end_color = "" + + if output_score and result.ok: + score = int(result.value.score * 100) _l.raw_print_to_stdout( - f"{start_color}{path}: {output} {score}%{end_color}" + f"{start_color}{file_path}: {output} {score}%{end_color}" ) else: - _l.raw_print_to_stdout(f"{start_color}{path}: {output}{end_color}") - - if generate_report_flag: - for file_path, magika_result in zip(files_, batch_predictions): - report_entries.append( - generate_feedback_report(magika, file_path, magika_result) - ) + _l.raw_print_to_stdout( + f"{start_color}{file_path}: {output}{end_color}" + ) if json_output: _l.raw_print_to_stdout( - json.dumps([dataclasses.asdict(res) for res in all_predictions], indent=4) + json.dumps( + [ + path_and_result_to_dict(file_path, result) + for file_path, result in all_predictions + ], + indent=4, + ) ) - if generate_report_flag: - print_feedback_report(magika=magika, reports=report_entries) - if dump_performance_stats_flag: magika.dump_performance_stats() @@ -370,92 +350,21 @@ def should_read_from_stdin(files_paths: List[Path]) -> bool: return len(files_paths) == 1 and str(files_paths[0]) == "-" -def get_magika_result_from_stdin(magika: Magika) -> MagikaResult: +def get_magika_result_from_stdin(magika: Magika) -> StatusOr[MagikaResult]: content = sys.stdin.buffer.read() result = magika.identify_bytes(content) return result -def generate_feedback_report( - magika: Magika, file_path: Path, magika_result: MagikaResult -) -> FeedbackReport: - magika_result_copy = copy.copy(magika_result) - magika_result_copy.path = "" # avoid PII - features = Magika._extract_features_from_path( - file_path, - beg_size=magika._input_sizes["beg"], - mid_size=magika._input_sizes["mid"], - end_size=magika._input_sizes["end"], - padding_token=magika._padding_token, - block_size=magika._block_size, - ) - return FeedbackReport( - hash=hashlib.sha256(file_path.read_bytes()).hexdigest(), - features=features, - result=magika_result_copy, - ) - - -def print_feedback_report(magika: Magika, reports: List[FeedbackReport]) -> None: - _l = get_logger() - - processed_reports = [ - { - "hash": report.hash, - "features": json.dumps(dataclasses.asdict(report.features)).replace( - " ", "" - ), - "result": dataclasses.asdict(report.result), +def path_and_result_to_dict(file_path: Path, result: StatusOr[MagikaResult]) -> dict: + if result.ok: + out = { + "path": str(file_path), + "result": {"status": Status.OK, "value": dataclasses.asdict(result.value)}, } - for report in reports - ] - - full_report = { - "version": VERSION, - "model_dir_name": magika.get_model_name(), - "python_version": sys.version, - "reports": processed_reports, - } - report_header = "REPORT" - report_header_full_len = 40 - _l.raw_print("#" * report_header_full_len) - _l.raw_print( - "###" - + (" " * ((report_header_full_len - 6 - len(report_header)) // 2)) - + report_header - + (" " * ((report_header_full_len - 6 - len(report_header)) // 2)) - + "###", - ) - _l.raw_print("#" * report_header_full_len) - _l.raw_print(json.dumps(full_report)) - _l.raw_print("#" * report_header_full_len) - _l.raw_print( - f"Please copy/paste the above as a description of your issue. Open a GitHub issue or reach out at {CONTACT_EMAIL}.", - ) - _l.raw_print( - "Please include as many details as possible, e.g., what was the expected content type.", - ) - _l.raw_print( - "IMPORTANT: do NOT submit private information or PII! The extracted features include many bytes of the tested files!", - ) - - -def print_output_content_types_list() -> None: - _l = get_logger() - - ctm = ContentTypesManager() - content_types = ctm.get_output_content_types() - - headers = ["#", "Content Type Label", "Description"] - rows = [] - for ct_idx, ct in enumerate(content_types): - row = [ - ct_idx + 1, - ct.name, - "" if ct.description is None else ct.description, - ] - rows.append(row) - _l.raw_print_to_stdout(tabulate(rows, headers=headers)) + else: + out = {"path": str(file_path), "result": {"status": result.status}} + return out if __name__ == "__main__": diff --git a/python/magika/config/content_types_config.json b/python/magika/config/content_types_config.json deleted file mode 100644 index 429905d9..00000000 --- a/python/magika/config/content_types_config.json +++ /dev/null @@ -1,4760 +0,0 @@ -{ - "3gp": { - "name": "3gp", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ace": { - "name": "ace", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "aff": { - "name": "aff", - "extensions": [ - "aff" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ai": { - "name": "ai", - "extensions": [ - "ai" - ], - "mime_type": "application/pdf", - "group": "document", - "magic": "PDF document", - "description": "Adobe Illustrator Artwork", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "ai", - "target_label": "ai", - "correct_labels": [ - "ai", - "pdf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "algol68": { - "name": "algol68", - "extensions": [ - "a68" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "apk": { - "name": "apk", - "extensions": [ - "apk" - ], - "mime_type": "application/vnd.android.package-archive", - "group": "executable", - "magic": "Java archive data", - "description": "Android package", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "apk", - "correct_labels": [ - "apk" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "appleplist": { - "name": "appleplist", - "extensions": [ - "bplist", - "plist" - ], - "mime_type": "application/x-plist", - "group": "application", - "magic": "Apple binary property list", - "description": "Apple property list", - "vt_type": "appleplist", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "text" - ], - "model_target_label": "appleplist", - "target_label": "appleplist", - "correct_labels": [ - "appleplist" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "arj": { - "name": "arj", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "asm": { - "name": "asm", - "extensions": [ - "S", - "asm" - ], - "mime_type": "text/x-asm", - "group": "code", - "magic": "assembler source", - "description": "Assembly", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "asm", - "target_label": "asm", - "correct_labels": [ - "asm" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "asp": { - "name": "asp", - "extensions": [ - "aspx", - "asp" - ], - "mime_type": "text/html", - "group": "code", - "magic": "HTML document", - "description": "ASP source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "asp", - "target_label": "asp", - "correct_labels": [ - "asp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "avi": { - "name": "avi", - "extensions": [ - "avi" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ax": { - "name": "ax", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "batch": { - "name": "batch", - "extensions": [ - "bat" - ], - "mime_type": "text/x-msdos-batch", - "group": "code", - "magic": "DOS batch file", - "description": "DOS batch file", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "batch", - "target_label": "batch", - "correct_labels": [ - "batch" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "bcad": { - "name": "bcad", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "bib": { - "name": "bib", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "bmp": { - "name": "bmp", - "extensions": [ - "bmp" - ], - "mime_type": "image/bmp", - "group": "image", - "magic": "PC bitmap", - "description": "BMP image data", - "vt_type": "bmp", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "bmp", - "target_label": "bmp", - "correct_labels": [ - "bmp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "bpl": { - "name": "bpl", - "extensions": [ - "bpl" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "brainfuck": { - "name": "brainfuck", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "bzip": { - "name": "bzip", - "extensions": [ - "bz2", - "tbz2", - "tar.bz2" - ], - "mime_type": "application/x-bzip2", - "group": "archive", - "magic": "bzip2 compressed data", - "description": "bzip2 compressed data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "bzip", - "target_label": "bzip", - "correct_labels": [ - "bzip" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "c": { - "name": "c", - "extensions": [ - "c", - "cpp", - "h", - "hpp", - "cc" - ], - "mime_type": "text/x-c", - "group": "code", - "magic": "C source", - "description": "C source", - "vt_type": "c,cpp", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "c", - "target_label": "c", - "correct_labels": [ - "c", - "cpp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "cab": { - "name": "cab", - "extensions": [ - "cab" - ], - "mime_type": "application/vnd.ms-cab-compressed", - "group": "archive", - "magic": "Microsoft Cabinet archive data", - "description": "Microsoft Cabinet archive data", - "vt_type": "cab", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "cab", - "target_label": "cab", - "correct_labels": [ - "cab" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "cad": { - "name": "cad", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "cat": { - "name": "cat", - "extensions": [ - "cat" - ], - "mime_type": "application/octet-stream", - "group": "application", - "magic": "data", - "description": "Windows Catalog file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "cat", - "target_label": "cat", - "correct_labels": [ - "cat", - "ctl" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "cdf": { - "name": "cdf", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "chm": { - "name": "chm", - "extensions": [ - "chm" - ], - "mime_type": "application/chm", - "group": "application", - "magic": "MS Windows HtmlHelp Data", - "description": "MS Windows HtmlHelp Data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "chm", - "target_label": "chm", - "correct_labels": [ - "chm" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "clojure": { - "name": "clojure", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "cmake": { - "name": "cmake", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "cobol": { - "name": "cobol", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "coff": { - "name": "coff", - "extensions": [], - "mime_type": "application/x-coff", - "group": "executable", - "magic": "Intel 80386 COFF", - "description": "Intel 80386 COFF", - "vt_type": "coff", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "coff", - "target_label": "coff", - "correct_labels": [ - "coff", - "exp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "coffee": { - "name": "coffee", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "com": { - "name": "com", - "extensions": [], - "mime_type": "application/x-dosexec", - "group": null, - "magic": null, - "description": null, - "vt_type": "com", - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "cpl": { - "name": "cpl", - "extensions": [ - "cpl" - ], - "mime_type": "application/x-dosexec", - "group": "executable", - "magic": "PE32 executable", - "description": "PE Windows executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "cpl" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "cpp": { - "name": "cpp", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "crx": { - "name": "crx", - "extensions": [ - "crx" - ], - "mime_type": "application/x-chrome-extension", - "group": "executable", - "magic": "Google Chrome extension", - "description": "Google Chrome extension", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive", - "archive" - ], - "model_target_label": "crx", - "target_label": "crx", - "correct_labels": [ - "crx" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "cs": { - "name": "cs", - "extensions": [ - "cs" - ], - "mime_type": "text/plain", - "group": "code", - "magic": "ASCII text", - "description": "C# source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "cs", - "target_label": "cs", - "correct_labels": [ - "cs" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "css": { - "name": "css", - "extensions": [ - "css" - ], - "mime_type": "text/css", - "group": "code", - "magic": "ASCII text", - "description": "CSS source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "css", - "target_label": "css", - "correct_labels": [ - "css" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "csv": { - "name": "csv", - "extensions": [ - "csv" - ], - "mime_type": "text/csv", - "group": "code", - "magic": "CSV text", - "description": "CSV document", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "csv", - "target_label": "csv", - "correct_labels": [ - "csv" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "ctl": { - "name": "ctl", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "dart": { - "name": "dart", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "deb": { - "name": "deb", - "extensions": [ - "deb" - ], - "mime_type": "application/vnd.debian.binary-package", - "group": "archive", - "magic": "Debian binary package", - "description": "Debian binary package", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "deb", - "target_label": "deb", - "correct_labels": [ - "deb" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "dex": { - "name": "dex", - "extensions": [ - "dex" - ], - "mime_type": "application/x-android-dex", - "group": "executable", - "magic": "Dalvik dex file", - "description": "Dalvik dex file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "dex", - "target_label": "dex", - "correct_labels": [ - "dex" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "dey": { - "name": "dey", - "extensions": [], - "mime_type": "application/x-android-dey", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "diff": { - "name": "diff", - "extensions": [ - "diff" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "directory": { - "name": "directory", - "extensions": [], - "mime_type": "inode/directory", - "group": "inode", - "magic": "directory", - "description": "A directory", - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": "directory", - "correct_labels": [ - "directory" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": false - }, - "dll": { - "name": "dll", - "extensions": [ - "dll" - ], - "mime_type": "application/x-dosexec", - "group": "executable", - "magic": "PE Windows executable", - "description": "PE Windows executable", - "vt_type": "pedll", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "dll" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "dm": { - "name": "dm", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "dmg": { - "name": "dmg", - "extensions": [ - "dmg" - ], - "mime_type": "application/x-apple-diskimage", - "group": "archive", - "magic": "Apple disk image", - "description": "Apple disk image", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "dmg", - "target_label": "dmg", - "correct_labels": [ - "dmg" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "doc": { - "name": "doc", - "extensions": [ - "doc" - ], - "mime_type": "application/msword", - "group": "document", - "magic": "Composite Document File", - "description": "Microsoft Word CDF document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "cdf" - ], - "model_target_label": "cdf", - "target_label": "doc", - "correct_labels": [ - "doc" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "dockerfile": { - "name": "dockerfile", - "extensions": [ - "=Dockerfile" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "docx": { - "name": "docx", - "extensions": [ - "docx", - "docm" - ], - "mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "group": "document", - "magic": "Microsoft Word 2007+", - "description": "Microsoft Word 2007+ document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "ooxml", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "docx", - "correct_labels": [ - "docx", - "tmdx" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "dosmbr": { - "name": "dosmbr", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "dylib": { - "name": "dylib", - "extensions": [ - "dylib" - ], - "mime_type": "application/x-mach-o", - "group": "executable", - "magic": "Mach-O executable", - "description": "Mach-O executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "macho" - ], - "model_target_label": "macho", - "target_label": "macho", - "correct_labels": [ - "macho", - "dylib" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "elf": { - "name": "elf", - "extensions": [ - "elf", - "so" - ], - "mime_type": "application/x-executable-elf", - "group": "executable", - "magic": "ELF executable", - "description": "ELF executable", - "vt_type": "elf", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary", - "elf" - ], - "model_target_label": "elf", - "target_label": "elf", - "correct_labels": [ - "elf", - "so" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "elixir": { - "name": "elixir", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "emf": { - "name": "emf", - "extensions": [ - "emf" - ], - "mime_type": "application/octet-stream", - "group": "application", - "magic": "Windows Enhanced Metafile", - "description": "Windows Enhanced Metafile image data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "emf", - "target_label": "emf", - "correct_labels": [ - "emf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "eml": { - "name": "eml", - "extensions": [ - "eml" - ], - "mime_type": "message/rfc822", - "group": "text", - "magic": "RFC 822 mail", - "description": "RFC 822 mail", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "eml", - "target_label": "eml", - "correct_labels": [ - "eml" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "empty": { - "name": "empty", - "extensions": [], - "mime_type": "inode/x-empty", - "group": "inode", - "magic": "empty", - "description": "Empty file", - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": "empty", - "correct_labels": [ - "empty" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": false - }, - "epub": { - "name": "epub", - "extensions": [ - "epub" - ], - "mime_type": "application/epub+zip", - "group": "document", - "magic": "EPUB document", - "description": "EPUB document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "epub", - "correct_labels": [ - "epub" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "erlang": { - "name": "erlang", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ese": { - "name": "ese", - "extensions": [], - "mime_type": "application/x-ms-ese", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "exe": { - "name": "exe", - "extensions": [ - "exe" - ], - "mime_type": "application/x-dosexec", - "group": "executable", - "magic": "ELF executable", - "description": "ELF executable", - "vt_type": "peexe", - "datasets": [ - "vt-ext", - "vt-ext-malicious" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "exe" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "exp": { - "name": "exp", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "flac": { - "name": "flac", - "extensions": [ - "flac" - ], - "mime_type": "audio/flac", - "group": "audio", - "magic": "FLAC audio bitstream data", - "description": "FLAC audio bitstream data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "flac", - "target_label": "flac", - "correct_labels": [ - "flac" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "fortran": { - "name": "fortran", - "extensions": [ - "f90", - "f95", - "f03" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "fpx": { - "name": "fpx", - "extensions": [ - "fpx" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": "fpx", - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "gif": { - "name": "gif", - "extensions": [ - "gif" - ], - "mime_type": "image/gif", - "group": "image", - "magic": "GIF image data", - "description": "GIF image data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "gif", - "target_label": "gif", - "correct_labels": [ - "gif" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "go": { - "name": "go", - "extensions": [ - "go" - ], - "mime_type": "text/x-golang", - "group": "code", - "magic": "ASCII text", - "description": "Golang source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "go", - "target_label": "go", - "correct_labels": [ - "go" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "gpx": { - "name": "gpx", - "extensions": [ - "gpx" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "groovy": { - "name": "groovy", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "gzip": { - "name": "gzip", - "extensions": [ - "gz", - "gzip", - "tgz", - "tar.gz" - ], - "mime_type": "application/gzip", - "group": "archive", - "magic": "gzip compressed data", - "description": "gzip compressed data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "gzip", - "target_label": "gzip", - "correct_labels": [ - "gzip" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "haskell": { - "name": "haskell", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "hfs": { - "name": "hfs", - "extensions": [ - "hfs" - ], - "mime_type": "application/x-hfs", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "hlp": { - "name": "hlp", - "extensions": [ - "hlp" - ], - "mime_type": "application/winhlp", - "group": "application", - "magic": "MS Windows help", - "description": "MS Windows help", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "hlp", - "target_label": "hlp", - "correct_labels": [ - "hlp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "hta": { - "name": "hta", - "extensions": [ - "hta" - ], - "mime_type": "application/hta", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "html": { - "name": "html", - "extensions": [ - "html", - "htm", - "xhtml", - "xht" - ], - "mime_type": "text/html", - "group": "code", - "magic": "HTML document", - "description": "HTML document", - "vt_type": "html", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "html", - "target_label": "html", - "correct_labels": [ - "html" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "hve": { - "name": "hve", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ico": { - "name": "ico", - "extensions": [ - "ico" - ], - "mime_type": "image/vnd.microsoft.icon", - "group": "image", - "magic": "MS Windows icon resource", - "description": "MS Windows icon resource", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "ico", - "target_label": "ico", - "correct_labels": [ - "ico" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "img": { - "name": "img", - "extensions": [ - "img" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ini": { - "name": "ini", - "extensions": [ - "ini" - ], - "mime_type": "text/plain", - "group": "text", - "magic": "Generic INItialization configuration", - "description": "INI configuration file", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "ini", - "target_label": "ini", - "correct_labels": [ - "ini" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "internetshortcut": { - "name": "internetshortcut", - "extensions": [ - "url" - ], - "mime_type": "application/x-mswinurl", - "group": "application", - "magic": "MS Windows 95 Internet shortcut", - "description": "MS Windows Internet shortcut", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "internetshortcut", - "target_label": "internetshortcut", - "correct_labels": [ - "internetshortcut" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "iosapp": { - "name": "iosapp", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "iso": { - "name": "iso", - "extensions": [ - "iso" - ], - "mime_type": "application/x-iso9660-image", - "group": "archive", - "magic": "ISO 9660 CD-ROM filesystem data", - "description": "ISO 9660 CD-ROM filesystem data", - "vt_type": "isoimage", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "iso", - "target_label": "iso", - "correct_labels": [ - "iso", - "udf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "jar": { - "name": "jar", - "extensions": [ - "jar" - ], - "mime_type": "application/java-archive", - "group": "archive", - "magic": "Java archive data (JAR)", - "description": "Java archive data (JAR)", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive", - "archive" - ], - "model_target_label": "jar", - "target_label": "jar", - "correct_labels": [ - "jar" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "java": { - "name": "java", - "extensions": [ - "java" - ], - "mime_type": "text/x-java", - "group": "code", - "magic": "Java source", - "description": "Java source", - "vt_type": "java", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "java", - "target_label": "java", - "correct_labels": [ - "java" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "javabytecode": { - "name": "javabytecode", - "extensions": [ - "class" - ], - "mime_type": "application/x-java-applet", - "group": "executable", - "magic": "compiled Java class data", - "description": "Java compiled bytecode", - "vt_type": "class", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "javabytecode", - "target_label": "javabytecode", - "correct_labels": [ - "javabytecode" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "javascript": { - "name": "javascript", - "extensions": [ - "js" - ], - "mime_type": "application/javascript", - "group": "code", - "magic": "JavaScript source", - "description": "JavaScript source", - "vt_type": "javascript", - "datasets": [ - "github", - "vt-ext", - "vt-ext-malicious" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "javascript", - "target_label": "javascript", - "correct_labels": [ - "javascript", - "typescript" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "jpeg": { - "name": "jpeg", - "extensions": [ - "jpg", - "jpeg" - ], - "mime_type": "image/jpeg", - "group": "image", - "magic": "JPEG image data", - "description": "JPEG image data", - "vt_type": "jpeg", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "jpeg", - "target_label": "jpeg", - "correct_labels": [ - "jpeg" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "json": { - "name": "json", - "extensions": [ - "json" - ], - "mime_type": "application/json", - "group": "code", - "magic": "JSON data", - "description": "JSON document", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "json", - "target_label": "json", - "correct_labels": [ - "json" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "julia": { - "name": "julia", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ko": { - "name": "ko", - "extensions": [ - "ko" - ], - "mime_type": "application/x-executable-elf", - "group": "executable", - "magic": "ELF executable", - "description": "ELF executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "elf" - ], - "model_target_label": "elf", - "target_label": "elf", - "correct_labels": [ - "elf", - "ko" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "kotlin": { - "name": "kotlin", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "latex": { - "name": "latex", - "extensions": [ - "tex" - ], - "mime_type": "text/x-tex", - "group": "text", - "magic": "LaTeX document", - "description": "LaTeX document", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "latex", - "target_label": "latex", - "correct_labels": [ - "latex" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "lisp": { - "name": "lisp", - "extensions": [ - "lisp" - ], - "mime_type": "text/x-lisp", - "group": "code", - "magic": "Lisp/Scheme program", - "description": "Lisp source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "lisp", - "target_label": "lisp", - "correct_labels": [ - "lisp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "lnk": { - "name": "lnk", - "extensions": [ - "lnk" - ], - "mime_type": "application/x-ms-shortcut", - "group": "application", - "magic": "MS Windows shortcut", - "description": "MS Windows shortcut", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "lnk", - "target_label": "lnk", - "correct_labels": [ - "lnk" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "lua": { - "name": "lua", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "m3u": { - "name": "m3u", - "extensions": [ - "m3u8", - "m3u" - ], - "mime_type": "text/plain", - "group": "application", - "magic": "M3U playlist", - "description": "M3U playlist", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "m3u", - "target_label": "m3u", - "correct_labels": [ - "m3u" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "macho": { - "name": "macho", - "extensions": [], - "mime_type": "application/x-mach-o", - "group": "executable", - "magic": "Mach-O executable", - "description": "Mach-O executable", - "vt_type": "macho", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary", - "macho" - ], - "model_target_label": "macho", - "target_label": "macho", - "correct_labels": [ - "macho", - "dylib" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "maff": { - "name": "maff", - "extensions": [ - "maff" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "makefile": { - "name": "makefile", - "extensions": [ - "=Makefile" - ], - "mime_type": "text/x-makefile", - "group": "code", - "magic": "makefile script", - "description": "Makefile source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "makefile", - "target_label": "makefile", - "correct_labels": [ - "makefile" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "markdown": { - "name": "markdown", - "extensions": [ - "md" - ], - "mime_type": "text/markdown", - "group": "text", - "magic": "ASCII text", - "description": "Markdown document", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "markdown", - "target_label": "markdown", - "correct_labels": [ - "markdown" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "matlab": { - "name": "matlab", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "mht": { - "name": "mht", - "extensions": [ - "mht" - ], - "mime_type": "application/x-mimearchive", - "group": "code", - "magic": "HTML document", - "description": "MHTML document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "mht", - "target_label": "mht", - "correct_labels": [ - "mht" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "mkv": { - "name": "mkv", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "mov": { - "name": "mov", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "mp3": { - "name": "mp3", - "extensions": [ - "mp3" - ], - "mime_type": "audio/mpeg", - "group": "audio", - "magic": "Audio file with ID3", - "description": "MP3 media file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "mp3", - "target_label": "mp3", - "correct_labels": [ - "mp3" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "mp4": { - "name": "mp4", - "extensions": [ - "mov", - "mp4" - ], - "mime_type": "video/mp4", - "group": "video", - "magic": "ISO Media", - "description": "MP4 media file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "mp4", - "target_label": "mp4", - "correct_labels": [ - "mp4" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "mscompress": { - "name": "mscompress", - "extensions": [], - "mime_type": "application/x-ms-compress-szdd", - "group": "archive", - "magic": "MS Compress archive data", - "description": "MS Compress archive data", - "vt_type": "mscompress", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "mscompress", - "target_label": "mscompress", - "correct_labels": [ - "mscompress" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "msi": { - "name": "msi", - "extensions": [ - "msi" - ], - "mime_type": "application/x-msi", - "group": "archive", - "magic": "Composite Document File", - "description": "Microsoft Installer file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "cdf" - ], - "model_target_label": "msi", - "target_label": "msi", - "correct_labels": [ - "msi" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "mst": { - "name": "mst", - "extensions": [ - "mst" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "msvisio": { - "name": "msvisio", - "extensions": [], - "mime_type": "application/vnd.ms-visio.drawing.main+xml", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "mui": { - "name": "mui", - "extensions": [ - "mui" - ], - "mime_type": "application/x-dosexec", - "group": "application", - "magic": "PE Windows executable", - "description": "PE Windows executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "mui" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "mum": { - "name": "mum", - "extensions": [ - "mum" - ], - "mime_type": "text/xml", - "group": "application", - "magic": "XML document", - "description": "Windows Update Package file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "mum", - "target_label": "mum", - "correct_labels": [ - "mum" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "mun": { - "name": "mun", - "extensions": [ - "mun" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "nim": { - "name": "nim", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "null": { - "name": "null", - "extensions": [ - "null" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "object": { - "name": "object", - "extensions": [ - "o" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "objectivec": { - "name": "objectivec", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ocaml": { - "name": "ocaml", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ocx": { - "name": "ocx", - "extensions": [ - "ocx" - ], - "mime_type": "application/x-dosexec", - "group": "executable", - "magic": "PE Windows executable", - "description": "PE Windows executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "ax", - "ocx" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "odex": { - "name": "odex", - "extensions": [ - "odex" - ], - "mime_type": "application/x-executable-elf", - "group": "executable", - "magic": "ELF executable", - "description": "ODEX ELF executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "elf" - ], - "model_target_label": "odex", - "target_label": "odex", - "correct_labels": [ - "odex", - "elf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "odp": { - "name": "odp", - "extensions": [ - "odp" - ], - "mime_type": "application/vnd.oasis.opendocument.presentation", - "group": "document", - "magic": "OpenDocument Presentation", - "description": "OpenDocument Presentation", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive" - ], - "model_target_label": "odp", - "target_label": "odp", - "correct_labels": [ - "odp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "ods": { - "name": "ods", - "extensions": [ - "ods" - ], - "mime_type": "application/vnd.oasis.opendocument.spreadsheet", - "group": "document", - "magic": "OpenDocument Spreadsheet", - "description": "OpenDocument Spreadsheet", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive" - ], - "model_target_label": "ods", - "target_label": "ods", - "correct_labels": [ - "ods" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "odt": { - "name": "odt", - "extensions": [ - "odt" - ], - "mime_type": "application/vnd.oasis.opendocument.text", - "group": "document", - "magic": "OpenDocument Text", - "description": "OpenDocument Text", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive" - ], - "model_target_label": "odt", - "target_label": "odt", - "correct_labels": [ - "odt" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "ogg": { - "name": "ogg", - "extensions": [ - "ogg" - ], - "mime_type": "audio/ogg", - "group": "audio", - "magic": "Ogg data", - "description": "Ogg data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "ogg", - "target_label": "ogg", - "correct_labels": [ - "ogg" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "ole": { - "name": "ole", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ooxml": { - "name": "ooxml", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "outlook": { - "name": "outlook", - "extensions": [], - "mime_type": "application/vnd.ms-outlook", - "group": "application", - "magic": "CDFV2 Microsoft Outlook Message", - "description": "MS Outlook Message", - "vt_type": "outlook", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary", - "cdf" - ], - "model_target_label": "cdf", - "target_label": "outlook", - "correct_labels": [ - "outlook" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "palmos": { - "name": "palmos", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": "palmos", - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "pascal": { - "name": "pascal", - "extensions": [ - "pascal" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "pbm": { - "name": "pbm", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "pcap": { - "name": "pcap", - "extensions": [ - "pcap", - "pcapng" - ], - "mime_type": "application/vnd.tcpdump.pcap", - "group": "application", - "magic": "pcap capture file", - "description": "pcap capture file", - "vt_type": "pcap", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "pcap", - "target_label": "pcap", - "correct_labels": [ - "pcap" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "pdf": { - "name": "pdf", - "extensions": [ - "pdf" - ], - "mime_type": "application/pdf", - "group": "document", - "magic": "PDF document", - "description": "PDF document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "pdf", - "target_label": "pdf", - "correct_labels": [ - "pdf", - "ai" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "pebin": { - "name": "pebin", - "extensions": [ - "exe", - "dll", - "sys" - ], - "mime_type": "application/x-dosexec", - "group": "executable", - "magic": "PE executable", - "description": "PE executable", - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": true, - "in_scope_for_training": false - }, - "pem": { - "name": "pem", - "extensions": [ - "pem", - "pub" - ], - "mime_type": "application/x-pem-file", - "group": "application", - "magic": "PEM certificate", - "description": "PEM certificate", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "pem", - "target_label": "pem", - "correct_labels": [ - "pem", - "pgpkey" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "perl": { - "name": "perl", - "extensions": [ - "pl" - ], - "mime_type": "text/x-perl", - "group": "code", - "magic": "Perl script text executable", - "description": "Perl source", - "vt_type": "perl", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "perl", - "target_label": "perl", - "correct_labels": [ - "perl" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "pgpkey": { - "name": "pgpkey", - "extensions": [], - "mime_type": "application/pgp-keys", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "php": { - "name": "php", - "extensions": [ - "php" - ], - "mime_type": "text/x-php", - "group": "code", - "magic": "PHP script", - "description": "PHP source", - "vt_type": "php", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "php", - "target_label": "php", - "correct_labels": [ - "php" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "png": { - "name": "png", - "extensions": [ - "png" - ], - "mime_type": "image/png", - "group": "image", - "magic": "PNG image data", - "description": "PNG image data", - "vt_type": "png", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "png", - "target_label": "png", - "correct_labels": [ - "png" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "postscript": { - "name": "postscript", - "extensions": [ - "ps" - ], - "mime_type": "application/postscript", - "group": "document", - "magic": "PostScript document text", - "description": "PostScript document", - "vt_type": "postscript", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "postscript", - "target_label": "postscript", - "correct_labels": [ - "postscript" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "powershell": { - "name": "powershell", - "extensions": [ - "ps1" - ], - "mime_type": "application/x-powershell", - "group": "code", - "magic": "a powershell script", - "description": "Powershell source", - "vt_type": "ps", - "datasets": [ - "github", - "vt-ext", - "vt-ext-malicious" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "powershell", - "target_label": "powershell", - "correct_labels": [ - "powershell" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "ppt": { - "name": "ppt", - "extensions": [ - "ppt" - ], - "mime_type": "application/vnd.ms-powerpoint", - "group": "document", - "magic": "Composite Document File", - "description": "Microsoft PowerPoint CDF document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "cdf" - ], - "model_target_label": "cdf", - "target_label": "ppt", - "correct_labels": [ - "ppt" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "pptx": { - "name": "pptx", - "extensions": [ - "pptx", - "pptm" - ], - "mime_type": "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "group": "document", - "magic": "Microsoft PowerPoint 2007+", - "description": "Microsoft PowerPoint 2007+ document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "ooxml", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "pptx", - "correct_labels": [ - "pptx" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "printfox": { - "name": "printfox", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "prolog": { - "name": "prolog", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "pub": { - "name": "pub", - "extensions": [ - "pub" - ], - "mime_type": "application/x-mspublisher", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [ - "binary", - "cdf" - ], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "python": { - "name": "python", - "extensions": [ - "py" - ], - "mime_type": "text/x-python", - "group": "code", - "magic": "Python script", - "description": "Python source", - "vt_type": "python", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "python", - "target_label": "python", - "correct_labels": [ - "python" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "pythonbytecode": { - "name": "pythonbytecode", - "extensions": [ - "pyc", - "pyo" - ], - "mime_type": "application/x-bytecode.python", - "group": "executable", - "magic": "python byte-compiled", - "description": "Python compiled bytecode", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "pythonbytecode", - "target_label": "pythonbytecode", - "correct_labels": [ - "pythonbytecode" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "pythonpar": { - "name": "pythonpar", - "extensions": [ - "par" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "r": { - "name": "r", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "randombytes": { - "name": "randombytes", - "extensions": [], - "mime_type": "application/octet-stream", - "group": "unknown", - "magic": "data", - "description": "Random bytes", - "vt_type": null, - "datasets": [ - "synthetic" - ], - "parent": null, - "tags": [], - "model_target_label": "unknown", - "target_label": "unknown", - "correct_labels": [ - "unknown" - ], - "in_scope_for_output_content_type": false, - "in_scope_for_training": true - }, - "rar": { - "name": "rar", - "extensions": [ - "rar" - ], - "mime_type": "application/x-rar", - "group": "archive", - "magic": "RAR archive data", - "description": "RAR archive data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "rar", - "target_label": "rar", - "correct_labels": [ - "rar" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "rdf": { - "name": "rdf", - "extensions": [ - "rdf" - ], - "mime_type": "application/rdf+xml", - "group": "text", - "magic": "XML document", - "description": "Resource Description Framework document (RDF)", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "rdf", - "target_label": "rdf", - "correct_labels": [ - "rdf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "rll": { - "name": "rll", - "extensions": [ - "rll" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "rpm": { - "name": "rpm", - "extensions": [ - "rpm" - ], - "mime_type": "application/x-rpm", - "group": "archive", - "magic": "RPM", - "description": "RedHat Package Manager archive (RPM)", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "rpm", - "target_label": "rpm", - "correct_labels": [ - "rpm" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "rst": { - "name": "rst", - "extensions": [ - "rst" - ], - "mime_type": "text/x-rst", - "group": "text", - "magic": "ReStructuredText file", - "description": "ReStructuredText document", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "rst", - "target_label": "rst", - "correct_labels": [ - "rst" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "rtf": { - "name": "rtf", - "extensions": [ - "rtf" - ], - "mime_type": "text/rtf", - "group": "text", - "magic": "Rich Text Format data", - "description": "Rich Text Format document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "rtf", - "target_label": "rtf", - "correct_labels": [ - "rtf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "ruby": { - "name": "ruby", - "extensions": [ - "rb" - ], - "mime_type": "application/x-ruby", - "group": "code", - "magic": "Ruby script", - "description": "Ruby source", - "vt_type": "ruby", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "ruby", - "target_label": "ruby", - "correct_labels": [ - "ruby" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "rust": { - "name": "rust", - "extensions": [ - "rs" - ], - "mime_type": "application/x-rust", - "group": "code", - "magic": "ASCII text", - "description": "Rust source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "rust", - "target_label": "rust", - "correct_labels": [ - "rust" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "s": { - "name": "s", - "extensions": [ - "s" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "scala": { - "name": "scala", - "extensions": [ - "scala" - ], - "mime_type": "application/x-scala", - "group": "code", - "magic": "ASCII text", - "description": "Scala source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "scala", - "target_label": "scala", - "correct_labels": [ - "scala" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "scr": { - "name": "scr", - "extensions": [ - "scr" - ], - "mime_type": "application/x-dosexec", - "group": "executable", - "magic": "PE Windows executable", - "description": "PE Windows executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "scr" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "scriptwsf": { - "name": "scriptwsf", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "sevenzip": { - "name": "sevenzip", - "extensions": [ - "7z" - ], - "mime_type": "application/x-7z-compressed", - "group": "archive", - "magic": "7-zip archive data", - "description": "7-zip archive data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "sevenzip", - "target_label": "sevenzip", - "correct_labels": [ - "sevenzip" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "sgml": { - "name": "sgml", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "sh3d": { - "name": "sh3d", - "extensions": [ - "sh3d" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "shell": { - "name": "shell", - "extensions": [ - "sh" - ], - "mime_type": "text/x-shellscript", - "group": "code", - "magic": "shell script", - "description": "Shell script", - "vt_type": "shell", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "shell", - "target_label": "shell", - "correct_labels": [ - "shell" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "smali": { - "name": "smali", - "extensions": [ - "smali" - ], - "mime_type": "application/x-smali", - "group": "code", - "magic": "ASCII text", - "description": "Smali source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "smali", - "target_label": "smali", - "correct_labels": [ - "smali" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "so": { - "name": "so", - "extensions": [ - "so" - ], - "mime_type": "application/x-executable-elf", - "group": "executable", - "magic": "ELF executable", - "description": "ELF executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "elf" - ], - "model_target_label": "elf", - "target_label": "elf", - "correct_labels": [ - "elf", - "so" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "sql": { - "name": "sql", - "extensions": [ - "sql" - ], - "mime_type": "application/x-sql", - "group": "code", - "magic": "ASCII text", - "description": "SQL source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "sql", - "target_label": "sql", - "correct_labels": [ - "sql" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "squashfs": { - "name": "squashfs", - "extensions": [], - "mime_type": "application/octet-stream", - "group": "archive", - "magic": "Squashfs filesystem", - "description": "Squash filesystem", - "vt_type": "squashfs", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "squashfs", - "target_label": "squashfs", - "correct_labels": [ - "squashfs" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "svd": { - "name": "svd", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "svg": { - "name": "svg", - "extensions": [ - "svg" - ], - "mime_type": "image/svg+xml", - "group": "image", - "magic": "SVG Scalable Vector Graphics image", - "description": "SVG Scalable Vector Graphics image data", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "svg", - "target_label": "svg", - "correct_labels": [ - "svg" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "swf": { - "name": "swf", - "extensions": [ - "swf" - ], - "mime_type": "application/x-shockwave-flash", - "group": "executable", - "magic": "Macromedia Flash data", - "description": "Macromedia Flash data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "swf", - "target_label": "swf", - "correct_labels": [ - "swf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "swift": { - "name": "swift", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "symlinktext": { - "name": "symlinktext", - "extensions": [], - "mime_type": "text/plain", - "group": "application", - "magic": "ASCII text", - "description": "Symbolic link (textual representation)", - "vt_type": null, - "datasets": [ - "synthetic" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "symlinktext", - "target_label": "symlinktext", - "correct_labels": [ - "symlinktext", - "txt" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "symlink": { - "name": "symlink", - "extensions": [], - "mime_type": "inode/symlink", - "group": "inode", - "magic": "symbolic link to ", - "description": "Symbolic link to ", - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": "symlink", - "correct_labels": [ - "symlink" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": false - }, - "sys": { - "name": "sys", - "extensions": [ - "sys" - ], - "mime_type": "application/x-windows-driver", - "group": "executable", - "magic": "PE Windows executable", - "description": "PE Windows executable", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "pebin" - ], - "model_target_label": "pebin", - "target_label": "pebin", - "correct_labels": [ - "pebin", - "sys" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "tar": { - "name": "tar", - "extensions": [ - "tar" - ], - "mime_type": "application/x-tar", - "group": "archive", - "magic": "POSIX tar archive", - "description": "POSIX tar archive", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "tar", - "target_label": "tar", - "correct_labels": [ - "tar" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "tga": { - "name": "tga", - "extensions": [ - "tga" - ], - "mime_type": "image/x-tga", - "group": "image", - "magic": "Targa image data", - "description": "Targa image data", - "vt_type": "targa", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "tga", - "target_label": "tga", - "correct_labels": [ - "tga" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "tiff": { - "name": "tiff", - "extensions": [ - "tiff", - "tif" - ], - "mime_type": "image/tiff", - "group": "image", - "magic": "TIFF image data", - "description": "TIFF image data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "tiff", - "target_label": "tiff", - "correct_labels": [ - "tiff" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "tmdx": { - "name": "tmdx", - "extensions": [ - "tmdx", - "tmvx" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "toml": { - "name": "toml", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "torrent": { - "name": "torrent", - "extensions": [ - "torrent" - ], - "mime_type": "application/x-bittorrent", - "group": "application", - "magic": "BitTorrent file", - "description": "BitTorrent file", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "torrent", - "target_label": "torrent", - "correct_labels": [ - "torrent" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "troff": { - "name": "troff", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "ttf": { - "name": "ttf", - "extensions": [ - "ttf" - ], - "mime_type": "font/sfnt", - "group": "font", - "magic": "TrueType Font data", - "description": "TrueType Font data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "ttf", - "target_label": "ttf", - "correct_labels": [ - "ttf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "txt": { - "name": "txt", - "extensions": [ - "txt" - ], - "mime_type": "text/plain", - "group": "text", - "magic": "ASCII text", - "description": "Generic text document", - "vt_type": null, - "datasets": [ - "github", - "synthetic" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "txt", - "target_label": "txt", - "correct_labels": [ - "txt" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "typescript": { - "name": "typescript", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "udf": { - "name": "udf", - "extensions": [], - "mime_type": "application/x-udf-image", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "unixcompress": { - "name": "unixcompress", - "extensions": [ - "z" - ], - "mime_type": "application/x-compress", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "unknown": { - "name": "unknown", - "extensions": [], - "mime_type": "application/octet-stream", - "group": "unknown", - "magic": "data", - "description": "Unknown binary data", - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "unknown", - "target_label": "unknown", - "correct_labels": [ - "unknown" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": false - }, - "vba": { - "name": "vba", - "extensions": [ - "vbs" - ], - "mime_type": "text/vbscript", - "group": "code", - "magic": "ASCII text", - "description": "MS Visual Basic source (VBA)", - "vt_type": "vba", - "datasets": [ - "vt-ext", - "vt-ext-malicious" - ], - "parent": null, - "tags": [ - "text" - ], - "model_target_label": "vba", - "target_label": "vba", - "correct_labels": [ - "vba" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "verilog": { - "name": "verilog", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "vhd": { - "name": "vhd", - "extensions": [], - "mime_type": "application/x-vhd", - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "wasm": { - "name": "wasm", - "extensions": [ - "wasm" - ], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "wav": { - "name": "wav", - "extensions": [ - "wav" - ], - "mime_type": "audio/x-wav", - "group": "audio", - "magic": "RIFF data", - "description": "Waveform Audio file (WAV)", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "wav", - "target_label": "wav", - "correct_labels": [ - "wav" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "webm": { - "name": "webm", - "extensions": [ - "webm" - ], - "mime_type": "video/webm", - "group": "video", - "magic": "WebM", - "description": "WebM data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "webm", - "target_label": "webm", - "correct_labels": [ - "webm" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "webp": { - "name": "webp", - "extensions": [ - "webp" - ], - "mime_type": "image/webp", - "group": "image", - "magic": "RIFF data", - "description": "WebP data", - "vt_type": "webp", - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "media" - ], - "model_target_label": "webp", - "target_label": "webp", - "correct_labels": [ - "webp" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "winregistry": { - "name": "winregistry", - "extensions": [ - "reg" - ], - "mime_type": "text/x-ms-regedit", - "group": "application", - "magic": "Windows Registry text", - "description": "Windows Registry text", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "winregistry", - "target_label": "winregistry", - "correct_labels": [ - "winregistry" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "wmf": { - "name": "wmf", - "extensions": [ - "wmf" - ], - "mime_type": "image/wmf", - "group": "image", - "magic": "Windows metafile", - "description": "Windows metafile", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "wmf", - "target_label": "wmf", - "correct_labels": [ - "wmf" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "woff": { - "name": "woff", - "extensions": [], - "mime_type": null, - "group": null, - "magic": null, - "description": null, - "vt_type": null, - "datasets": [], - "parent": null, - "tags": [], - "model_target_label": null, - "target_label": null, - "correct_labels": [], - "in_scope_for_output_content_type": false, - "in_scope_for_training": false - }, - "xar": { - "name": "xar", - "extensions": [ - "pkg", - "xar" - ], - "mime_type": "application/x-xar", - "group": "archive", - "magic": "xar archive compressed", - "description": "XAR archive compressed data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "xar", - "target_label": "xar", - "correct_labels": [ - "xar" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "xls": { - "name": "xls", - "extensions": [ - "xls" - ], - "mime_type": "application/vnd.ms-excel", - "group": "document", - "magic": "Composite Document File", - "description": "Microsoft Excel CDF document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "cdf" - ], - "model_target_label": "cdf", - "target_label": "xls", - "correct_labels": [ - "xls" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "xlsb": { - "name": "xlsb", - "extensions": [ - "xlsb" - ], - "mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "group": "document", - "magic": "Microsoft Excel 2007+", - "description": "Microsoft Excel 2007+ document (binary format)", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "ooxml", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "xlsb", - "correct_labels": [ - "xlsb", - "xlsx" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "xlsx": { - "name": "xlsx", - "extensions": [ - "xlsx", - "xlsm" - ], - "mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "group": "document", - "magic": "Microsoft Excel 2007+", - "description": "Microsoft Excel 2007+ document", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "ooxml", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "xlsx", - "correct_labels": [ - "xlsx", - "xlsb" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "xml": { - "name": "xml", - "extensions": [ - "xml" - ], - "mime_type": "text/xml", - "group": "code", - "magic": "XML document", - "description": "XML document", - "vt_type": "xml", - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "xml", - "target_label": "xml", - "correct_labels": [ - "xml" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "xpi": { - "name": "xpi", - "extensions": [ - "xpi" - ], - "mime_type": "application/zip", - "group": "archive", - "magic": "Zip archive data", - "description": "Compressed installation archive (XPI)", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "xpi", - "correct_labels": [ - "xpi" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "xz": { - "name": "xz", - "extensions": [ - "xz" - ], - "mime_type": "application/x-xz", - "group": "archive", - "magic": "XZ compressed data", - "description": "XZ compressed data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "archive" - ], - "model_target_label": "xz", - "target_label": "xz", - "correct_labels": [ - "xz" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "yaml": { - "name": "yaml", - "extensions": [ - "yml", - "yaml" - ], - "mime_type": "application/x-yaml", - "group": "code", - "magic": "ASCII text", - "description": "YAML source", - "vt_type": null, - "datasets": [ - "github" - ], - "parent": null, - "tags": [ - "text", - "dl_target" - ], - "model_target_label": "yaml", - "target_label": "yaml", - "correct_labels": [ - "yaml" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "zip": { - "name": "zip", - "extensions": [ - "zip" - ], - "mime_type": "application/zip", - "group": "archive", - "magic": "Zip archive data", - "description": "Zip archive data", - "vt_type": null, - "datasets": [ - "vt-ext" - ], - "parent": null, - "tags": [ - "binary", - "zip_archive", - "archive" - ], - "model_target_label": "zip", - "target_label": "zip", - "correct_labels": [ - "zip" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - }, - "zlibstream": { - "name": "zlibstream", - "extensions": [], - "mime_type": "application/zlib", - "group": "application", - "magic": "zlib compressed data", - "description": "zlib compressed data", - "vt_type": "zlib", - "datasets": [ - "vt-type" - ], - "parent": null, - "tags": [ - "binary" - ], - "model_target_label": "zlibstream", - "target_label": "zlibstream", - "correct_labels": [ - "zlibstream" - ], - "in_scope_for_output_content_type": true, - "in_scope_for_training": true - } -} diff --git a/python/magika/config/content_types_kb.min.json b/python/magika/config/content_types_kb.min.json new file mode 100644 index 00000000..fa959868 --- /dev/null +++ b/python/magika/config/content_types_kb.min.json @@ -0,0 +1 @@ +{"3gp": {"mime_type": "video/3gpp", "group": "video", "description": null, "extensions": ["3gp"], "is_text": false}, "3ds": {"mime_type": "application/octet-stream", "group": "unknown", "description": "Nintendo 3DS roms", "extensions": ["3ds"], "is_text": false}, "3dsx": {"mime_type": "application/octet-stream", "group": "unknown", "description": "Nintendo 3DS homebrew", "extensions": ["3dsx"], "is_text": false}, "3dsm": {"mime_type": "application/x-3ds", "group": "image", "description": "3D studio Max", "extensions": ["3ds"], "is_text": false}, "3mf": {"mime_type": "application/vnd.ms-package.3dmanufacturing-3dmodel+xml", "group": "image", "description": "3D Manufacturing Format", "extensions": ["3mf"], "is_text": false}, "abnf": {"mime_type": null, "group": null, "description": "augmented Backus\u2013Naur form", "extensions": ["abnf"], "is_text": false}, "ace": {"mime_type": "application/x-ace-compressed", "group": null, "description": "ACE", "extensions": ["ace"], "is_text": false}, "ada": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "aff": {"mime_type": null, "group": null, "description": "Hunspell Affix", "extensions": ["aff"], "is_text": true}, "ai": {"mime_type": "application/pdf", "group": "document", "description": "Adobe Illustrator Artwork", "extensions": ["ai"], "is_text": false}, "aidl": {"mime_type": null, "group": null, "description": "Android Interface Definition Language", "extensions": ["aidl"], "is_text": true}, "algol68": {"mime_type": null, "group": null, "description": null, "extensions": ["a68"], "is_text": false}, "ani": {"mime_type": "application/x-navi-animation", "group": null, "description": "Animated cursor", "extensions": ["ani"], "is_text": false}, "apk": {"mime_type": "application/vnd.android.package-archive", "group": "executable", "description": "Android package", "extensions": ["apk"], "is_text": false}, "applebplist": {"mime_type": "application/x-bplist", "group": "application", "description": "Apple binary property list", "extensions": ["bplist", "plist"], "is_text": false}, "appledouble": {"mime_type": "multipart/appledouble", "group": "unknown", "description": "AppleDouble", "extensions": [], "is_text": false}, "appleplist": {"mime_type": "application/x-plist", "group": "application", "description": "Apple property list", "extensions": ["plist"], "is_text": true}, "applesingle": {"mime_type": "application/applefile", "group": "unknown", "description": "AppleSingle", "extensions": [], "is_text": false}, "ar": {"mime_type": "application/x-archive", "group": null, "description": null, "extensions": [], "is_text": false}, "arc": {"mime_type": "application/x-arc", "group": "archive", "description": "Arc", "extensions": ["arc"], "is_text": false}, "arj": {"mime_type": "application/arj", "group": "archive", "description": "Arj", "extensions": [], "is_text": false}, "arrow": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "asc": {"mime_type": "application/pgp-signature", "group": "text", "description": "PGP", "extensions": ["asc"], "is_text": true}, "asd": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "asf": {"mime_type": "video/x-ms-wma", "group": null, "description": "Microsoft Advanced Systems Format", "extensions": ["asf"], "is_text": false}, "asm": {"mime_type": "text/x-asm", "group": "code", "description": "Assembly", "extensions": ["s", "S", "asm"], "is_text": true}, "asp": {"mime_type": "text/html", "group": "code", "description": "ASP source", "extensions": ["aspx", "asp"], "is_text": true}, "autohotkey": {"mime_type": "text/plain", "group": null, "description": "AutoHotKey", "extensions": [], "is_text": true}, "autoit": {"mime_type": "text/plain", "group": "text", "description": "AutoIt", "extensions": ["au3"], "is_text": true}, "avi": {"mime_type": "video/x-msvideo", "group": "video", "description": "Audio Video Interleave", "extensions": ["avi"], "is_text": false}, "avif": {"mime_type": "image/avif", "group": "video", "description": "AV1 Image File Format", "extensions": ["avif", "avifs"], "is_text": false}, "avro": {"mime_type": "application/x-avro-binary", "group": null, "description": "Apache Avro binary", "extensions": ["avro"], "is_text": false}, "awk": {"mime_type": "text/plain", "group": "text", "description": "Awk", "extensions": ["awk"], "is_text": true}, "ax": {"mime_type": "application/x-dosexec", "group": "executable", "description": "Directshow filter", "extensions": ["ax"], "is_text": false}, "batch": {"mime_type": "text/x-msdos-batch", "group": "code", "description": "DOS batch file", "extensions": ["bat"], "is_text": true}, "bazel": {"mime_type": null, "group": null, "description": null, "extensions": ["bzl"], "is_text": true}, "bcad": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "bib": {"mime_type": "text/x-bibtex", "group": "text", "description": "BibTeX", "extensions": ["bib"], "is_text": true}, "bmp": {"mime_type": "image/bmp", "group": "image", "description": "BMP image data", "extensions": ["bmp"], "is_text": false}, "bpg": {"mime_type": "image/bpg", "group": "image", "description": "BPG", "extensions": ["bpg"], "is_text": false}, "bpl": {"mime_type": null, "group": null, "description": null, "extensions": ["bpl"], "is_text": false}, "brainfuck": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "brf": {"mime_type": "text/plain", "group": "text", "description": "Braille Ready Format", "extensions": ["brf", "bfm"], "is_text": false}, "bzip": {"mime_type": "application/x-bzip2", "group": "archive", "description": "bzip2 compressed data", "extensions": ["bz2", "tbz2", "tar.bz2"], "is_text": false}, "bzip3": {"mime_type": null, "group": null, "description": "bzip3", "extensions": ["bz3"], "is_text": false}, "c": {"mime_type": "text/x-c", "group": "code", "description": "C source", "extensions": ["c"], "is_text": true}, "cab": {"mime_type": "application/vnd.ms-cab-compressed", "group": "archive", "description": "Microsoft Cabinet archive data", "extensions": ["cab"], "is_text": false}, "cad": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "cat": {"mime_type": "application/octet-stream", "group": "application", "description": "Windows Catalog file", "extensions": ["cat"], "is_text": false}, "cdf": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "chm": {"mime_type": "application/chm", "group": "application", "description": "MS Windows HtmlHelp Data", "extensions": ["chm"], "is_text": false}, "clojure": {"mime_type": "text/x-clojure", "group": "code", "description": "Clojure", "extensions": ["clj", "cljs", "cljc", "cljr"], "is_text": true}, "cmake": {"mime_type": null, "group": null, "description": null, "extensions": ["cmake"], "is_text": true}, "cobol": {"mime_type": "text/x-cobol", "group": "code", "description": "Cobol", "extensions": ["cbl", "cob", "cpy", "CBL", "COB", "CPY"], "is_text": true}, "coff": {"mime_type": "application/x-coff", "group": "executable", "description": "Intel 80386 COFF", "extensions": ["obj", "o"], "is_text": false}, "coffeescript": {"mime_type": "text/coffeescript", "group": "code", "description": "CoffeeScript", "extensions": ["coffee"], "is_text": true}, "com": {"mime_type": "application/x-dosexec", "group": null, "description": null, "extensions": [], "is_text": false}, "cpl": {"mime_type": "application/x-dosexec", "group": "executable", "description": "PE Windows executable", "extensions": ["cpl"], "is_text": false}, "cpp": {"mime_type": "text/x-c", "group": "code", "description": "C++ source", "extensions": ["cc", "cpp", "cxx", "c++", "cppm", "ixx"], "is_text": true}, "crt": {"mime_type": null, "group": "text", "description": "Certificates (binary format)", "extensions": ["der", "cer", "crt"], "is_text": false}, "crx": {"mime_type": "application/x-chrome-extension", "group": "executable", "description": "Google Chrome extension", "extensions": ["crx"], "is_text": false}, "cs": {"mime_type": "text/plain", "group": "code", "description": "C# source", "extensions": ["cs", "csx"], "is_text": true}, "csproj": {"mime_type": null, "group": null, "description": null, "extensions": ["csproj"], "is_text": true}, "css": {"mime_type": "text/css", "group": "code", "description": "CSS source", "extensions": ["css"], "is_text": true}, "csv": {"mime_type": "text/csv", "group": "code", "description": "CSV document", "extensions": ["csv"], "is_text": true}, "ctl": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "dart": {"mime_type": "text/plain", "group": "code", "description": null, "extensions": ["dart"], "is_text": true}, "deb": {"mime_type": "application/vnd.debian.binary-package", "group": "archive", "description": "Debian binary package", "extensions": ["deb"], "is_text": false}, "dex": {"mime_type": "application/x-android-dex", "group": "executable", "description": "Dalvik dex file", "extensions": ["dex"], "is_text": false}, "dey": {"mime_type": "application/x-android-dey", "group": null, "description": null, "extensions": [], "is_text": false}, "dicom": {"mime_type": "application/dicom", "group": "image", "description": "DICOM", "extensions": ["dcm"], "is_text": false}, "diff": {"mime_type": null, "group": null, "description": null, "extensions": ["diff", "patch"], "is_text": true}, "directory": {"mime_type": "inode/directory", "group": "inode", "description": "A directory", "extensions": [], "is_text": false}, "django": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "dll": {"mime_type": "application/x-dosexec", "group": "executable", "description": "PE Windows executable", "extensions": ["dll"], "is_text": false}, "dm": {"mime_type": null, "group": "text", "description": "Dream Maker", "extensions": ["dm"], "is_text": true}, "dmigd": {"mime_type": null, "group": "text", "description": "Dominion Mods", "extensions": ["dm"], "is_text": true}, "dmg": {"mime_type": "application/x-apple-diskimage", "group": "archive", "description": "Apple disk image", "extensions": ["dmg"], "is_text": false}, "dmscript": {"mime_type": null, "group": "code", "description": "Digital Micrograph Script", "extensions": ["s"], "is_text": true}, "doc": {"mime_type": "application/msword", "group": "document", "description": "Microsoft Word CDF document", "extensions": ["doc"], "is_text": false}, "dockerfile": {"mime_type": "text/x-dockerfile", "group": "code", "description": "Dockerfile", "extensions": [], "is_text": true}, "docx": {"mime_type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "group": "document", "description": "Microsoft Word 2007+ document", "extensions": ["docx", "docm"], "is_text": false}, "dosmbr": {"mime_type": null, "group": null, "description": "Master boot record", "extensions": [], "is_text": false}, "dotx": {"mime_type": null, "group": null, "description": "Office Word 2007 template", "extensions": ["dotx"], "is_text": false}, "dsstore": {"mime_type": "application/octet-stream", "group": "unknown", "description": "Application Desktop Services Store", "extensions": [], "is_text": false}, "dwg": {"mime_type": "image/x-dwg", "group": "image", "description": "Autocad Drawing", "extensions": ["dwg"], "is_text": false}, "dxf": {"mime_type": "image/vnd.dxf", "group": "image", "description": "Audocad Drawing Exchange Format", "extensions": ["dxf"], "is_text": true}, "dylib": {"mime_type": "application/x-mach-o", "group": "executable", "description": "Mach-O executable", "extensions": ["dylib"], "is_text": false}, "ebml": {"mime_type": null, "group": null, "description": "Extensible Binary Meta Language", "extensions": [], "is_text": false}, "elf": {"mime_type": "application/x-executable-elf", "group": "executable", "description": "ELF executable", "extensions": ["elf"], "is_text": false}, "elixir": {"mime_type": "text/plain", "group": null, "description": "Elixir script", "extensions": ["exs"], "is_text": true}, "emf": {"mime_type": "application/octet-stream", "group": "application", "description": "Windows Enhanced Metafile image data", "extensions": ["emf"], "is_text": false}, "eml": {"mime_type": "message/rfc822", "group": "text", "description": "RFC 822 mail", "extensions": ["eml"], "is_text": true}, "empty": {"mime_type": "inode/x-empty", "group": "inode", "description": "Empty file", "extensions": [], "is_text": false}, "epub": {"mime_type": "application/epub+zip", "group": "document", "description": "EPUB document", "extensions": ["epub"], "is_text": false}, "erb": {"mime_type": null, "group": null, "description": null, "extensions": ["erb"], "is_text": true}, "erlang": {"mime_type": "text/x-erlang", "group": "code", "description": null, "extensions": ["erl", "hrl"], "is_text": true}, "ese": {"mime_type": "application/x-ms-ese", "group": null, "description": "ESE Db", "extensions": ["dat"], "is_text": false}, "exe": {"mime_type": "application/x-dosexec", "group": "executable", "description": "PE executable", "extensions": ["exe"], "is_text": false}, "exp": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "flac": {"mime_type": "audio/flac", "group": "audio", "description": "FLAC audio bitstream data", "extensions": ["flac"], "is_text": false}, "flutter": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "flv": {"mime_type": "video/x-flv", "group": "video", "description": "Flash Video", "extensions": ["flv"], "is_text": false}, "fortran": {"mime_type": "text/x-fortran", "group": "document", "description": "Fortran", "extensions": ["f90", "f95", "f03", "F90"], "is_text": true}, "fpx": {"mime_type": null, "group": "image", "description": "Flashpix", "extensions": ["fpx"], "is_text": false}, "gemfile": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "gemspec": {"mime_type": null, "group": null, "description": null, "extensions": ["gemspec"], "is_text": true}, "gif": {"mime_type": "image/gif", "group": "image", "description": "GIF image data", "extensions": ["gif"], "is_text": false}, "gitattributes": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "gitmodules": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "gleam": {"mime_type": null, "group": "code", "description": "Gleam", "extensions": ["gleam"], "is_text": true}, "go": {"mime_type": "text/x-golang", "group": "code", "description": "Golang source", "extensions": ["go"], "is_text": true}, "gpx": {"mime_type": null, "group": null, "description": "XML document", "extensions": ["gpx"], "is_text": false}, "gradle": {"mime_type": null, "group": null, "description": null, "extensions": ["gradle"], "is_text": true}, "groovy": {"mime_type": null, "group": null, "description": null, "extensions": ["groovy"], "is_text": true}, "gzip": {"mime_type": "application/gzip", "group": "archive", "description": "gzip compressed data", "extensions": ["gz", "gzip", "tgz", "tar.gz"], "is_text": false}, "h": {"mime_type": null, "group": null, "description": null, "extensions": ["h"], "is_text": true}, "h5": {"mime_type": "application/x-hdf5", "group": null, "description": "Hierarchical Data Format v5", "extensions": ["h5", "hdf5"], "is_text": false}, "handlebars": {"mime_type": null, "group": null, "description": null, "extensions": ["hbs", "handlebars"], "is_text": true}, "haskell": {"mime_type": "text/plain", "group": null, "description": "Haskell", "extensions": ["hs", "lhs"], "is_text": true}, "hcl": {"mime_type": null, "group": null, "description": "HashiCorp configuration language.", "extensions": ["hcl"], "is_text": true}, "heif": {"mime_type": "image/heic", "group": "image", "description": "High Efficiency Image File", "extensions": ["heif", "heifs", "heic", "heics"], "is_text": false}, "hfs": {"mime_type": "application/x-hfs", "group": null, "description": null, "extensions": ["hfs"], "is_text": false}, "hlp": {"mime_type": "application/winhlp", "group": "application", "description": "MS Windows help", "extensions": ["hlp"], "is_text": false}, "hpp": {"mime_type": "text/x-h", "group": "code", "description": null, "extensions": ["hh", "hpp", "hxx", "h++"], "is_text": true}, "hta": {"mime_type": "application/hta", "group": "code", "description": "HTML Application", "extensions": ["hta"], "is_text": false}, "htaccess": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "html": {"mime_type": "text/html", "group": "code", "description": "HTML document", "extensions": ["html", "htm", "xhtml", "xht"], "is_text": true}, "hve": {"mime_type": null, "group": "unknown", "description": null, "extensions": [], "is_text": false}, "hwp": {"mime_type": "application/x-hwp", "group": "document", "description": "Hangul Word Processor", "extensions": ["hwp"], "is_text": false}, "icc": {"mime_type": "application/vnd.iccprofile", "group": null, "description": "ICC profile", "extensions": ["icc"], "is_text": false}, "icns": {"mime_type": null, "group": null, "description": "Mac OS X icon", "extensions": ["icns"], "is_text": false}, "ico": {"mime_type": "image/vnd.microsoft.icon", "group": "image", "description": "MS Windows icon resource", "extensions": ["ico"], "is_text": false}, "ics": {"mime_type": "text/calendar", "group": null, "description": "Internet Calendaring and Scheduling", "extensions": ["ics"], "is_text": true}, "ignorefile": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "img": {"mime_type": null, "group": null, "description": null, "extensions": ["img"], "is_text": false}, "ini": {"mime_type": "text/plain", "group": "text", "description": "INI configuration file", "extensions": ["ini"], "is_text": true}, "internetshortcut": {"mime_type": "application/x-mswinurl", "group": "application", "description": "MS Windows Internet shortcut", "extensions": ["url"], "is_text": true}, "iosapp": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "ipynb": {"mime_type": null, "group": null, "description": null, "extensions": ["ipynb"], "is_text": true}, "iso": {"mime_type": "application/x-iso9660-image", "group": "archive", "description": "ISO 9660 CD-ROM filesystem data", "extensions": ["iso"], "is_text": false}, "jar": {"mime_type": "application/java-archive", "group": "archive", "description": "Java archive data (JAR)", "extensions": ["jar", "klib"], "is_text": false}, "java": {"mime_type": "text/x-java", "group": "code", "description": "Java source", "extensions": ["java"], "is_text": true}, "javabytecode": {"mime_type": "application/x-java-applet", "group": "executable", "description": "Java compiled bytecode", "extensions": ["class"], "is_text": false}, "javascript": {"mime_type": "application/javascript", "group": "code", "description": "JavaScript source", "extensions": ["js", "mjs", "cjs"], "is_text": true}, "jinja": {"mime_type": null, "group": null, "description": "Jinja Template", "extensions": ["jinja", "jinja2", "j2"], "is_text": true}, "jng": {"mime_type": "image/jng", "group": "image", "description": "JPEG network graphics", "extensions": ["jng"], "is_text": false}, "jnlp": {"mime_type": "application/x-java-jnlp-file", "group": "code", "description": "Java Network Launch Protocol", "extensions": ["jnlp"], "is_text": true}, "jp2": {"mime_type": "image/jpeg2000", "group": "image", "description": "jpeg2000", "extensions": ["jp2"], "is_text": false}, "jpeg": {"mime_type": "image/jpeg", "group": "image", "description": "JPEG image data", "extensions": ["jpg", "jpeg"], "is_text": false}, "json": {"mime_type": "application/json", "group": "code", "description": "JSON document", "extensions": ["json"], "is_text": true}, "jsonc": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "jsonl": {"mime_type": null, "group": null, "description": null, "extensions": ["jsonl", "jsonld"], "is_text": true}, "jsx": {"mime_type": null, "group": null, "description": null, "extensions": ["jsx", "mjsx", "cjsx"], "is_text": true}, "julia": {"mime_type": "text/x-julia", "group": "document", "description": "Julia", "extensions": ["jl"], "is_text": true}, "jxl": {"mime_type": "image/jxl", "group": "image", "description": "JPEG XL", "extensions": ["jxl"], "is_text": false}, "ko": {"mime_type": "application/x-executable-elf", "group": "executable", "description": "ELF executable, kernel object", "extensions": ["ko"], "is_text": false}, "kotlin": {"mime_type": "text/plain", "group": "code", "description": "Kotlin", "extensions": ["kt", "kts"], "is_text": true}, "ks": {"mime_type": null, "group": null, "description": "Tyrano", "extensions": ["ks"], "is_text": true}, "latex": {"mime_type": "text/x-tex", "group": "text", "description": "LaTeX document", "extensions": ["tex", "sty"], "is_text": true}, "latexaux": {"mime_type": null, "group": null, "description": null, "extensions": ["aux"], "is_text": false}, "less": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "lha": {"mime_type": "application/x-lha", "group": null, "description": "LHarc", "extensions": ["lha", "lzh"], "is_text": false}, "license": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": true}, "lisp": {"mime_type": "text/x-lisp", "group": "code", "description": "Lisp source", "extensions": ["lisp", "lsp", "l", "cl"], "is_text": true}, "litcs": {"mime_type": null, "group": null, "description": "Literate CS", "extensions": ["litcoffee"], "is_text": false}, "lnk": {"mime_type": "application/x-ms-shortcut", "group": "application", "description": "MS Windows shortcut", "extensions": ["lnk"], "is_text": false}, "lock": {"mime_type": null, "group": null, "description": null, "extensions": ["lock"], "is_text": true}, "lrz": {"mime_type": "application/x-lrzip", "group": null, "description": "LRZip", "extensions": ["lrz"], "is_text": false}, "lua": {"mime_type": "text/plain", "group": "text", "description": "Lua", "extensions": ["lua"], "is_text": true}, "lz": {"mime_type": "application/x-lzip", "group": "archive", "description": "LZip", "extensions": ["lz"], "is_text": false}, "lz4": {"mime_type": "application/x-lz4", "group": "archive", "description": "LZ4", "extensions": ["lz4"], "is_text": false}, "lzx": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "m3u": {"mime_type": "text/plain", "group": "application", "description": "M3U playlist", "extensions": ["m3u8", "m3u"], "is_text": true}, "m4": {"mime_type": "text/plain", "group": "code", "description": "GNU Macro", "extensions": ["m4"], "is_text": true}, "macho": {"mime_type": "application/x-mach-o", "group": "executable", "description": "Mach-O executable", "extensions": [], "is_text": false}, "maff": {"mime_type": "application/x-maff", "group": null, "description": null, "extensions": ["maff"], "is_text": false}, "makefile": {"mime_type": "text/x-makefile", "group": "code", "description": "Makefile source", "extensions": [], "is_text": true}, "markdown": {"mime_type": "text/markdown", "group": "text", "description": "Markdown document", "extensions": ["md", "markdown"], "is_text": true}, "matlab": {"mime_type": null, "group": null, "description": "Matlab Source", "extensions": ["m", "matlab"], "is_text": true}, "mht": {"mime_type": "application/x-mimearchive", "group": "code", "description": "MHTML document", "extensions": ["mht"], "is_text": true}, "midi": {"mime_type": "audio/midi", "group": "audio", "description": "Midi", "extensions": ["mid"], "is_text": false}, "mkv": {"mime_type": "video/x-matroska", "group": "video", "description": "Matroska", "extensions": ["mkv"], "is_text": false}, "mp2": {"mime_type": null, "group": null, "description": "MP2 stream", "extensions": ["mp2"], "is_text": false}, "mp3": {"mime_type": "audio/mpeg", "group": "audio", "description": "MP3 media file", "extensions": ["mp3"], "is_text": false}, "mp4": {"mime_type": "video/mp4", "group": "video", "description": "MP4 media file", "extensions": ["mp4"], "is_text": false}, "mpegts": {"mime_type": "video/MP2T", "group": "video", "description": "MPEG Transport stream", "extensions": ["ts", "tsv", "tsa", "m2t"], "is_text": false}, "mscompress": {"mime_type": "application/x-ms-compress-szdd", "group": "archive", "description": "MS Compress archive data", "extensions": [], "is_text": false}, "msi": {"mime_type": "application/x-msi", "group": "archive", "description": "Microsoft Installer file", "extensions": ["msi"], "is_text": false}, "msix": {"mime_type": null, "group": null, "description": "Windows app package", "extensions": ["msix"], "is_text": false}, "mst": {"mime_type": null, "group": null, "description": null, "extensions": ["mst"], "is_text": false}, "mui": {"mime_type": "application/x-dosexec", "group": "application", "description": "PE Windows executable", "extensions": ["mui"], "is_text": false}, "mum": {"mime_type": "text/xml", "group": "application", "description": "Windows Update Package file", "extensions": ["mum"], "is_text": true}, "mun": {"mime_type": null, "group": null, "description": null, "extensions": ["mun"], "is_text": false}, "nim": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "npy": {"mime_type": null, "group": null, "description": "Numpy Array", "extensions": ["npy"], "is_text": false}, "npz": {"mime_type": null, "group": null, "description": "Numpy Arrays Archive", "extensions": ["npz"], "is_text": false}, "null": {"mime_type": null, "group": null, "description": null, "extensions": ["null"], "is_text": false}, "nupkg": {"mime_type": null, "group": null, "description": "NuGet Package", "extensions": ["nupkg"], "is_text": false}, "object": {"mime_type": null, "group": null, "description": null, "extensions": ["o"], "is_text": false}, "objectivec": {"mime_type": "text/x-objcsrc", "group": null, "description": "ObjectiveC", "extensions": ["m", "mm"], "is_text": true}, "ocaml": {"mime_type": "text-ocaml", "group": "text", "description": "OCaml", "extensions": ["ml", "mli"], "is_text": true}, "ocx": {"mime_type": "application/x-dosexec", "group": "executable", "description": "PE Windows executable", "extensions": ["ocx"], "is_text": false}, "odex": {"mime_type": "application/x-executable-elf", "group": "executable", "description": "ODEX ELF executable", "extensions": ["odex"], "is_text": false}, "odin": {"mime_type": null, "group": "code", "description": "Odin", "extensions": ["odin"], "is_text": true}, "odp": {"mime_type": "application/vnd.oasis.opendocument.presentation", "group": "document", "description": "OpenDocument Presentation", "extensions": ["odp"], "is_text": false}, "ods": {"mime_type": "application/vnd.oasis.opendocument.spreadsheet", "group": "document", "description": "OpenDocument Spreadsheet", "extensions": ["ods"], "is_text": false}, "odt": {"mime_type": "application/vnd.oasis.opendocument.text", "group": "document", "description": "OpenDocument Text", "extensions": ["odt"], "is_text": false}, "ogg": {"mime_type": "audio/ogg", "group": "audio", "description": "Ogg data", "extensions": ["ogg"], "is_text": false}, "ole": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "one": {"mime_type": "application/msonenote", "group": "document", "description": "One Note", "extensions": ["one"], "is_text": false}, "onnx": {"mime_type": null, "group": null, "description": "Open Neural Network Exchange", "extensions": ["onnx"], "is_text": false}, "ooxml": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "otf": {"mime_type": "font/otf", "group": "font", "description": "OpenType font", "extensions": ["otf"], "is_text": false}, "outlook": {"mime_type": "application/vnd.ms-outlook", "group": "application", "description": "MS Outlook Message", "extensions": [], "is_text": false}, "palmos": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "parquet": {"mime_type": "application/vnd.apache.parquet", "group": "unknown", "description": "Apache Parquet", "extensions": ["pqt", "parquet"], "is_text": false}, "pascal": {"mime_type": "text/x-pascal", "group": "code", "description": null, "extensions": ["pas", "pp"], "is_text": true}, "pbm": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "pcap": {"mime_type": "application/vnd.tcpdump.pcap", "group": "application", "description": "pcap capture file", "extensions": ["pcap", "pcapng"], "is_text": false}, "pdb": {"mime_type": null, "group": null, "description": "Windows Program Database", "extensions": ["pdb"], "is_text": false}, "pdf": {"mime_type": "application/pdf", "group": "document", "description": "PDF document", "extensions": ["pdf"], "is_text": false}, "pebin": {"mime_type": "application/x-dosexec", "group": "executable", "description": "PE Windows executable", "extensions": ["exe", "dll"], "is_text": false}, "pem": {"mime_type": "application/x-pem-file", "group": "application", "description": "PEM certificate", "extensions": ["pem", "pub", "gpg"], "is_text": true}, "perl": {"mime_type": "text/x-perl", "group": "code", "description": "Perl source", "extensions": ["pl"], "is_text": true}, "pgp": {"mime_type": "application/pgp-keys", "group": null, "description": "PGP", "extensions": ["gpg", "pgp"], "is_text": false}, "php": {"mime_type": "text/x-php", "group": "code", "description": "PHP source", "extensions": ["php"], "is_text": true}, "pickle": {"mime_type": null, "group": null, "description": "Python pickle", "extensions": ["pickle", "pkl"], "is_text": false}, "png": {"mime_type": "image/png", "group": "image", "description": "PNG image", "extensions": ["png"], "is_text": false}, "po": {"mime_type": null, "group": null, "description": "Portable Object (PO) for i18n", "extensions": ["po"], "is_text": true}, "postscript": {"mime_type": "application/postscript", "group": "document", "description": "PostScript document", "extensions": ["ps"], "is_text": false}, "powershell": {"mime_type": "application/x-powershell", "group": "code", "description": "Powershell source", "extensions": ["ps1"], "is_text": true}, "ppt": {"mime_type": "application/vnd.ms-powerpoint", "group": "document", "description": "Microsoft PowerPoint CDF document", "extensions": ["ppt"], "is_text": false}, "pptx": {"mime_type": "application/vnd.openxmlformats-officedocument.presentationml.presentation", "group": "document", "description": "Microsoft PowerPoint 2007+ document", "extensions": ["pptx", "pptm"], "is_text": false}, "printfox": {"mime_type": null, "group": null, "description": "c64", "extensions": [], "is_text": false}, "prolog": {"mime_type": "text/x-prolog", "group": "code", "description": null, "extensions": ["pl", "pro", "P"], "is_text": true}, "proteindb": {"mime_type": null, "group": null, "description": "Protein DB", "extensions": ["pdb"], "is_text": true}, "proto": {"mime_type": null, "group": null, "description": null, "extensions": ["proto"], "is_text": true}, "protobuf": {"mime_type": "application/protobuf", "group": "unknown", "description": "Protocol buffers", "extensions": ["protobuf", "pb"], "is_text": false}, "psd": {"mime_type": "image/vnd.adobe.photoshop", "group": "image", "description": "Adobe Photoshop", "extensions": ["psd"], "is_text": false}, "pytorch": {"mime_type": null, "group": null, "description": "Pytorch storage file", "extensions": ["pt", "pth"], "is_text": false}, "pub": {"mime_type": "application/x-mspublisher", "group": null, "description": null, "extensions": ["pub"], "is_text": false}, "python": {"mime_type": "text/x-python", "group": "code", "description": "Python source", "extensions": ["py", "pyi"], "is_text": true}, "pythonbytecode": {"mime_type": "application/x-bytecode.python", "group": "executable", "description": "Python compiled bytecode", "extensions": ["pyc", "pyo"], "is_text": false}, "pythonpar": {"mime_type": null, "group": null, "description": null, "extensions": ["par"], "is_text": false}, "qoi": {"mime_type": "image/x-qoi", "group": "image", "description": "Quite Ok Image", "extensions": ["qoi"], "is_text": false}, "qt": {"mime_type": "video/quicktime", "group": "video", "description": "QuickTime", "extensions": ["mov"], "is_text": false}, "r": {"mime_type": "text/x-R", "group": "code", "description": "R (language)", "extensions": ["R"], "is_text": true}, "randomascii": {"mime_type": "text/plain", "group": "text", "description": "Random ASCII characters", "extensions": [], "is_text": true}, "randombytes": {"mime_type": "application/octet-stream", "group": "unknown", "description": "Random bytes", "extensions": [], "is_text": false}, "rar": {"mime_type": "application/x-rar", "group": "archive", "description": "RAR archive data", "extensions": ["rar"], "is_text": false}, "rdf": {"mime_type": "application/rdf+xml", "group": "text", "description": "Resource Description Framework document (RDF)", "extensions": ["rdf"], "is_text": true}, "riff": {"mime_type": "application/x-riff", "group": null, "description": null, "extensions": [], "is_text": false}, "rlib": {"mime_type": "application/x-archive", "group": "archive", "description": "rust library", "extensions": ["rlib"], "is_text": false}, "rll": {"mime_type": null, "group": "executable", "description": "Resource Library", "extensions": ["rll"], "is_text": false}, "rpm": {"mime_type": "application/x-rpm", "group": "archive", "description": "RedHat Package Manager archive (RPM)", "extensions": ["rpm"], "is_text": false}, "rst": {"mime_type": "text/x-rst", "group": "text", "description": "ReStructuredText document", "extensions": ["rst"], "is_text": true}, "rtf": {"mime_type": "text/rtf", "group": "text", "description": "Rich Text Format document", "extensions": ["rtf"], "is_text": true}, "ruby": {"mime_type": "application/x-ruby", "group": "code", "description": "Ruby source", "extensions": ["rb"], "is_text": true}, "rust": {"mime_type": "application/x-rust", "group": "code", "description": "Rust source", "extensions": ["rs"], "is_text": true}, "rzip": {"mime_type": null, "group": null, "description": "Rzip", "extensions": ["rz"], "is_text": false}, "scala": {"mime_type": "application/x-scala", "group": "code", "description": "Scala source", "extensions": ["scala"], "is_text": true}, "scheme": {"mime_type": "text/x-scheme", "group": "code", "description": null, "extensions": ["scm", "ss"], "is_text": false}, "scr": {"mime_type": "application/x-dosexec", "group": "executable", "description": "PE Windows executable", "extensions": ["scr"], "is_text": false}, "scriptwsf": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "scss": {"mime_type": null, "group": null, "description": null, "extensions": ["scss"], "is_text": true}, "sevenzip": {"mime_type": "application/x-7z-compressed", "group": "archive", "description": "7-zip archive data", "extensions": ["7z"], "is_text": false}, "sgml": {"mime_type": "application/sgml", "group": "text", "description": "sgml", "extensions": ["sgml"], "is_text": true}, "sh3d": {"mime_type": null, "group": null, "description": null, "extensions": ["sh3d"], "is_text": false}, "shell": {"mime_type": "text/x-shellscript", "group": "code", "description": "Shell script", "extensions": ["sh"], "is_text": true}, "smali": {"mime_type": "application/x-smali", "group": "code", "description": "Smali source", "extensions": ["smali"], "is_text": true}, "snap": {"mime_type": null, "group": null, "description": null, "extensions": ["snap"], "is_text": false}, "so": {"mime_type": "application/x-executable-elf", "group": "executable", "description": "ELF executable, shared library", "extensions": ["so"], "is_text": false}, "solidity": {"mime_type": null, "group": null, "description": null, "extensions": ["sol"], "is_text": true}, "sql": {"mime_type": "application/x-sql", "group": "code", "description": "SQL source", "extensions": ["sql"], "is_text": true}, "sqlite": {"mime_type": null, "group": null, "description": "SQLITE database", "extensions": ["sqlite", "sqlite3"], "is_text": false}, "squashfs": {"mime_type": "application/octet-stream", "group": "archive", "description": "Squash filesystem", "extensions": [], "is_text": false}, "srt": {"mime_type": null, "group": null, "description": "SubRip Text Format", "extensions": ["srt"], "is_text": true}, "stlbinary": {"mime_type": "application/sla", "group": "image", "description": "Stereolithography CAD (binary)", "extensions": ["stl"], "is_text": false}, "stltext": {"mime_type": "application/sla", "group": "image", "description": "Stereolithography CAD (text)", "extensions": ["stl"], "is_text": true}, "sum": {"mime_type": null, "group": null, "description": null, "extensions": ["sum"], "is_text": true}, "svd": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "svg": {"mime_type": "image/svg+xml", "group": "image", "description": "SVG Scalable Vector Graphics image data", "extensions": ["svg"], "is_text": true}, "swf": {"mime_type": "application/x-shockwave-flash", "group": "executable", "description": "Small Web File", "extensions": ["swf"], "is_text": false}, "swift": {"mime_type": "text/x-swift", "group": "code", "description": "Swift", "extensions": ["swift"], "is_text": true}, "symlink": {"mime_type": "inode/symlink", "group": "inode", "description": "Symbolic link", "extensions": [], "is_text": false}, "symlinktext": {"mime_type": "text/plain", "group": "application", "description": "Symbolic link (textual representation)", "extensions": [], "is_text": true}, "sys": {"mime_type": "application/x-windows-driver", "group": "executable", "description": "PE Windows executable", "extensions": ["sys"], "is_text": false}, "tar": {"mime_type": "application/x-tar", "group": "archive", "description": "POSIX tar archive", "extensions": ["tar"], "is_text": false}, "tcl": {"mime_type": "application/x-tcl", "group": "text", "description": "Tickle", "extensions": ["tcl"], "is_text": true}, "textproto": {"mime_type": null, "group": null, "description": null, "extensions": ["textproto", "textpb", "pbtxt"], "is_text": true}, "tga": {"mime_type": "image/x-tga", "group": "image", "description": "Targa image data", "extensions": ["tga"], "is_text": false}, "thumbsdb": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "tiff": {"mime_type": "image/tiff", "group": "image", "description": "TIFF image data", "extensions": ["tiff", "tif"], "is_text": false}, "tmdx": {"mime_type": null, "group": null, "description": null, "extensions": ["tmdx", "tmvx"], "is_text": false}, "toml": {"mime_type": null, "group": "text", "description": null, "extensions": ["toml"], "is_text": true}, "torrent": {"mime_type": "application/x-bittorrent", "group": "application", "description": "BitTorrent file", "extensions": ["torrent"], "is_text": false}, "troff": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "tsv": {"mime_type": "text/tsv", "group": "code", "description": "TSV document", "extensions": ["tsv"], "is_text": true}, "tsx": {"mime_type": null, "group": null, "description": null, "extensions": ["tsx", "mtsx", "ctsx"], "is_text": true}, "ttf": {"mime_type": "font/sfnt", "group": "font", "description": "TrueType Font data", "extensions": ["ttf", "ttc"], "is_text": false}, "twig": {"mime_type": null, "group": null, "description": null, "extensions": ["twig"], "is_text": true}, "txt": {"mime_type": "text/plain", "group": "text", "description": "Generic text document", "extensions": ["txt"], "is_text": true}, "txtascii": {"mime_type": "text/plain", "group": "text", "description": "Generic text document encoded in ASCII", "extensions": ["txt"], "is_text": true}, "txtutf16": {"mime_type": "text/plain", "group": "text", "description": "Generic text document encoded in UTF-16", "extensions": ["txt"], "is_text": true}, "txtutf8": {"mime_type": "text/plain", "group": "text", "description": "Generic text document encoded in UTF-8", "extensions": ["txt"], "is_text": true}, "typescript": {"mime_type": "application/typescript", "group": "text", "description": "Typescript", "extensions": ["ts", "mts", "cts"], "is_text": true}, "udf": {"mime_type": "application/x-udf-image", "group": null, "description": "Universal Disc Format", "extensions": [], "is_text": false}, "undefined": {"mime_type": "application/undefined", "group": "undefined", "description": "Undefined", "extensions": [], "is_text": false}, "unixcompress": {"mime_type": "application/x-compress", "group": null, "description": null, "extensions": ["z"], "is_text": false}, "unknown": {"mime_type": "application/octet-stream", "group": "unknown", "description": "Unknown binary data", "extensions": [], "is_text": false}, "vba": {"mime_type": "text/vbscript", "group": "code", "description": "MS Visual Basic source (VBA)", "extensions": ["vbs", "vba", "vb"], "is_text": true}, "vbe": {"mime_type": null, "group": "code", "description": "EncryptedVBS", "extensions": ["vbe"], "is_text": false}, "vcard": {"mime_type": null, "group": null, "description": null, "extensions": ["vcard"], "is_text": false}, "vcs": {"mime_type": null, "group": null, "description": null, "extensions": [], "is_text": false}, "vcxproj": {"mime_type": null, "group": null, "description": null, "extensions": ["vcxproj"], "is_text": true}, "verilog": {"mime_type": null, "group": "code", "description": null, "extensions": ["v", "verilog", "vlg", "vh"], "is_text": true}, "vhd": {"mime_type": "application/x-vhd", "group": null, "description": "Virtual Hard Disk", "extensions": [], "is_text": false}, "vhdl": {"mime_type": null, "group": null, "description": "VHDL", "extensions": ["vhd"], "is_text": true}, "visio": {"mime_type": "application/vnd.ms-visio.drawing.main+xml", "group": "document", "description": "Microsoft Visio", "extensions": ["vsd", "vsdm", "vsdx", "vdw"], "is_text": false}, "vtt": {"mime_type": null, "group": null, "description": "Web Video Text Tracks", "extensions": ["vtt", "webvtt"], "is_text": true}, "vue": {"mime_type": null, "group": null, "description": null, "extensions": ["vue"], "is_text": true}, "wad": {"mime_type": "application/wad", "group": "archive", "description": "WAD", "extensions": ["wad"], "is_text": false}, "wasm": {"mime_type": "application/wasm", "group": "executable", "description": "Web Assembly", "extensions": ["wasm"], "is_text": false}, "wav": {"mime_type": "audio/x-wav", "group": "audio", "description": "Waveform Audio file (WAV)", "extensions": ["wav"], "is_text": false}, "webm": {"mime_type": "video/webm", "group": "video", "description": "WebM", "extensions": ["webm"], "is_text": false}, "webp": {"mime_type": "image/webp", "group": "image", "description": "WebP", "extensions": ["webp"], "is_text": false}, "wim": {"mime_type": "application/x-ms-wim", "group": "unknown", "description": "Windows Imaging Format", "extensions": ["wim", "swm", "esd"], "is_text": false}, "winregistry": {"mime_type": "text/x-ms-regedit", "group": "application", "description": "Windows Registry text", "extensions": ["reg"], "is_text": true}, "wma": {"mime_type": "audio/x-ms-wma", "group": "audio", "description": "Windows Media Audio", "extensions": ["wma"], "is_text": false}, "wmf": {"mime_type": "image/wmf", "group": "image", "description": "Windows metafile", "extensions": ["wmf"], "is_text": false}, "wmv": {"mime_type": "video/x-ms-wmv", "group": "video", "description": "Windows Media Video", "extensions": ["wmv"], "is_text": false}, "woff": {"mime_type": "font/woff", "group": "font", "description": "Web Open Font Format", "extensions": ["woff"], "is_text": false}, "woff2": {"mime_type": "font/woff2", "group": "font", "description": "Web Open Font Format v2", "extensions": ["woff2"], "is_text": false}, "xar": {"mime_type": "application/x-xar", "group": "archive", "description": "XAR archive compressed data", "extensions": ["pkg", "xar"], "is_text": false}, "xcf": {"mime_type": "image/x-xcf", "group": "image", "description": "Gimp image", "extensions": ["xcf"], "is_text": false}, "xls": {"mime_type": "application/vnd.ms-excel", "group": "document", "description": "Microsoft Excel CDF document", "extensions": ["xls"], "is_text": false}, "xlsb": {"mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "group": "document", "description": "Microsoft Excel 2007+ document (binary format)", "extensions": ["xlsb"], "is_text": false}, "xlsx": {"mime_type": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "group": "document", "description": "Microsoft Excel 2007+ document", "extensions": ["xlsx", "xlsm"], "is_text": false}, "xml": {"mime_type": "text/xml", "group": "code", "description": "XML document", "extensions": ["xml"], "is_text": true}, "xpi": {"mime_type": "application/zip", "group": "archive", "description": "Compressed installation archive (XPI)", "extensions": ["xpi"], "is_text": false}, "xsd": {"mime_type": null, "group": null, "description": null, "extensions": ["xsd"], "is_text": false}, "xz": {"mime_type": "application/x-xz", "group": "archive", "description": "XZ compressed data", "extensions": ["xz"], "is_text": false}, "yaml": {"mime_type": "application/x-yaml", "group": "code", "description": "YAML source", "extensions": ["yml", "yaml"], "is_text": true}, "yara": {"mime_type": null, "group": null, "description": null, "extensions": ["yar", "yara"], "is_text": true}, "zig": {"mime_type": "text/zig", "group": "code", "description": "Zig source", "extensions": ["zig"], "is_text": true}, "zip": {"mime_type": "application/zip", "group": "archive", "description": "Zip archive data", "extensions": ["zip"], "is_text": false}, "zlibstream": {"mime_type": "application/zlib", "group": "application", "description": "zlib compressed data", "extensions": [], "is_text": false}, "zst": {"mime_type": "application/zstd", "group": "archive", "description": "Zstandard", "extensions": ["zst"], "is_text": false}} \ No newline at end of file diff --git a/python/magika/config/magika_config.json b/python/magika/config/magika_config.json deleted file mode 100644 index cb2035e1..00000000 --- a/python/magika/config/magika_config.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "default_model_name": "standard_v1", - "medium_confidence_threshold": 0.5, - "min_file_size_for_dl": 16, - "padding_token": 256, - "block_size": 4096 -} \ No newline at end of file diff --git a/python/magika/content_types.py b/python/magika/content_types.py deleted file mode 100644 index 312fd5d0..00000000 --- a/python/magika/content_types.py +++ /dev/null @@ -1,441 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import annotations - -import json -import sys -from collections import defaultdict -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional, Set - -CONTENT_TYPES_CONFIG_PATH = ( - Path(__file__).parent / "config" / "content_types_config.json" -) - - -""" -This module defines ContentType, an abstraction for a content type and its -associated metadata, and ContentTypesManager, a class that wraps a number of -utility functions related to contentt types. Note that the ContentTypesManager -is extensively used by the training framework, and not just by the magika python -module. -""" - - -class ContentType: - # the tool returned unknown, '', None, or similar - UNKNOWN = "unknown" - UNKNOWN_MIME_TYPE = "application/unknown" - UNKNOWN_CONTENT_TYPE_GROUP = "unknown" - UNKNOWN_MAGIC = "Unknown" - UNKNOWN_DESCRIPTION = "Unknown type" - - # the tool returned an output that we currently do not map to our content types - UNSUPPORTED = "unsupported" - - # the tool exited with returncode != 0 - ERROR = "error" - - # there is no result for this tool - MISSING = "missing" - - # the file is empty (or just \x00, spaces, etc.) - EMPTY = "empty" - - # the output of the tool is gibberish / meaningless type - CORRUPTED = "corrupted" - - # the tool did not return in time - TIMEOUT = "timeout" - - # the mapping functions returned a type we don't recognized, and we flag it - # as NOT VALID - NOT_VALID = "not_valid" - - # Used when a file path does not exist - FILE_DOES_NOT_EXIST = "file_does_not_exist" - - # Used when a file path exists, but there are permission issues, e.g., can't - # read file - PERMISSION_ERROR = "permission_error" - - # more special labels - DIRECTORY = "directory" - SYMLINK = "symlink" - - GENERIC_TEXT = "txt" - - def __init__( - self, - name: str, - extensions: List[str], - mime_type: Optional[str], - group: Optional[str], - magic: Optional[str], - description: Optional[str], - vt_type: Optional[str], - datasets: List[str], - parent: Optional[str], - tags: List[str], - model_target_label: Optional[str], - target_label: Optional[str], - correct_labels: List[str], - in_scope_for_output_content_type: bool, - add_automatic_tags: bool = True, - ): - self.name = name - self.extensions = extensions - self.mime_type = mime_type - self.group = group - self.magic = magic - self.description = description - self.vt_type = vt_type - self.datasets = datasets - self.parent = parent - self.tags = tags - self.model_target_label = model_target_label - self.target_label = target_label - self.correct_labels = correct_labels - self.in_scope_for_output_content_type = in_scope_for_output_content_type - - # add automatic tags based on dataset - if add_automatic_tags: - if self.datasets is not None: - for dataset in self.datasets: - self.tags.append(f"dataset:{dataset}") - if self.model_target_label is not None: - self.tags.append(f"model_target_label:{self.model_target_label}") - if self.target_label is not None: - self.tags.append(f"target_label:{self.target_label}") - if self.correct_labels is not None: - for cl in self.correct_labels: - self.tags.append(f"correct_label:{cl}") - - @property - def is_text(self) -> bool: - return "text" in self.tags - - @property - def in_scope_for_training(self) -> bool: - if len(self.datasets) == 0: - return False - if self.model_target_label is None: - return False - if self.target_label is None: - return False - if len(self.correct_labels) == 0: - return False - return True - - def to_dict(self) -> Dict[str, Any]: - info: Dict[str, Any] = { - "name": self.name, - "extensions": self.extensions, - "mime_type": self.mime_type, - "group": self.group, - "magic": self.magic, - "description": self.description, - "vt_type": self.vt_type, - "datasets": self.datasets, - "parent": self.parent, - "tags": self.tags, - "model_target_label": self.model_target_label, - "target_label": self.target_label, - "correct_labels": self.correct_labels, - "in_scope_for_output_content_type": self.in_scope_for_output_content_type, - "in_scope_for_training": self.in_scope_for_training, - } - return info - - @staticmethod - def from_dict(info_d: Dict, add_automatic_tags: bool = True) -> ContentType: - info_d_copy = dict(info_d) - info_d_copy.pop("in_scope_for_training") - ct = ContentType(add_automatic_tags=add_automatic_tags, **info_d_copy) - return ct - - def __str__(self) -> str: - return f"<{self.name}>" - - def __repr__(self) -> str: - return str(self) - - -class ContentTypesManager: - SPECIAL_CONTENT_TYPES: List[str] = [ - ContentType.UNKNOWN, - ContentType.UNSUPPORTED, - ContentType.ERROR, - ContentType.MISSING, - ContentType.EMPTY, - ContentType.CORRUPTED, - ContentType.NOT_VALID, - ContentType.PERMISSION_ERROR, - ContentType.GENERIC_TEXT, - ] - - SUPPORTED_TARGET_LABELS_SPEC = [ - "content-type", - "model-target-label", - "target-label", - ] - - def __init__( - self, - content_type_config_path: Path = CONTENT_TYPES_CONFIG_PATH, - add_automatic_tags: bool = True, - ): - self.cts: Dict[str, ContentType] = {} - # tag to content type map - self.tag2cts: Dict[str, List[ContentType]] = defaultdict(list) - # map from extension to content types - self.ext2cts: Dict[str, List[ContentType]] = defaultdict(list) - self.load_content_types_info( - content_type_config_path=content_type_config_path, - add_automatic_tags=add_automatic_tags, - ) - - def load_content_types_info( - self, content_type_config_path: Path, add_automatic_tags: bool = True - ) -> None: - with open(content_type_config_path) as f: - info = json.load(f) - self.cts = {} - for k, v in info.items(): - assert k == v["name"] - ct = ContentType.from_dict(v, add_automatic_tags=add_automatic_tags) - self.cts[k] = ct - for tag in ct.tags: - self.tag2cts[tag].append(ct) - for ext in ct.extensions: - self.ext2cts[ext].append(ct) - - def get(self, content_type_name: str) -> Optional[ContentType]: - return self.cts.get(content_type_name) - - def get_or_raise(self, content_type_name: Optional[str]) -> ContentType: - if content_type_name is None: - raise Exception("Input content_type_name is None") - ct = self.get(content_type_name) - if ct is None: - raise Exception(f'Could not get a ContentType for "{content_type_name}"') - return ct - - def get_mime_type( - self, content_type_name: str, default: str = ContentType.UNKNOWN_MIME_TYPE - ) -> str: - ct = self.get(content_type_name) - if ct is None: - return default - if ct.mime_type is None: - return default - return ct.mime_type - - def get_group( - self, - content_type_name: str, - default: str = ContentType.UNKNOWN_CONTENT_TYPE_GROUP, - ) -> str: - ct = self.get(content_type_name) - if ct is None: - return default - if ct.group is None: - return default - return ct.group - - def get_magic( - self, - content_type_name: str, - default: str = ContentType.UNKNOWN_MAGIC, - fallback_to_label: bool = True, - ) -> str: - ct = self.get(content_type_name) - if ct is None or ct.magic is None: - if fallback_to_label: - return content_type_name - else: - return default - return ct.magic - - def get_description( - self, - content_type_name: str, - default: str = ContentType.UNKNOWN_DESCRIPTION, - fallback_to_label: bool = True, - ) -> str: - ct = self.get(content_type_name) - if ct is None or ct.description is None: - if fallback_to_label: - return content_type_name - else: - return default - return ct.description - - def get_is_text( - self, - content_type_name: str, - default: bool = False, - ) -> bool: - ct = self.get(content_type_name) - if ct is None: - return default - else: - return ct.is_text - - def get_cts_by_ext(self, ext: str) -> List[ContentType]: - return self.ext2cts.get(ext, list()) - - def get_cts_by_ext_or_raise(self, ext: str) -> List[ContentType]: - cts = self.get_cts_by_ext(ext) - if len(cts) == 0: - raise Exception(f'Could not find ContentType for extension "{ext}"') - return cts - - def get_valid_tags(self, only_explicit: bool = True) -> List[str]: - if only_explicit: - all_tags = sorted( - filter( - lambda x: ( - not x.split(":")[0].endswith("_label") - and not x.startswith("dataset") - ), - self.tag2cts.keys(), - ) - ) - else: - all_tags = sorted(self.tag2cts.keys()) - return all_tags - - def is_valid_ct_label(self, label: str) -> bool: - if self.get(label) is not None: - return True - if label in ContentTypesManager.SPECIAL_CONTENT_TYPES: - return True - return False - - def is_valid_tag(self, tag: str) -> bool: - return tag in self.tag2cts.keys() - - def select( - self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True - ) -> List[ContentType]: - ct_names = self.select_names( - query=query, must_be_in_scope_for_training=must_be_in_scope_for_training - ) - # we know these are valid content types - return list(map(self.get_or_raise, ct_names)) - - def select_names( - self, query: Optional[str] = None, must_be_in_scope_for_training: bool = True - ) -> List[str]: - ct_names_set: Set[str] = set() - if query is None: - # select them all, honoring must_be_in_scope_for_training - for ct in self.cts.values(): - if must_be_in_scope_for_training and not ct.in_scope_for_training: - continue - ct_names_set.add(ct.name) - else: - # consider each element of the query in sequence and add/remove - # content types as appropriate (also honoring - # must_be_in_scope_for_training) - entries = query.split(",") - for entry in entries: - if entry in ["*", "all"]: - # we know we get list of strings because we set only_names=True - ct_names_set.update( - self.select_names( - must_be_in_scope_for_training=must_be_in_scope_for_training - ) - ) - elif entry.startswith("tag:"): - entry = entry[4:] - if not self.is_valid_tag(entry): - print( - f'ERROR: "entry" is not a valid tag. Valid tags: {sorted(self.tag2cts.keys())}.' - ) - sys.exit(1) - for ct in self.tag2cts[entry]: - if ( - must_be_in_scope_for_training - and not ct.in_scope_for_training - ): - continue - ct_names_set.add(ct.name) - elif entry.startswith("-tag:"): - entry = entry[5:] - assert self.is_valid_tag(entry) - for ct in self.tag2cts[entry]: - # no need to check for must_be_in_scope_for_training when removing - if ct.name in ct_names_set: - ct_names_set.remove(ct.name) - elif entry[0] == "-": - entry = entry[1:] - assert self.is_valid_ct_label(entry) - # no need to check for must_be_in_scope_for_training when removing - if entry in ct_names_set: - ct_names_set.remove(entry) - else: - assert self.is_valid_ct_label(entry) - # this ct was manually specified, if it does not honor - # must_be_in_scope_for_training, that's a problem. - if must_be_in_scope_for_training: - candidate_ct: ContentType | None = self.get(entry) - assert candidate_ct is not None - assert candidate_ct.in_scope_for_training - ct_names_set.add(entry) - - ct_names = sorted(ct_names_set) - return ct_names - - def get_content_types_space(self) -> List[str]: - """Returns the full list of possible content types, including out of - scope and special types. Returns only the names.""" - - # We know that we get content type names (str), and not a list of - # ContentType - return sorted( - set(self.select_names(must_be_in_scope_for_training=False)) - | set(self.SPECIAL_CONTENT_TYPES) - ) - - def get_output_content_types(self) -> List[ContentType]: - """Return a sorted list of ContentType objects representing valid output - content types.""" - return sorted( - set( - map( - lambda ct: self.get_or_raise(ct.target_label), - filter( - lambda ct: ct.in_scope_for_output_content_type - and ct.target_label is not None, - set(self.select(must_be_in_scope_for_training=False)), - ), - ) - ), - key=lambda ct: ct.name, - ) - - def get_output_content_types_names(self) -> List[str]: - """Return a sorted list of content type names representing valid output - content types.""" - return [ct.name for ct in self.get_output_content_types()] - - def get_invalid_labels(self, labels: Iterable[str]) -> List[str]: - not_valid_labels = set() - for label in set(labels): - if not self.is_valid_ct_label(label): - not_valid_labels.add(label) - return sorted(not_valid_labels) diff --git a/python/magika/magika.py b/python/magika/magika.py index f4b599d8..d519bbfe 100644 --- a/python/magika/magika.py +++ b/python/magika/magika.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. + import json import logging import os @@ -19,26 +20,28 @@ import time from collections import defaultdict from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Tuple import numpy as np import numpy.typing as npt import onnxruntime as rt -from tqdm.auto import tqdm -from magika.content_types import ContentType, ContentTypesManager from magika.logger import get_logger -from magika.prediction_mode import PredictionMode from magika.seekable import Buffer, File, Seekable from magika.types import ( - MagikaOutputFields, + ContentTypeInfo, + ContentTypeLabel, MagikaResult, + ModelConfig, ModelFeatures, - ModelFeaturesV2, ModelOutput, - ModelOutputFields, + PredictionMode, + Status, + StatusOr, ) +DEFAULT_MODEL_NAME = "draft_standard_v2" + class Magika: def __init__( @@ -52,44 +55,22 @@ def __init__( ) -> None: self._log = get_logger(use_colors=use_colors) - self._disable_progress_bar = True - - self._magika_config = Magika._get_magika_config() - - # Default model, used in case not specified via the Magika constructor - self._default_model_name = self._magika_config["default_model_name"] - # Minimum threshold for "default" prediction mode - self._medium_confidence_threshold = self._magika_config[ - "medium_confidence_threshold" - ] - # Minimum file size for using the DL model - self._min_file_size_for_dl = self._magika_config["min_file_size_for_dl"] - # Which integer we use to indicate padding - self._padding_token = self._magika_config["padding_token"] - self._block_size = self._magika_config["block_size"] - if verbose: self._log.setLevel(logging.INFO) - self._disable_progress_bar = False if debug: self._log.setLevel(logging.DEBUG) - self._disable_progress_bar = False if model_dir is not None: self._model_dir = model_dir else: # use default model self._model_dir = ( - Path(__file__).parent / "models" / self._default_model_name + Path(__file__).parent / "models" / self.get_default_model_name() ) self._model_path = self._model_dir / "model.onnx" - self._model_config_path = self._model_dir / "model_config.json" - self._thresholds_path = self._model_dir / "thresholds.json" - self._model_output_overwrite_map_path = ( - self._model_dir / "model_output_overwrite_map.json" - ) + self._model_config_path = self._model_dir / "config.min.json" if not self._model_dir.is_dir(): raise MagikaError(f"model dir not found at {str(self._model_dir)}") @@ -99,48 +80,38 @@ def __init__( raise MagikaError( f"model config not found at {str(self._model_config_path)}" ) - if not self._thresholds_path.is_file(): - raise MagikaError(f"thresholds not found at {str(self._thresholds_path)}") - if not self._model_output_overwrite_map_path.is_file(): - raise MagikaError( - f"thresholds not found at {str(self._model_output_overwrite_map_path)}" - ) - - self._model_config = json.loads(self._model_config_path.read_text()) - self._thresholds = json.loads(self._thresholds_path.read_text())["thresholds"] - - self._model_output_overwrite_map: Dict[str, str] = json.loads( - self._model_output_overwrite_map_path.read_text() + self._model_config: ModelConfig = Magika._load_model_config( + self._model_config_path ) - self._input_sizes: Dict[str, int] = { - "beg": self._model_config["cfg"]["input_sizes"]["beg"], - "mid": self._model_config["cfg"]["input_sizes"]["mid"], - "end": self._model_config["cfg"]["input_sizes"]["end"], - } self._target_labels_space_np = np.array( - self._model_config["train_dataset_info"]["target_labels_info"][ - "target_labels_space" - ] + list(map(str, self._model_config.target_labels_space)) ) self._prediction_mode = prediction_mode self._no_dereference = no_dereference - self._ctm = ContentTypesManager() + content_types_kb_path = ( + Path(__file__).parent / "config" / "content_types_kb.min.json" + ) + self._cts_infos = Magika._load_content_types_kb(content_types_kb_path) + + # self._ctm = ContentTypesManager() self._onnx_session = self._init_onnx_session() self._perf_stats: Dict[str, List[float]] = defaultdict(list) - def identify_path(self, path: Path) -> MagikaResult: + def identify_path(self, path: Path) -> StatusOr[MagikaResult]: return self._get_result_from_path(path) - def identify_paths(self, paths: List[Path]) -> List[MagikaResult]: + def identify_paths(self, paths: List[Path]) -> List[StatusOr[MagikaResult]]: return self._get_results_from_paths(paths) - def identify_bytes(self, content: bytes) -> MagikaResult: + def identify_bytes(self, content: bytes) -> StatusOr[MagikaResult]: + if not isinstance(content, bytes): + raise Exception(f"Content must have type 'bytes', not {type(content)}.") return self._get_result_from_bytes(content) @staticmethod @@ -151,11 +122,53 @@ def get_default_model_name() -> str: print help, etc.) without the need to instantiate a Magika object. """ - return str(Magika._get_magika_config()["default_model_name"]) + return DEFAULT_MODEL_NAME - def get_model_name(self) -> str: + def get_model_dir_name(self) -> str: return self._model_dir.name + @staticmethod + def _load_content_types_kb( + content_types_kb_json_path: Path, + ) -> Dict[ContentTypeLabel, ContentTypeInfo]: + TXT_MIME_TYPE = "text/plain" + UNKNOWN_MIME_TYPE = "application/octet-stream" + UNKNOWN_GROUP = "unknown" + + out = {} + for ct_name, ct_info in json.loads( + content_types_kb_json_path.read_text() + ).items(): + is_text = ct_info["is_text"] + if is_text: + default_mime_type = TXT_MIME_TYPE + else: + default_mime_type = UNKNOWN_MIME_TYPE + mime_type = ( + default_mime_type + if ct_info["mime_type"] is None + else ct_info["mime_type"] + ) + group = UNKNOWN_GROUP if ct_info["group"] is None else ct_info["group"] + description = ( + ct_name if ct_info["description"] is None else ct_info["description"] + ) + extensions = ct_info["extensions"] + out[ContentTypeLabel(ct_name)] = ContentTypeInfo( + label=ContentTypeLabel(ct_name), + mime_type=mime_type, + group=group, + description=description, + extensions=extensions, + is_text=is_text, + ) + return out + + @staticmethod + def _load_model_config(model_config_path: Path) -> ModelConfig: + config = json.loads(model_config_path.read_text()) + return ModelConfig(**config) + def _init_onnx_session(self) -> rt.InferenceSession: start_time = time.time() rt.disable_telemetry_events() @@ -170,12 +183,12 @@ def _init_onnx_session(self) -> rt.InferenceSession: ) return onnx_session - @staticmethod - def _get_magika_config() -> Dict[str, Any]: - config_path = Path(__file__).parent / "config" / "magika_config.json" - return json.loads(config_path.read_text()) # type: ignore[no-any-return] + def _get_ct_info(self, content_type: ContentTypeLabel) -> ContentTypeInfo: + return self._cts_infos[content_type] - def _get_results_from_paths(self, paths: List[Path]) -> List[MagikaResult]: + def _get_results_from_paths( + self, paths: List[Path] + ) -> List[StatusOr[MagikaResult]]: """Given a list of paths, returns a list of predictions. Each prediction is a dict with the relevant information, such as the file path, the output of the DL model, the output of the tool, and the associated @@ -188,7 +201,7 @@ def _get_results_from_paths(self, paths: List[Path]) -> List[MagikaResult]: # We use a "str" instead of Path because it makes it easier later on to # serialize. - all_outputs: Dict[str, MagikaResult] = {} # {path: MagikaOutput, ...} + all_outputs: Dict[str, StatusOr[MagikaResult]] = {} # {path: , ...} # We use a list and not the dict because that's what we need later on # for inference. @@ -198,7 +211,7 @@ def _get_results_from_paths(self, paths: List[Path]) -> List[MagikaResult]: f"Processing input files and extracting features for {len(paths)} samples" ) start_time = time.time() - for path in tqdm(paths, disable=self._disable_progress_bar): + for path in paths: output, features = self._get_result_or_features_from_path(path) if output is not None: all_outputs[str(path)] = output @@ -209,8 +222,8 @@ def _get_results_from_paths(self, paths: List[Path]) -> List[MagikaResult]: self._log.debug(f"First pass and features extracted in {elapsed_time:.03f} ms") # Get the outputs via DL for the files that need it. - outputs_with_dl = self._get_results_from_features(all_features) - all_outputs.update(outputs_with_dl) + for path_str, result in self._get_results_from_features(all_features).items(): + all_outputs[path_str] = result # Finally, we collect the predictions in a final list, sorted by the # initial paths list (and not by insertion order). @@ -219,10 +232,10 @@ def _get_results_from_paths(self, paths: List[Path]) -> List[MagikaResult]: sorted_outputs.append(all_outputs[str(path)]) return sorted_outputs - def _get_result_from_path(self, path: Path) -> MagikaResult: + def _get_result_from_path(self, path: Path) -> StatusOr[MagikaResult]: return self._get_results_from_paths([path])[0] - def _get_result_from_bytes(self, content: bytes) -> MagikaResult: + def _get_result_from_bytes(self, content: bytes) -> StatusOr[MagikaResult]: result, features = self._get_result_or_features_from_bytes(content) if result is not None: return result @@ -237,11 +250,18 @@ def _extract_features_from_path( end_size: int, padding_token: int, block_size: int, + use_inputs_at_offsets: bool, ) -> ModelFeatures: # TODO: reimplement this using a context manager seekable = File(file_path) mf = Magika._extract_features_from_seekable( - seekable, beg_size, mid_size, end_size, padding_token, block_size + seekable, + beg_size, + mid_size, + end_size, + padding_token, + block_size, + use_inputs_at_offsets, ) seekable.close() return mf @@ -254,10 +274,17 @@ def _extract_features_from_bytes( end_size: int, padding_token: int, block_size: int, + use_inputs_at_offsets: bool, ) -> ModelFeatures: buffer = Buffer(content) return Magika._extract_features_from_seekable( - buffer, beg_size, mid_size, end_size, padding_token, block_size + buffer, + beg_size, + mid_size, + end_size, + padding_token, + block_size, + use_inputs_at_offsets, ) @staticmethod @@ -268,83 +295,8 @@ def _extract_features_from_seekable( end_size: int, padding_token: int, block_size: int, + use_inputs_at_offsets: bool, ) -> ModelFeatures: - """This implement features extraction from a seekable, which is an - abstraction about anything that can be "read_at" a specific offset, such - as a file or buffer. This is implemented so that we do not need to load - the entire content of the file in memory, and we do not need to scan the - entire buffer. - - High-level overview on what we do: - - beg: we read the first block in memory, we lstrip() it, and we use this as - the basis to extract beg_size integers (we either truncate to beg_size - or we add padding as suffix up to beg_size). - - end: same as "beg", but we read the last block in memory, and the padding - is prefixed (and not suffixed). - - mid: we consider the remaining content (after stripping whitespace), - and we take the mid_size bytes in the middle. If needed, we add padding - to the left and to the right. - """ - - if seekable.size < (2 * block_size + mid_size): - # If the content is small, we take this shortcut to avoid - # checking for too many corner cases. - content = seekable.read_at(0, seekable.size) - content = content.strip() - beg_content = content - mid_content = content - end_content = content - - else: # seekable.size >= (2 * block_size + mid_size) - # If the content is big enough, the implementation becomes much - # simpler. In this path of the code, we know we have enough content - # to strip up to "block_size" bytes from both sides, and still have - # enough data for mid_size. - - beg_content = seekable.read_at(0, block_size).lstrip() - - end_content = seekable.read_at( - seekable.size - block_size, block_size - ).rstrip() - - # we extract "mid" from the middle of the content that we have not - # trimmed - trimmed_beg_bytes_num = block_size - len(beg_content) - trimmed_end_bytes_num = block_size - len(end_content) - # mid_idx points to the first byte of the middle block - mid_idx = ( - trimmed_beg_bytes_num - + ( - seekable.size - - trimmed_beg_bytes_num - - trimmed_end_bytes_num - - mid_size - ) - // 2 - ) - mid_content = seekable.read_at(mid_idx, mid_size) - - beg_ints = Magika._get_beg_ints_with_padding( - beg_content, beg_size, padding_token - ) - mid_ints = Magika._get_mid_ints_with_padding( - mid_content, mid_size, padding_token - ) - end_ints = Magika._get_end_ints_with_padding( - end_content, end_size, padding_token - ) - - return ModelFeatures(beg=beg_ints, mid=mid_ints, end=end_ints) - - @staticmethod - def _extract_features_from_seekable_v2( - seekable: Seekable, - beg_size: int, - mid_size: int, - end_size: int, - padding_token: int, - block_size: int, - ) -> ModelFeaturesV2: """This implement v2 of the features extraction v2 from a seekable, which is an abstraction about anything that can be "read_at" a specific offset, such as a file or buffer. This is implemented so that we do not @@ -357,25 +309,18 @@ def _extract_features_from_seekable_v2( we have too many or too few. Blocks extraction and padding: - - beg: we read the first block in memory, we lstrip() it, and we use + - beg: we read the first block_size bytes, we lstrip() it, and we use this as the basis to extract beg_size integers. If we have too many bytes, we only consider the first beg_size ones. If we do not have enough, we add padding as suffix (up to beg_size integers). - mid: we determine "where the middle is" by using the entire content's - size, and we take the mid_size bytes in the middle. If we do not have - enough bytes, we add padding to the left and to the right. In case we - need to add an odd number of padding integers, we add an extra one to - the right. - - end: same as "beg", but we read the last block in memory, we rstrip() + size (before stripping the whitespace-like characters), and we take the + mid_size bytes in the middle. If we do not have enough bytes, we add + padding to the left and to the right. In case we need to add an odd + number of padding integers, we add an extra one to the right. + - end: same as "beg", but we read the last block_size bytes, we rstrip() (instead of lstrip()), and, if needed, we add padding as a prefix (and not as a suffix like we do with "beg"). - - Notes about similarities and differences with v1: the main difference is - that whether we strip some bytes from beg and end does not influence - which bytes we pick for the middle part. This makes the implementation - of v2 much simpler. And it makes it possible for a client to just read a - block at the beginning, middle, and end, and send it to our backend for - features extraction -- no need for additional check on the client side. """ assert beg_size < block_size @@ -385,41 +330,56 @@ def _extract_features_from_seekable_v2( # we read at most block_size bytes bytes_num_to_read = min(block_size, seekable.size) - beg_content = seekable.read_at(0, bytes_num_to_read).lstrip() - beg_ints = Magika._get_beg_ints_with_padding( - beg_content, beg_size, padding_token - ) - - end_content = seekable.read_at( - seekable.size - bytes_num_to_read, bytes_num_to_read - ).rstrip() - end_ints = Magika._get_end_ints_with_padding( - end_content, end_size, padding_token - ) + if beg_size > 0: + beg_content = seekable.read_at(0, bytes_num_to_read).lstrip() + beg_ints = Magika._get_beg_ints_with_padding( + beg_content, beg_size, padding_token + ) + else: + beg_ints = [] + + if mid_size > 0: + # mid_idx points to the left-most offset to read for the "mid" component + # of the features. + mid_bytes_num_to_read = min(seekable.size, mid_size) + mid_idx = (seekable.size - mid_bytes_num_to_read) // 2 + mid_content = seekable.read_at(mid_idx, mid_bytes_num_to_read) + mid_ints = Magika._get_mid_ints_with_padding( + mid_content, mid_size, padding_token + ) + else: + mid_ints = [] - # mid_idx points to the left-most offset to read for the "mid" component - # of the features. - mid_bytes_num_to_read = min(seekable.size, mid_size) - mid_idx = (seekable.size - mid_bytes_num_to_read) // 2 - mid_content = seekable.read_at(mid_idx, mid_bytes_num_to_read) - mid_ints = Magika._get_mid_ints_with_padding( - mid_content, mid_size, padding_token - ) + if end_size > 0: + end_content = seekable.read_at( + seekable.size - bytes_num_to_read, bytes_num_to_read + ).rstrip() + end_ints = Magika._get_end_ints_with_padding( + end_content, end_size, padding_token + ) + else: + end_ints = [] - offset_0x8000_0x8007 = Magika._get_ints_at_offset_or_padding( - seekable, 0x8000, 8, padding_token - ) - offset_0x8800_0x8807 = Magika._get_ints_at_offset_or_padding( - seekable, 0x8800, 8, padding_token - ) - offset_0x9000_0x9007 = Magika._get_ints_at_offset_or_padding( - seekable, 0x9000, 8, padding_token - ) - offset_0x9800_0x9807 = Magika._get_ints_at_offset_or_padding( - seekable, 0x9800, 8, padding_token - ) + if use_inputs_at_offsets: + offset_0x8000_0x8007 = Magika._get_ints_at_offset_or_padding( + seekable, 0x8000, 8, padding_token + ) + offset_0x8800_0x8807 = Magika._get_ints_at_offset_or_padding( + seekable, 0x8800, 8, padding_token + ) + offset_0x9000_0x9007 = Magika._get_ints_at_offset_or_padding( + seekable, 0x9000, 8, padding_token + ) + offset_0x9800_0x9807 = Magika._get_ints_at_offset_or_padding( + seekable, 0x9800, 8, padding_token + ) + else: + offset_0x8000_0x8007 = [] + offset_0x8800_0x8807 = [] + offset_0x9000_0x9007 = [] + offset_0x9800_0x9807 = [] - return ModelFeaturesV2( + return ModelFeatures( beg=beg_ints, mid=mid_ints, end=end_ints, @@ -433,9 +393,11 @@ def _extract_features_from_seekable_v2( def _get_beg_ints_with_padding( beg_content: bytes, beg_size: int, padding_token: int ) -> List[int]: - """Take an (already-stripped) buffer as input and extract beg ints. If + """Take an (already-stripped) buffer as input and extract beg ints. + This returns a list of integers whose length is exactly beg_size. If the buffer is bigger than required, take only the initial portion. If - the buffer is shorter, add padding at the end.""" + the buffer is shorter, add padding at the end. + """ if beg_size < len(beg_content): # we don't need so many bytes @@ -455,8 +417,9 @@ def _get_beg_ints_with_padding( def _get_mid_ints_with_padding( mid_content: bytes, mid_size: int, padding_token: int ) -> List[int]: - """Take a buffer as input and extract mid ints. If the buffer is bigger - than required, take only its middle part. If the buffer is shorter, add + """Take a buffer as input and extract mid ints. This returns a list of + integers whose length is exactly mid_size. If the buffer is bigger than + required, take only its middle part. If the buffer is shorter, add padding to its left and right. If we need to add an odd number of padding integers, add an extra one to the right. """ @@ -486,8 +449,9 @@ def _get_mid_ints_with_padding( def _get_end_ints_with_padding( end_content: bytes, end_size: int, padding_token: int ) -> List[int]: - """Take an (already-stripped) buffer as input and extract end ints. If - the buffer is bigger than required, take only the last portion. If the + """Take an (already-stripped) buffer as input and extract end ints. This + returns a list of integers whose length is exactly end_size. If the + buffer is bigger than required, take only the last portion. If the buffer is shorter, add padding at the beginning. """ @@ -522,7 +486,7 @@ def _get_model_outputs_from_features( scores = np.max(raw_preds, axis=1) return [ - (path, ModelOutput(ct_label=ct_label, score=float(score))) + (path, ModelOutput(ct_label=ContentTypeLabel(ct_label), score=float(score))) for (path, _), ct_label, score in zip( all_features, preds_content_types_labels, scores ) @@ -530,14 +494,14 @@ def _get_model_outputs_from_features( def _get_results_from_features( self, all_features: List[Tuple[Path, ModelFeatures]] - ) -> Dict[str, MagikaResult]: + ) -> Dict[str, StatusOr[MagikaResult]]: # We now do inference for those files that need it. if len(all_features) == 0: # nothing to be done return {} - outputs: Dict[str, MagikaResult] = {} + results: Dict[str, StatusOr[MagikaResult]] = {} for path, model_output in self._get_model_outputs_from_features(all_features): # In additional to the content type label from the DL model, we @@ -550,18 +514,17 @@ def _get_results_from_features( model_output.ct_label, model_output.score ) - outputs[str(path)] = self._get_result_from_labels_and_score( - path, + results[str(path)] = self._get_result_from_labels_and_score( dl_ct_label=model_output.ct_label, output_ct_label=output_ct_label, score=model_output.score, ) - return outputs + return results def _get_result_from_features( self, features: ModelFeatures, path: Optional[Path] = None - ) -> MagikaResult: + ) -> StatusOr[MagikaResult]: # This is useful to scan from stream of bytes if path is None: path = Path("-") @@ -570,24 +533,27 @@ def _get_result_from_features( return result_with_dl def _get_output_ct_label_from_dl_result( - self, dl_ct_label: str, score: float - ) -> str: + self, dl_ct_label: ContentTypeLabel, score: float + ) -> ContentTypeLabel: # overwrite ct_label if specified in the config - dl_ct_label = self._model_output_overwrite_map.get(dl_ct_label, dl_ct_label) + dl_ct_label = self._model_config.overwrite_map.get(dl_ct_label, dl_ct_label) if self._prediction_mode == PredictionMode.BEST_GUESS: # We take the model predictions, no matter what the score is. output_ct_label = dl_ct_label elif ( self._prediction_mode == PredictionMode.HIGH_CONFIDENCE - and score >= self._thresholds[dl_ct_label] + and score + >= self._model_config.thresholds.get( + dl_ct_label, self._model_config.medium_confidence_threshold + ) ): # The model score is higher than the per-content-type # high-confidence threshold. output_ct_label = dl_ct_label elif ( self._prediction_mode == PredictionMode.MEDIUM_CONFIDENCE - and score >= self._medium_confidence_threshold + and score >= self._model_config.medium_confidence_threshold ): # We take the model prediction only if the score is above a given # relatively loose threshold. @@ -598,70 +564,30 @@ def _get_output_ct_label_from_dl_result( # the model has, at the very least, got the binary vs. text category # right. This allows us to pick between unknown and txt without the # need to read or scan the file bytes once again. - if self._ctm.get_or_raise(dl_ct_label).is_text: - output_ct_label = ContentType.GENERIC_TEXT + if self._get_ct_info(dl_ct_label).is_text: + output_ct_label = ContentTypeLabel.TXT else: - output_ct_label = ContentType.UNKNOWN + output_ct_label = ContentTypeLabel.UNKNOWN return output_ct_label def _get_result_from_labels_and_score( - self, path: Path, dl_ct_label: Optional[str], score: float, output_ct_label: str - ) -> MagikaResult: - dl_score = None if dl_ct_label is None else score - output_score = score - - # add group info - dl_group = None if dl_ct_label is None else self._ctm.get_group(dl_ct_label) - output_group = self._ctm.get_group(output_ct_label) - - # add mime type info - dl_mime_type = ( - None if dl_ct_label is None else self._ctm.get_mime_type(dl_ct_label) - ) - output_mime_type = self._ctm.get_mime_type(output_ct_label) - - # add magic - dl_magic = None if dl_ct_label is None else self._ctm.get_magic(dl_ct_label) - output_magic = self._ctm.get_magic(output_ct_label) - - # add description - dl_description = ( - None if dl_ct_label is None else self._ctm.get_description(dl_ct_label) - ) - output_description = self._ctm.get_description(output_ct_label) - - # add is_text - dl_is_text = None if dl_ct_label is None else self._ctm.get_is_text(dl_ct_label) - output_is_text = self._ctm.get_is_text(output_ct_label) - - magika_result = MagikaResult( - path=str(path), - dl=ModelOutputFields( - ct_label=dl_ct_label, - score=dl_score, - group=dl_group, - mime_type=dl_mime_type, - magic=dl_magic, - description=dl_description, - is_text=dl_is_text, - ), - output=MagikaOutputFields( - ct_label=output_ct_label, - score=output_score, - group=output_group, - mime_type=output_mime_type, - magic=output_magic, - description=output_description, - is_text=output_is_text, - ), + self, + dl_ct_label: ContentTypeLabel, + output_ct_label: ContentTypeLabel, + score: float, + ) -> StatusOr[MagikaResult]: + return StatusOr( + value=MagikaResult( + dl=self._get_ct_info(dl_ct_label), + output=self._get_ct_info(output_ct_label), + score=score, + ) ) - return magika_result - def _get_result_or_features_from_path( self, path: Path - ) -> Tuple[Optional[MagikaResult], Optional[ModelFeatures]]: + ) -> Tuple[Optional[StatusOr[MagikaResult]], Optional[ModelFeatures]]: """ Given a path, we return either a MagikaOutput or a MagikaFeatures. @@ -677,61 +603,46 @@ def _get_result_or_features_from_path( if self._no_dereference and path.is_symlink(): result = self._get_result_from_labels_and_score( - path, dl_ct_label=None, output_ct_label=ContentType.SYMLINK, score=1.0 - ) - # The magic and description fields for symlink contain a placeholder - # for ; let's patch the output to reflect that. - result.output.magic = result.output.magic.replace( - "", str(path.resolve()) - ) - result.output.description = result.output.description.replace( - "", str(path.resolve()) + dl_ct_label=ContentTypeLabel.UNDEFINED, + output_ct_label=ContentTypeLabel.SYMLINK, + score=1.0, ) return result, None if not path.exists(): - result = self._get_result_from_labels_and_score( - path, - dl_ct_label=None, - output_ct_label=ContentType.FILE_DOES_NOT_EXIST, - score=1.0, - ) - return result, None + return StatusOr(status=Status.FILE_NOT_FOUND_ERROR), None if path.is_file(): if path.stat().st_size == 0: result = self._get_result_from_labels_and_score( - path, dl_ct_label=None, output_ct_label=ContentType.EMPTY, score=1.0 + dl_ct_label=ContentTypeLabel.UNDEFINED, + output_ct_label=ContentTypeLabel.EMPTY, + score=1.0, ) return result, None elif not os.access(path, os.R_OK): - result = self._get_result_from_labels_and_score( - path, - dl_ct_label=None, - output_ct_label=ContentType.PERMISSION_ERROR, - score=1.0, - ) - return result, None + return StatusOr(status=Status.PERMISSION_ERROR), None - elif path.stat().st_size <= self._min_file_size_for_dl: + elif path.stat().st_size <= self._model_config.min_file_size_for_dl: result = self._get_result_from_first_block_of_file(path) return result, None else: file_features = Magika._extract_features_from_path( path, - self._input_sizes["beg"], - self._input_sizes["mid"], - self._input_sizes["end"], - self._padding_token, - self._block_size, + self._model_config.beg_size, + self._model_config.mid_size, + self._model_config.end_size, + self._model_config.padding_token, + self._model_config.block_size, + self._model_config.use_inputs_at_offsets, ) # Check whether we have enough bytes for a meaningful # detection, and not just padding. if ( - file_features.beg[self._min_file_size_for_dl - 1] - == self._padding_token + file_features.beg[self._model_config.min_file_size_for_dl - 1] + == self._model_config.padding_token ): # If the n-th token is padding, then it means that, # post-stripping, we do not have enough meaningful @@ -747,13 +658,17 @@ def _get_result_or_features_from_path( elif path.is_dir(): result = self._get_result_from_labels_and_score( - path, dl_ct_label=None, output_ct_label=ContentType.DIRECTORY, score=1.0 + dl_ct_label=ContentTypeLabel.UNDEFINED, + output_ct_label=ContentTypeLabel.DIRECTORY, + score=1.0, ) return result, None else: result = self._get_result_from_labels_and_score( - path, dl_ct_label=None, output_ct_label=ContentType.UNKNOWN, score=1.0 + dl_ct_label=ContentTypeLabel.UNDEFINED, + output_ct_label=ContentTypeLabel.UNKNOWN, + score=1.0, ) return result, None @@ -761,67 +676,73 @@ def _get_result_or_features_from_path( def _get_result_or_features_from_bytes( self, content: bytes - ) -> Tuple[Optional[MagikaResult], Optional[ModelFeatures]]: + ) -> Tuple[Optional[StatusOr[MagikaResult]], Optional[ModelFeatures]]: if len(content) == 0: - output = self._get_result_from_labels_and_score( - Path("-"), - dl_ct_label=None, - output_ct_label=ContentType.EMPTY, + result = self._get_result_from_labels_and_score( + dl_ct_label=ContentTypeLabel.UNDEFINED, + output_ct_label=ContentTypeLabel.EMPTY, score=1.0, ) - return output, None + return result, None - elif len(content) <= self._min_file_size_for_dl: - output = self._get_result_of_few_bytes(content) - return output, None + elif len(content) <= self._model_config.min_file_size_for_dl: + result = self._get_result_from_few_bytes(content) + return result, None else: file_features = Magika._extract_features_from_bytes( content, - self._input_sizes["beg"], - self._input_sizes["mid"], - self._input_sizes["end"], - self._padding_token, - self._block_size, + self._model_config.beg_size, + self._model_config.mid_size, + self._model_config.end_size, + self._model_config.padding_token, + self._model_config.block_size, + self._model_config.use_inputs_at_offsets, ) # Check whether we have enough bytes for a meaningful # detection, and not just padding. - if file_features.beg[self._min_file_size_for_dl - 1] == self._padding_token: + if ( + file_features.beg[self._model_config.min_file_size_for_dl - 1] + == self._model_config.padding_token + ): # If the n-th token is padding, then it means that, # post-stripping, we do not have enough meaningful # bytes. - output = self._get_result_of_few_bytes(content) - return output, None + result = self._get_result_from_few_bytes(content) + return result, None else: # We have enough bytes, scheduling this file for model # prediction. - # features.append((path, file_features)) return None, file_features raise Exception("unreachable") - def _get_result_from_first_block_of_file(self, path: Path) -> MagikaResult: + def _get_result_from_first_block_of_file( + self, path: Path + ) -> StatusOr[MagikaResult]: # We read at most "block_size" bytes with open(path, "rb") as f: - content = f.read(self._block_size) - return self._get_result_of_few_bytes(content, path) + content = f.read(self._model_config.block_size) + return self._get_result_from_few_bytes(content, path) - def _get_result_of_few_bytes( + def _get_result_from_few_bytes( self, content: bytes, path: Path = Path("-") - ) -> MagikaResult: - assert len(content) <= 4 * self._block_size - ct_label = self._get_ct_label_of_few_bytes(content) + ) -> StatusOr[MagikaResult]: + assert len(content) <= 4 * self._model_config.block_size + ct_label = self._get_ct_label_from_few_bytes(content) return self._get_result_from_labels_and_score( - path, dl_ct_label=None, output_ct_label=ct_label, score=1.0 + dl_ct_label=ContentTypeLabel.UNDEFINED, + output_ct_label=ct_label, + score=1.0, ) - def _get_ct_label_of_few_bytes(self, content: bytes) -> str: + def _get_ct_label_from_few_bytes(self, content: bytes) -> ContentTypeLabel: try: - ct_label = ContentType.GENERIC_TEXT + ct_label = ContentTypeLabel.TXT _ = content.decode("utf-8") except UnicodeDecodeError: - ct_label = ContentType.UNKNOWN + ct_label = ContentTypeLabel.UNKNOWN return ct_label def _get_raw_predictions( @@ -832,20 +753,18 @@ def _get_raw_predictions( matrix encoding the predictions. """ - dataset_format = self._model_config["train_dataset_info"]["dataset_format"] - assert dataset_format == "int-concat/one-hot" start_time = time.time() X_bytes = [] for _, fs in features: sample_bytes = [] - if self._input_sizes["beg"] > 0: - sample_bytes.extend(fs.beg[: self._input_sizes["beg"]]) - if self._input_sizes["mid"] > 0: - sample_bytes.extend(fs.mid[: self._input_sizes["mid"]]) - if self._input_sizes["end"] > 0: - sample_bytes.extend(fs.end[-self._input_sizes["end"] :]) + if self._model_config.beg_size > 0: + sample_bytes.extend(fs.beg[: self._model_config.beg_size]) + if self._model_config.mid_size > 0: + sample_bytes.extend(fs.mid[: self._model_config.mid_size]) + if self._model_config.end_size > 0: + sample_bytes.extend(fs.end[-self._model_config.end_size :]) X_bytes.append(sample_bytes) - X = np.array(X_bytes).astype(np.float32) + X = np.array(X_bytes, dtype=np.int32) elapsed_time = 1000 * (time.time() - start_time) self._log.debug(f"DL input prepared in {elapsed_time:.03f} ms") diff --git a/python/magika/models/draft_begonly_v2/config.min.json b/python/magika/models/draft_begonly_v2/config.min.json new file mode 100644 index 00000000..63dad503 --- /dev/null +++ b/python/magika/models/draft_begonly_v2/config.min.json @@ -0,0 +1 @@ +{"beg_size": 2048, "mid_size": 0, "end_size": 0, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", "chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", "markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", "squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", "toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {}, "overwrite_map": {}} \ No newline at end of file diff --git a/python/magika/models/draft_begonly_v2/model.onnx b/python/magika/models/draft_begonly_v2/model.onnx new file mode 100644 index 00000000..ed0e099c Binary files /dev/null and b/python/magika/models/draft_begonly_v2/model.onnx differ diff --git a/python/magika/models/draft_fast_v2/config.min.json b/python/magika/models/draft_fast_v2/config.min.json new file mode 100644 index 00000000..87042418 --- /dev/null +++ b/python/magika/models/draft_fast_v2/config.min.json @@ -0,0 +1 @@ +{"beg_size": 512, "mid_size": 0, "end_size": 512, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", "chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", "markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", "squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", "toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {}, "overwrite_map": {}} \ No newline at end of file diff --git a/python/magika/models/draft_fast_v2/model.onnx b/python/magika/models/draft_fast_v2/model.onnx new file mode 100644 index 00000000..01f66430 Binary files /dev/null and b/python/magika/models/draft_fast_v2/model.onnx differ diff --git a/python/magika/models/draft_standard_v2/config.min.json b/python/magika/models/draft_standard_v2/config.min.json new file mode 100644 index 00000000..ff83e7bc --- /dev/null +++ b/python/magika/models/draft_standard_v2/config.min.json @@ -0,0 +1 @@ +{"beg_size": 2048, "mid_size": 0, "end_size": 2048, "use_inputs_at_offsets": false, "medium_confidence_threshold": 0.5, "min_file_size_for_dl": 8, "padding_token": 256, "block_size": 4096, "target_labels_space": ["3gp", "ace", "ai", "aidl", "apk", "applebplist", "appleplist", "asm", "asp", "autohotkey", "autoit", "awk", "batch", "bazel", "bib", "bmp", "bzip", "c", "cab", "cat", "chm", "clojure", "cmake", "cobol", "coff", "coffeescript", "cpp", "crt", "crx", "cs", "csproj", "css", "csv", "dart", "deb", "dex", "dicom", "diff", "dm", "dmg", "doc", "dockerfile", "docx", "dsstore", "dwg", "dxf", "elf", "elixir", "emf", "eml", "epub", "erb", "erlang", "flac", "flv", "fortran", "gemfile", "gemspec", "gif", "gitattributes", "gitmodules", "go", "gradle", "groovy", "gzip", "h5", "handlebars", "haskell", "hcl", "hlp", "htaccess", "html", "icns", "ico", "ics", "ignorefile", "ini", "internetshortcut", "ipynb", "iso", "jar", "java", "javabytecode", "javascript", "jinja", "jp2", "jpeg", "json", "jsonl", "julia", "kotlin", "latex", "lha", "lisp", "lnk", "lua", "m3u", "m4", "macho", "makefile", "markdown", "matlab", "mht", "midi", "mkv", "mp3", "mp4", "mscompress", "msi", "mum", "npy", "npz", "nupkg", "objectivec", "ocaml", "odp", "ods", "odt", "ogg", "one", "onnx", "otf", "outlook", "parquet", "pascal", "pcap", "pdb", "pdf", "pebin", "pem", "perl", "php", "pickle", "png", "po", "postscript", "powershell", "ppt", "pptx", "prolog", "proteindb", "proto", "psd", "python", "pythonbytecode", "qt", "r", "rar", "rdf", "rpm", "rst", "rtf", "ruby", "rust", "scala", "scss", "sevenzip", "sgml", "shell", "smali", "snap", "solidity", "sql", "sqlite", "squashfs", "srt", "stlbinary", "stltext", "sum", "svg", "swf", "swift", "tar", "tcl", "textproto", "tga", "thumbsdb", "tiff", "toml", "torrent", "tsv", "ttf", "twig", "txt", "typescript", "unknown", "vba", "vcxproj", "verilog", "vhdl", "vtt", "vue", "wasm", "wav", "webm", "webp", "winregistry", "wmf", "woff", "woff2", "xar", "xls", "xlsb", "xlsx", "xml", "xpi", "xz", "yaml", "yara", "zig", "zip", "zlibstream"], "thresholds": {}, "overwrite_map": {}} \ No newline at end of file diff --git a/python/magika/models/draft_standard_v2/model.onnx b/python/magika/models/draft_standard_v2/model.onnx new file mode 100644 index 00000000..19e0d4bf Binary files /dev/null and b/python/magika/models/draft_standard_v2/model.onnx differ diff --git a/python/magika/types/__init__.py b/python/magika/types/__init__.py new file mode 100644 index 00000000..990e2170 --- /dev/null +++ b/python/magika/types/__init__.py @@ -0,0 +1,38 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from magika.types.content_type_info import ContentTypeInfo # noqa: F401 +from magika.types.content_type_label import ContentTypeLabel # noqa: F401 +from magika.types.magika_result import MagikaResult # noqa: F401 +from magika.types.model import ( # noqa: F401 + ModelConfig, + ModelFeatures, + ModelOutput, +) +from magika.types.prediction_mode import PredictionMode # noqa: F401 +from magika.types.status import Status # noqa: F401 +from magika.types.statusor import StatusOr # noqa: F401 + +__all__ = [ + "ContentTypeInfo", + "ContentTypeLabel", + "MagikaResult", + "ModelConfig", + "ModelFeatures", + "ModelOutput", + "PredictionMode", + "Status", + "StatusOr", +] diff --git a/python/magika/types/content_type_info.py b/python/magika/types/content_type_info.py new file mode 100644 index 00000000..2c2ff0b6 --- /dev/null +++ b/python/magika/types/content_type_info.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass +from typing import List + +from magika.types.content_type_label import ContentTypeLabel + + +@dataclass(frozen=True) +class ContentTypeInfo: + label: ContentTypeLabel + mime_type: str + group: str + description: str + extensions: List[str] + is_text: bool diff --git a/python/magika/types/content_type_label.py b/python/magika/types/content_type_label.py new file mode 100644 index 00000000..b5130a9c --- /dev/null +++ b/python/magika/types/content_type_label.py @@ -0,0 +1,375 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from magika.types.strenum import StrEnum + +# NOTE: DO NOT EDIT --- This file is automatically generated. + + +# This is the list of all possible content types we know about; however, models +# support a smaller subset of them. See model's config for details. +class ContentTypeLabel(StrEnum): + _3DS = "3ds" + _3DSM = "3dsm" + _3DSX = "3dsx" + _3GP = "3gp" + _3MF = "3mf" + ABNF = "abnf" + ACE = "ace" + ADA = "ada" + AFF = "aff" + AI = "ai" + AIDL = "aidl" + ALGOL68 = "algol68" + ANI = "ani" + APK = "apk" + APPLEBPLIST = "applebplist" + APPLEDOUBLE = "appledouble" + APPLEPLIST = "appleplist" + APPLESINGLE = "applesingle" + AR = "ar" + ARC = "arc" + ARJ = "arj" + ARROW = "arrow" + ASC = "asc" + ASD = "asd" + ASF = "asf" + ASM = "asm" + ASP = "asp" + AUTOHOTKEY = "autohotkey" + AUTOIT = "autoit" + AVI = "avi" + AVIF = "avif" + AVRO = "avro" + AWK = "awk" + AX = "ax" + BATCH = "batch" + BAZEL = "bazel" + BCAD = "bcad" + BIB = "bib" + BMP = "bmp" + BPG = "bpg" + BPL = "bpl" + BRAINFUCK = "brainfuck" + BRF = "brf" + BZIP = "bzip" + BZIP3 = "bzip3" + C = "c" + CAB = "cab" + CAD = "cad" + CAT = "cat" + CDF = "cdf" + CHM = "chm" + CLOJURE = "clojure" + CMAKE = "cmake" + COBOL = "cobol" + COFF = "coff" + COFFEESCRIPT = "coffeescript" + COM = "com" + CPL = "cpl" + CPP = "cpp" + CRT = "crt" + CRX = "crx" + CS = "cs" + CSPROJ = "csproj" + CSS = "css" + CSV = "csv" + CTL = "ctl" + DART = "dart" + DEB = "deb" + DEX = "dex" + DEY = "dey" + DICOM = "dicom" + DIFF = "diff" + DIRECTORY = "directory" + DJANGO = "django" + DLL = "dll" + DM = "dm" + DMG = "dmg" + DMIGD = "dmigd" + DMSCRIPT = "dmscript" + DOC = "doc" + DOCKERFILE = "dockerfile" + DOCX = "docx" + DOSMBR = "dosmbr" + DOTX = "dotx" + DSSTORE = "dsstore" + DWG = "dwg" + DXF = "dxf" + DYLIB = "dylib" + EBML = "ebml" + ELF = "elf" + ELIXIR = "elixir" + EMF = "emf" + EML = "eml" + EMPTY = "empty" + EPUB = "epub" + ERB = "erb" + ERLANG = "erlang" + ESE = "ese" + EXE = "exe" + EXP = "exp" + FLAC = "flac" + FLUTTER = "flutter" + FLV = "flv" + FORTRAN = "fortran" + FPX = "fpx" + GEMFILE = "gemfile" + GEMSPEC = "gemspec" + GIF = "gif" + GITATTRIBUTES = "gitattributes" + GITMODULES = "gitmodules" + GLEAM = "gleam" + GO = "go" + GPX = "gpx" + GRADLE = "gradle" + GROOVY = "groovy" + GZIP = "gzip" + H = "h" + H5 = "h5" + HANDLEBARS = "handlebars" + HASKELL = "haskell" + HCL = "hcl" + HEIF = "heif" + HFS = "hfs" + HLP = "hlp" + HPP = "hpp" + HTA = "hta" + HTACCESS = "htaccess" + HTML = "html" + HVE = "hve" + HWP = "hwp" + ICC = "icc" + ICNS = "icns" + ICO = "ico" + ICS = "ics" + IGNOREFILE = "ignorefile" + IMG = "img" + INI = "ini" + INTERNETSHORTCUT = "internetshortcut" + IOSAPP = "iosapp" + IPYNB = "ipynb" + ISO = "iso" + JAR = "jar" + JAVA = "java" + JAVABYTECODE = "javabytecode" + JAVASCRIPT = "javascript" + JINJA = "jinja" + JNG = "jng" + JNLP = "jnlp" + JP2 = "jp2" + JPEG = "jpeg" + JSON = "json" + JSONC = "jsonc" + JSONL = "jsonl" + JSX = "jsx" + JULIA = "julia" + JXL = "jxl" + KO = "ko" + KOTLIN = "kotlin" + KS = "ks" + LATEX = "latex" + LATEXAUX = "latexaux" + LESS = "less" + LHA = "lha" + LICENSE = "license" + LISP = "lisp" + LITCS = "litcs" + LNK = "lnk" + LOCK = "lock" + LRZ = "lrz" + LUA = "lua" + LZ = "lz" + LZ4 = "lz4" + LZX = "lzx" + M3U = "m3u" + M4 = "m4" + MACHO = "macho" + MAFF = "maff" + MAKEFILE = "makefile" + MARKDOWN = "markdown" + MATLAB = "matlab" + MHT = "mht" + MIDI = "midi" + MKV = "mkv" + MP2 = "mp2" + MP3 = "mp3" + MP4 = "mp4" + MPEGTS = "mpegts" + MSCOMPRESS = "mscompress" + MSI = "msi" + MSIX = "msix" + MST = "mst" + MUI = "mui" + MUM = "mum" + MUN = "mun" + NIM = "nim" + NPY = "npy" + NPZ = "npz" + NULL = "null" + NUPKG = "nupkg" + OBJECT = "object" + OBJECTIVEC = "objectivec" + OCAML = "ocaml" + OCX = "ocx" + ODEX = "odex" + ODIN = "odin" + ODP = "odp" + ODS = "ods" + ODT = "odt" + OGG = "ogg" + OLE = "ole" + ONE = "one" + ONNX = "onnx" + OOXML = "ooxml" + OTF = "otf" + OUTLOOK = "outlook" + PALMOS = "palmos" + PARQUET = "parquet" + PASCAL = "pascal" + PBM = "pbm" + PCAP = "pcap" + PDB = "pdb" + PDF = "pdf" + PEBIN = "pebin" + PEM = "pem" + PERL = "perl" + PGP = "pgp" + PHP = "php" + PICKLE = "pickle" + PNG = "png" + PO = "po" + POSTSCRIPT = "postscript" + POWERSHELL = "powershell" + PPT = "ppt" + PPTX = "pptx" + PRINTFOX = "printfox" + PROLOG = "prolog" + PROTEINDB = "proteindb" + PROTO = "proto" + PROTOBUF = "protobuf" + PSD = "psd" + PUB = "pub" + PYTHON = "python" + PYTHONBYTECODE = "pythonbytecode" + PYTHONPAR = "pythonpar" + PYTORCH = "pytorch" + QOI = "qoi" + QT = "qt" + R = "r" + RANDOMASCII = "randomascii" + RANDOMBYTES = "randombytes" + RAR = "rar" + RDF = "rdf" + RIFF = "riff" + RLIB = "rlib" + RLL = "rll" + RPM = "rpm" + RST = "rst" + RTF = "rtf" + RUBY = "ruby" + RUST = "rust" + RZIP = "rzip" + SCALA = "scala" + SCHEME = "scheme" + SCR = "scr" + SCRIPTWSF = "scriptwsf" + SCSS = "scss" + SEVENZIP = "sevenzip" + SGML = "sgml" + SH3D = "sh3d" + SHELL = "shell" + SMALI = "smali" + SNAP = "snap" + SO = "so" + SOLIDITY = "solidity" + SQL = "sql" + SQLITE = "sqlite" + SQUASHFS = "squashfs" + SRT = "srt" + STLBINARY = "stlbinary" + STLTEXT = "stltext" + SUM = "sum" + SVD = "svd" + SVG = "svg" + SWF = "swf" + SWIFT = "swift" + SYMLINK = "symlink" + SYMLINKTEXT = "symlinktext" + SYS = "sys" + TAR = "tar" + TCL = "tcl" + TEXTPROTO = "textproto" + TGA = "tga" + THUMBSDB = "thumbsdb" + TIFF = "tiff" + TMDX = "tmdx" + TOML = "toml" + TORRENT = "torrent" + TROFF = "troff" + TSV = "tsv" + TSX = "tsx" + TTF = "ttf" + TWIG = "twig" + TXT = "txt" + TXTASCII = "txtascii" + TXTUTF16 = "txtutf16" + TXTUTF8 = "txtutf8" + TYPESCRIPT = "typescript" + UDF = "udf" + UNDEFINED = "undefined" + UNIXCOMPRESS = "unixcompress" + UNKNOWN = "unknown" + VBA = "vba" + VBE = "vbe" + VCARD = "vcard" + VCS = "vcs" + VCXPROJ = "vcxproj" + VERILOG = "verilog" + VHD = "vhd" + VHDL = "vhdl" + VISIO = "visio" + VTT = "vtt" + VUE = "vue" + WAD = "wad" + WASM = "wasm" + WAV = "wav" + WEBM = "webm" + WEBP = "webp" + WIM = "wim" + WINREGISTRY = "winregistry" + WMA = "wma" + WMF = "wmf" + WMV = "wmv" + WOFF = "woff" + WOFF2 = "woff2" + XAR = "xar" + XCF = "xcf" + XLS = "xls" + XLSB = "xlsb" + XLSX = "xlsx" + XML = "xml" + XPI = "xpi" + XSD = "xsd" + XZ = "xz" + YAML = "yaml" + YARA = "yara" + ZIG = "zig" + ZIP = "zip" + ZLIBSTREAM = "zlibstream" + ZST = "zst" + + def __repr__(self) -> str: + return str(self) diff --git a/python/magika/types/magika_result.py b/python/magika/types/magika_result.py new file mode 100644 index 00000000..34a3a34e --- /dev/null +++ b/python/magika/types/magika_result.py @@ -0,0 +1,26 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from dataclasses import dataclass + +from magika.types.content_type_info import ContentTypeInfo + + +@dataclass(frozen=True) +class MagikaResult: + dl: ContentTypeInfo + output: ContentTypeInfo + score: float diff --git a/python/magika/types.py b/python/magika/types/model.py similarity index 53% rename from python/magika/types.py rename to python/magika/types/model.py index 60c00b05..94fcc9c2 100644 --- a/python/magika/types.py +++ b/python/magika/types/model.py @@ -13,21 +13,14 @@ # limitations under the License. -from __future__ import annotations - from dataclasses import dataclass -from typing import List, Optional +from typing import Dict, List - -@dataclass -class ModelFeatures: - beg: List[int] - mid: List[int] - end: List[int] +from magika.types.content_type_label import ContentTypeLabel -@dataclass -class ModelFeaturesV2: +@dataclass(frozen=True) +class ModelFeatures: beg: List[int] mid: List[int] end: List[int] @@ -39,43 +32,22 @@ class ModelFeaturesV2: offset_0x9800_0x9807: List[int] -@dataclass +@dataclass(frozen=True) class ModelOutput: - ct_label: str - score: float - - -@dataclass -class MagikaResult: - path: str - dl: ModelOutputFields - output: MagikaOutputFields - - -@dataclass -class ModelOutputFields: - ct_label: Optional[str] - score: Optional[float] - group: Optional[str] - mime_type: Optional[str] - magic: Optional[str] - description: Optional[str] - is_text: Optional[bool] - - -@dataclass -class MagikaOutputFields: - ct_label: str + ct_label: ContentTypeLabel score: float - group: str - mime_type: str - magic: str - description: str - is_text: bool -@dataclass -class FeedbackReport: - hash: str - features: ModelFeatures - result: MagikaResult +@dataclass(frozen=True) +class ModelConfig: + beg_size: int + mid_size: int + end_size: int + use_inputs_at_offsets: bool + medium_confidence_threshold: float + min_file_size_for_dl: int + padding_token: int + block_size: int + target_labels_space: List[ContentTypeLabel] + thresholds: Dict[ContentTypeLabel, float] + overwrite_map: Dict[ContentTypeLabel, ContentTypeLabel] diff --git a/python/magika/prediction_mode.py b/python/magika/types/prediction_mode.py similarity index 94% rename from python/magika/prediction_mode.py rename to python/magika/types/prediction_mode.py index 6bd8e53f..fb54eaa7 100644 --- a/python/magika/prediction_mode.py +++ b/python/magika/types/prediction_mode.py @@ -17,7 +17,7 @@ import enum from typing import List -from magika.strenum import LowerCaseStrEnum +from magika.types.strenum import LowerCaseStrEnum class PredictionMode(LowerCaseStrEnum): diff --git a/python/magika/types/status.py b/python/magika/types/status.py new file mode 100644 index 00000000..ee590aa8 --- /dev/null +++ b/python/magika/types/status.py @@ -0,0 +1,29 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from magika.types.strenum import StrEnum + + +class Status(StrEnum): + OK = "ok" + + # Used when a file path does not exist + FILE_NOT_FOUND_ERROR = "file_not_found_error" + + # Used when a file path exists, but there are permission issues, e.g., can't + # read file + PERMISSION_ERROR = "permission_error" + + # Represents a generic error-like unknown status. + UNKNOWN = "unknown" diff --git a/python/magika/types/statusor.py b/python/magika/types/statusor.py new file mode 100644 index 00000000..7f1e1cf1 --- /dev/null +++ b/python/magika/types/statusor.py @@ -0,0 +1,54 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Generic, Optional, TypeVar + +from magika.types.status import Status + +T = TypeVar("T") + + +class StatusOr(Generic[T]): + def __init__(self, *, status: Status = Status.OK, value: Optional[T] = None): + self._status = status + self._value = value + + def __post_init__(self) -> None: + if self._status == Status.OK: + if self._value is None: + raise ValueError("value must be set when status == OK") + else: + if self._value is not None: + raise ValueError("value cannot be set when status != OK") + + @property + def ok(self) -> bool: + return self._status == Status.OK + + @property + def status(self) -> Status: + return self._status + + @property + def value(self) -> T: + if self.ok: + assert self._value is not None + return self._value + raise ValueError("value is not set when status != OK") + + def __repr__(self) -> str: + return str(self) + + def __str__(self) -> str: + return f"StatusOr(status={self.status}, value={self.value})" diff --git a/python/magika/strenum.py b/python/magika/types/strenum.py similarity index 85% rename from python/magika/strenum.py rename to python/magika/types/strenum.py index db219285..de4bcb1a 100644 --- a/python/magika/strenum.py +++ b/python/magika/types/strenum.py @@ -30,20 +30,20 @@ class Example(StrEnum): assert Example.MixedCase == "MixedCase" """ - def __new__(cls, value: Union[str, StrEnum], *args, **kwargs): + def __new__(cls, value: Union[str, StrEnum], *args, **kwargs): # type: ignore[no-untyped-def] if not isinstance(value, (str, enum.auto)): raise TypeError( f"Values of StrEnums must be strings: {value!r} is a {type(value)}" ) return super().__new__(cls, value, *args, **kwargs) - def __str__(self): + def __str__(self) -> str: return str(self.value) - def _generate_next_value_(name, *_): + def _generate_next_value_(name, *_): # type: ignore[no-untyped-def] return name class LowerCaseStrEnum(StrEnum): - def _generate_next_value_(name, *_): + def _generate_next_value_(name, *_): # type: ignore[no-untyped-def] return name.lower().replace("_", "-") diff --git a/python/pyproject.toml b/python/pyproject.toml index b74dadd5..23de4466 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,9 +1,10 @@ [tool.poetry] name = "magika" -version = "0.5.2-dev" -description = "A tool to determine the content type of a file with deep-learning" -authors = ["Yanick Fratantonio "] +version = "0.6.0-dev" +description = "A tool to determine the content type of a file with deep learning" +authors = ["Magika Developers "] readme = "README.md" +license = "Apache License 2.0" packages = [{include = "magika"}] [tool.poetry.dependencies] @@ -18,9 +19,6 @@ numpy = [ tabulate = "^0.9.0" python-dotenv = "^1.0.1" -[tool.poetry.scripts] -magika = "magika.cli.magika:main" - [tool.poetry.group.dev.dependencies] pytest = "^8.0.1" ipython = [ diff --git a/python/tests/test_features_extraction.py b/python/tests/test_features_extraction.py index 85920477..ce80550b 100644 --- a/python/tests/test_features_extraction.py +++ b/python/tests/test_features_extraction.py @@ -18,14 +18,13 @@ import math import random import string -import tempfile from dataclasses import asdict, dataclass from pathlib import Path from typing import List, Tuple from magika import Magika from magika.seekable import Buffer -from magika.types import ModelFeatures, ModelFeaturesV2 +from magika.types import ModelFeatures from tests.utils import get_tests_data_dir random.seed(42) @@ -45,73 +44,23 @@ class TestInfo: __test__ = False -def test_features_extraction(debug: bool = False) -> None: - """This iterates over the content in the test suite and checks whether the - trivial implementation matches the python module one, which is the reference - code.""" - - tests_cases = _get_tests_cases_from_reference() - - for test_case in tests_cases: - test_info = TestInfo(**test_case["test_info"]) - test_content = base64.b64decode(test_case["content"]) - expected_features = ModelFeatures(**test_case["features_v1"]) - - beg_size = test_info.beg_size - mid_size = test_info.mid_size - end_size = test_info.end_size - block_size = test_info.block_size - padding_token = test_info.padding_token - - if debug: - print(f"Test details: {test_info} =>") - - features_from_bytes = Magika._extract_features_from_bytes( - test_content, beg_size, mid_size, end_size, padding_token, block_size - ) - with tempfile.TemporaryDirectory() as td: - tf_path = Path(td) / "test.dat" - tf_path.write_bytes(test_content) - features_from_path = Magika._extract_features_from_path( - tf_path, beg_size, mid_size, end_size, padding_token, block_size - ) - - comparison_by_bytes = {} - comparison_by_bytes["beg"] = features_from_bytes.beg == expected_features.beg - comparison_by_bytes["mid"] = features_from_bytes.mid == expected_features.mid - comparison_by_bytes["end"] = features_from_bytes.end == expected_features.end - comparison_by_bytes["all"] = set(comparison_by_bytes.values()) == set([True]) - - comparison_by_path = {} - comparison_by_path["beg"] = features_from_path.beg == expected_features.beg - comparison_by_path["mid"] = features_from_path.mid == expected_features.mid - comparison_by_path["end"] = features_from_path.end == expected_features.end - comparison_by_path["all"] = set(comparison_by_path.values()) == set([True]) - - if debug: - print("comparison_by_bytes: " + json.dumps(comparison_by_bytes)) - - if not comparison_by_bytes["all"] or not comparison_by_path["all"]: - raise Exception - - def test_features_extraction_v2(debug: bool = False) -> None: tests_cases = _get_tests_cases_from_reference() for test_case in tests_cases: test_info = TestInfo(**test_case["test_info"]) test_content = base64.b64decode(test_case["content"]) - expected_features = ModelFeaturesV2(**test_case["features_v2"]) - - beg_size = test_info.beg_size - mid_size = test_info.mid_size - end_size = test_info.end_size - block_size = test_info.block_size - padding_token = test_info.padding_token + expected_features = ModelFeatures(**test_case["features_v2"]) s = Buffer(test_content) - features = Magika._extract_features_from_seekable_v2( - s, beg_size, mid_size, end_size, padding_token, block_size + features = Magika._extract_features_from_seekable( + s, + beg_size=test_info.beg_size, + mid_size=test_info.mid_size, + end_size=test_info.end_size, + padding_token=test_info.padding_token, + block_size=test_info.block_size, + use_inputs_at_offsets=True, ) with_error = False @@ -197,17 +146,19 @@ def generate_features_extraction_reference(): for test_info, test_content in test_suite: s = Buffer(test_content) - features_v1 = Magika._extract_features_from_seekable( - s, beg_size, mid_size, end_size, padding_token, block_size - ) - features_v2 = Magika._extract_features_from_seekable_v2( - s, beg_size, mid_size, end_size, padding_token, block_size + features_v2 = Magika._extract_features_from_seekable( + s, + beg_size, + mid_size, + end_size, + padding_token, + block_size, + use_inputs_at_offsets=True, ) test_case = { "test_info": asdict(test_info), "content": base64.b64encode(test_content).decode("ascii"), - "features_v1": asdict(features_v1), "features_v2": asdict(features_v2), } ref_features_extraction_tests.append(test_case) @@ -316,5 +267,4 @@ def _get_features_extration_tests_path() -> Path: if __name__ == "__main__": - test_features_extraction(debug=True) test_features_extraction_v2(debug=True) diff --git a/python/tests/test_magika_python_cli.py b/python/tests/test_magika_python_cli.py deleted file mode 100644 index 42cc21e1..00000000 --- a/python/tests/test_magika_python_cli.py +++ /dev/null @@ -1,779 +0,0 @@ -# Copyright 2024 Google LLC -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import signal -import subprocess -import tempfile -from pathlib import Path -from typing import Any - -import pytest - -from magika.content_types import ContentType, ContentTypesManager -from magika.prediction_mode import PredictionMode -from tests import utils -from tests.utils_magika_python_client import MagikaClientError, run_magika_python_cli - - -@pytest.mark.smoketest -def test_magika_python_cli_with_one_test_file() -> None: - test_file_path = utils.get_basic_test_files_paths()[0] - - stdout, stderr = run_magika_python_cli([test_file_path]) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr - ) - - stdout, stderr = run_magika_python_cli( - [test_file_path], extra_cli_options=["--json"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr, json_output=True - ) - - stdout, stderr = run_magika_python_cli( - [test_file_path], extra_cli_options=["--jsonl"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr, jsonl_output=True - ) - - stdout, stderr = run_magika_python_cli( - [test_file_path], extra_cli_options=["--output-score"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr, output_score=True - ) - - stdout, stderr = run_magika_python_cli( - [test_file_path], extra_cli_options=["--mime-type"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr, mime_output=True - ) - - stdout, stderr = run_magika_python_cli( - [test_file_path], extra_cli_options=["--label"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr, label_output=True - ) - - stdout, stderr = run_magika_python_cli( - [test_file_path], extra_cli_options=["--compatibility-mode"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - [test_file_path], stdout, stderr, compatibility_mode=True - ) - - -def test_magika_python_cli_with_very_small_test_files() -> None: - """Magika does not use the DL model for very small files. This test covers - these scenarios. - """ - - with tempfile.TemporaryDirectory() as td: - text_test_path = Path(td) / "small.txt" - text_test_path.write_text("small test") - stdout, stderr = run_magika_python_cli([text_test_path], label_output=True) - assert ( - utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)[0][1] - == ContentType.GENERIC_TEXT - ) - - binary_test_path = Path(td) / "small.dat" - binary_test_path.write_bytes(b"\x80\xff") - stdout, stderr = run_magika_python_cli([binary_test_path], label_output=True) - assert ( - utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)[0][1] - == ContentType.UNKNOWN - ) - - -def test_magika_cli_with_small_test_files() -> None: - """Magika needs to pad files that are small. This test covers scenarios - where padding is relevant. - """ - - with tempfile.TemporaryDirectory() as td: - text_test_path = Path(td) / "small.txt" - # small, but bigger than the threshold to use the DL model - text_test_path.write_text("A" * 32) - _ = run_magika_python_cli([text_test_path], label_output=True) - # we do not care about the prediction - - -def test_magika_cli_with_empty_file() -> None: - with tempfile.TemporaryDirectory() as td: - empty_test_path = Path(td) / "empty.dat" - empty_test_path.touch() - stdout, stderr = run_magika_python_cli([empty_test_path], label_output=True) - assert ( - utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)[0][1] - == ContentType.EMPTY - ) - - -def test_magika_cli_with_directories() -> None: - with tempfile.TemporaryDirectory() as td: - test_files_num = 3 - for idx in range(test_files_num): - p = Path(td) / f"test-{idx}.txt" - p.write_text("test") - - # run without recursive mode - stdout, stderr = run_magika_python_cli([Path(td)], label_output=True) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 1 - assert predicted_cts[0][1] == "directory" - - # run with recursive mode - stdout, stderr = run_magika_python_cli( - [Path(td)], label_output=True, extra_cli_options=["--recursive"] - ) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == test_files_num - for _, ct in predicted_cts: - assert ct == ContentType.GENERIC_TEXT - - -def test_magika_cli_with_symlinks() -> None: - with tempfile.TemporaryDirectory() as td: - test_path = Path(td) / "test.txt" - test_path.write_text("test") - - symlink_path = Path(td) / "symlink-test.txt" - symlink_path.symlink_to(test_path) - - # run without --no-dereference mode; symlinks are dereferenced - stdout, stderr = run_magika_python_cli([symlink_path], label_output=True) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 1 - assert predicted_cts[0][1] == ContentType.GENERIC_TEXT - - # run with --no-dereference, to avoid dereferencing symlinks - stdout, stderr = run_magika_python_cli( - [symlink_path], label_output=True, extra_cli_options=["--no-dereference"] - ) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 1 - assert predicted_cts[0][1] == "symlink" - - # run with --no-dereference, to avoid dereferencing symlinks - stdout, stderr = run_magika_python_cli( - [symlink_path], extra_cli_options=["--no-dereference"] - ) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 1 - assert isinstance(predicted_cts[0][0], Path) - assert isinstance(predicted_cts[0][1], str) - assert predicted_cts[0][1].startswith("Symbolic link") - assert predicted_cts[0][1].find(str(test_path)) >= 0 - - -def test_magika_cli_with_files_with_permission_errors() -> None: - with tempfile.TemporaryDirectory() as td: - unreadable_test_path = Path(td) / "test1.txt" - unreadable_test_path.write_text("test") - - # make it unreadable - unreadable_test_path.chmod(0o000) - - stdout, stderr = run_magika_python_cli( - [unreadable_test_path], label_output=True - ) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 1 - assert predicted_cts[0][1] == ContentType.PERMISSION_ERROR - - # add another, readable file, and check that it is scanned properly - readable_test_path = Path(td) / "test2.txt" - readable_test_path.write_text("test") - stdout, stderr = run_magika_python_cli( - [unreadable_test_path, readable_test_path], label_output=True - ) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 2 - assert predicted_cts[0][1] == ContentType.PERMISSION_ERROR - assert predicted_cts[1][1] == ContentType.GENERIC_TEXT - - # try the same, but passing the directory as input - stdout, stderr = run_magika_python_cli( - [Path(td)], label_output=True, extra_cli_options=["--recursive"] - ) - predicted_cts = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr) - assert len(predicted_cts) == 2 - assert predicted_cts[0][1] == ContentType.PERMISSION_ERROR - assert predicted_cts[1][1] == ContentType.GENERIC_TEXT - - -def test_magika_cli_with_basic_test_files() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, 5, 10, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n]) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - -def test_magika_cli_with_mitra_test_files() -> None: - test_files_paths = utils.get_mitra_test_files_paths() - - stdout, stderr = run_magika_python_cli(test_files_paths) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths, stdout, stderr - ) - - -def test_magika_cli_with_basic_test_files_and_json_output() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n], json_output=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, json_output=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--json"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, json_output=True - ) - - -def test_magika_cli_with_basic_test_files_and_jsonl_output() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n], jsonl_output=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, jsonl_output=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--jsonl"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, jsonl_output=True - ) - - -def test_magika_cli_with_basic_test_files_and_probability() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n], output_score=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, output_score=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["-s"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, output_score=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--output-score"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, output_score=True - ) - - -def test_magika_cli_with_basic_test_files_and_mime_output() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n], mime_output=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, mime_output=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["-i"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, mime_output=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--mime-type"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, mime_output=True - ) - - -def test_magika_cli_with_basic_test_files_and_label_output() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n], label_output=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, label_output=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["-l"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, label_output=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--label"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, label_output=True - ) - - -def test_magika_cli_with_basic_test_files_and_compatibility_mode() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], compatibility_mode=True - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, compatibility_mode=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["-c"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, compatibility_mode=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--compatibility-mode"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, compatibility_mode=True - ) - - -def test_magika_cli_output_with_low_confidence_prediction() -> None: - # This is something that looks like MarkDown, such that the model's best - # guess will be MarkDown, but not high enought to be trusted. Here we check - # that what we print is reasonable. - low_confidence_test_content = "# This is a very simple text" - # This is a short textual string, which will not even hit the DL model. - high_confidence_test_content = "Test" - - ctm = ContentTypesManager() - txt_ct = ctm.get_or_raise(ContentType.GENERIC_TEXT) - txt_description = txt_ct.description - txt_group = txt_ct.group - md_ct = ctm.get_or_raise("markdown") - md_description = md_ct.description - md_group = md_ct.group - - with tempfile.TemporaryDirectory() as td: - # test the low confidence prediction - low_confidence_tf_path = Path(td) / "low_confidence_test.txt" - low_confidence_tf_path.write_text(low_confidence_test_content) - stdout, stderr = run_magika_python_cli( - [low_confidence_tf_path], - ) - - low_confidence_expected_stdout_prefix = f"{str(low_confidence_tf_path)}: {txt_description} ({txt_group}) [Low-confidence model best-guess: {md_description} ({md_group}), score=" - - assert stdout.startswith(low_confidence_expected_stdout_prefix) - assert stderr == "" - - # test the high confidence prediction - high_confidence_tf_path = Path(td) / "high_confidence_test.txt" - high_confidence_tf_path.write_text(high_confidence_test_content) - stdout, stderr = run_magika_python_cli( - [high_confidence_tf_path], - ) - - high_confidence_expected_stdout = ( - f"{str(high_confidence_tf_path)}: {txt_description} ({txt_group})" - ) - - assert stdout.strip() == high_confidence_expected_stdout - assert stderr == "" - - -def test_magika_cli_with_basic_test_files_and_different_prediction_modes() -> None: - # Here we test only the CLI aspect; we test the different behaviors with - # different prediction modes when we test the Magika module. - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n]) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], - extra_cli_options=["--prediction-mode", PredictionMode.MEDIUM_CONFIDENCE], - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], - extra_cli_options=["--prediction-mode", PredictionMode.BEST_GUESS], - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], - extra_cli_options=["--prediction-mode", PredictionMode.HIGH_CONFIDENCE], - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - # Test with invalid prediction mode - with pytest.raises(MagikaClientError): - _ = run_magika_python_cli( - test_files_paths[:n], - extra_cli_options=["--prediction-mode", "non-existing-mode"], - ) - - -def test_magika_cli_with_python_and_not_python_files() -> None: - with tempfile.TemporaryDirectory() as td: - # the test needs to be longer than "too small for DL model" - python_test_path = Path(td) / "real.py" - python_test_path.write_text("import flask\nimport requests") - not_python_test_path = Path(td) / "not-real.py" - not_python_test_path.write_text("xmport asd\nxmport requests") - - # check that a python file is detected as such - stdout, stderr = run_magika_python_cli( - [python_test_path], extra_cli_options=["--label"] - ) - predicted_ct = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)[ - 0 - ][1] - assert predicted_ct == "python" - - # check that a file that is very far from being a python file is - # detected as text - stdout, stderr = run_magika_python_cli( - [not_python_test_path], extra_cli_options=["--label"] - ) - predicted_ct = utils.get_magika_cli_output_from_stdout_stderr(stdout, stderr)[ - 0 - ][1] - assert predicted_ct == "txt" - - -def test_magika_cli_with_basic_test_files_and_custom_batch_sizes() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for batch_size in [1, 2, 3, 16]: - for n in [1, 2, 5, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], batch_size=batch_size - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], - extra_cli_options=["--batch-size", str(batch_size)], - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - -def test_magika_cli_with_multiple_copies_of_the_same_file() -> None: - max_repetitions_num = 10 - test_file_path = utils.get_one_basic_test_file_path() - test_files_paths = [test_file_path] * max_repetitions_num - - for n in [2, max_repetitions_num]: - stdout, stderr = run_magika_python_cli(test_files_paths[:n]) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr - ) - - stdout, stderr = run_magika_python_cli(test_files_paths[:n], json_output=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, json_output=True - ) - - stdout, stderr = run_magika_python_cli(test_files_paths[:n], jsonl_output=True) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, jsonl_output=True - ) - - -def test_magika_cli_with_many_files() -> None: - test_file_path = utils.get_one_basic_test_file_path() - - for n in [100, 1000]: - test_files_paths = [test_file_path] * n - stdout, stderr = run_magika_python_cli(test_files_paths) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths, stdout, stderr - ) - - -@pytest.mark.slow -def test_magika_cli_with_really_many_files() -> None: - test_file_path = utils.get_one_basic_test_file_path() - - for n in [10000]: - test_files_paths = [test_file_path] * n - stdout, stderr = run_magika_python_cli(test_files_paths) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths, stdout, stderr - ) - - -@pytest.mark.slow -def test_magika_cli_with_big_file() -> None: - def signal_handler(signum: int, frame: Any) -> None: - raise Exception("Timeout") - - signal.signal(signal.SIGALRM, signal_handler) - - # It should take much less than this, but pytest weird scheduling sometimes - # creates unexpected slow downs. - timeout = 2 - - for sample_size in [1000, 10000, 1_000_000, 1_000_000_000, 10_000_000_000]: - with tempfile.TemporaryDirectory() as td: - sample_path = Path(td) / "sample.dat" - utils.write_random_file_with_size(sample_path, sample_size) - print(f"Starting running Magika with a timeout of {timeout}") - signal.alarm(timeout) - _ = run_magika_python_cli([sample_path]) - signal.alarm(0) - print("Done running Magika") - - -def test_magika_cli_with_bad_input() -> None: - test_file_path = utils.get_one_basic_test_file_path() - - # Test without any argument or option - with pytest.raises(MagikaClientError) as e_info: - p = Path("/this/does/not/exist") - _ = run_magika_python_cli([]) - assert e_info.value.stdout == "" - assert ( - e_info.value.stderr - == "ERROR: You need to pass at least one path, or - to read from stdin.\n" - ) - - # Test with file that does not exist - with pytest.raises(MagikaClientError) as e_info: - p = Path("/this/does/not/exist") - _ = run_magika_python_cli([p], label_output=True) - assert e_info.value.stdout == "" - assert ( - e_info.value.stderr == f'ERROR: File or directory "{str(p)}" does not exist.\n' - ) - - # Test with incompatible list of options - with pytest.raises(MagikaClientError) as e_info: - _ = run_magika_python_cli([test_file_path], json_output=True, jsonl_output=True) - assert e_info.value.stdout == "" - assert ( - e_info.value.stderr - == "ERROR: You should use either --json or --jsonl, not both.\n" - ) - - # Test with an option does not exist - with pytest.raises(MagikaClientError) as e_info: - _ = run_magika_python_cli( - [test_file_path], extra_cli_options=["--non-existing-option"] - ) - assert e_info.value.stdout == "" - error_lines = e_info.value.stderr.split("\n") - assert error_lines[0].startswith("Usage: magika [OPTIONS] [FILE]...") - assert error_lines[-2].startswith("Error: No such option:") - assert error_lines[-1] == "" - - -def test_magika_cli_with_reading_from_stdin() -> None: - ctm = ContentTypesManager() - test_file_path = utils.get_one_basic_test_file_path() - - cmd = f"cat {str(test_file_path)} | magika - --jsonl" - p = subprocess.run(cmd, capture_output=True, text=True, check=True, shell=True) - stdout, stderr = p.stdout, p.stderr - - entries = utils.get_magika_cli_output_from_stdout_stderr( - stdout, stderr, jsonl_output=True - ) - sample_path, entry = entries[0] - assert isinstance(sample_path, Path) - assert isinstance(entry, dict) - - file_ext = test_file_path.suffix.lstrip(".") - true_cts = ctm.get_cts_by_ext(file_ext) - true_cts_names = [ct.name for ct in true_cts] - - assert str(sample_path) == "-" - assert str(entry["path"]) == "-" - assert entry["output"]["ct_label"] in true_cts_names - - # test with some bad input - cmd = f"cat {str(test_file_path)} | magika - {str(test_file_path)}" - p = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=True) - assert p.returncode == 1 - assert p.stdout == "" - assert p.stderr.find('ERROR: If you pass "-", you cannot pass anything else.') >= 0 - - cmd = f"cat {str(test_file_path)} | magika - -r" - p = subprocess.run(cmd, capture_output=True, text=True, check=False, shell=True) - assert p.returncode == 1 - assert p.stdout == "" - assert ( - p.stderr.find('ERROR: If you pass "-", recursive scan is not meaningful.') >= 0 - ) - - -def test_magika_cli_with_colors() -> None: - test_file_path = utils.get_one_basic_test_file_path() - - # check that it does not crash when using colors and that we are actually - # using colors - stdout, stderr = run_magika_python_cli([test_file_path], with_colors=True) - assert stdout.find("\033") >= 0 or stderr.find("\033") >= 0 - stdout, stderr = run_magika_python_cli( - [test_file_path], with_colors=True, mime_output=True - ) - assert stdout.find("\033") >= 0 or stderr.find("\033") >= 0 - stdout, stderr = run_magika_python_cli( - [test_file_path], with_colors=True, verbose=True, debug=True - ) - assert stdout.find("\033") >= 0 or stderr.find("\033") >= 0 - - -def test_magika_cli_with_no_colors() -> None: - test_file_path = utils.get_one_basic_test_file_path() - - # check that we are not using colors when --no-colors is passed - stdout, stderr = run_magika_python_cli([test_file_path], with_colors=False) - assert stdout.find("\033") == -1 and stderr.find("\033") == -1 - stdout, stderr = run_magika_python_cli( - [test_file_path], with_colors=False, mime_output=True - ) - assert stdout.find("\033") == -1 and stderr.find("\033") == -1 - stdout, stderr = run_magika_python_cli( - [test_file_path], with_colors=False, verbose=True, debug=True - ) - assert stdout.find("\033") == -1 and stderr.find("\033") == -1 - - -def test_magika_cli_generate_report() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - for n in [1, 2, len(test_files_paths)]: - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], generate_report=True - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, generate_report=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--generate-report"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, generate_report=True - ) - - stdout, stderr = run_magika_python_cli( - test_files_paths[:n], extra_cli_options=["--mime-type", "--generate-report"] - ) - utils.check_magika_cli_output_matches_expected_by_ext( - test_files_paths[:n], stdout, stderr, mime_output=True, generate_report=True - ) - - -def test_magika_cli_output_version() -> None: - stdout, stderr = run_magika_python_cli([], extra_cli_options=["--version"]) - - lines = utils.get_lines_from_stream(stdout) - assert len(lines) == 2 - assert lines[0].startswith("Magika version") - assert lines[1].startswith("Default model") - - assert stderr == "" - - -def test_magika_cli_help() -> None: - stdout_short, stderr_short = run_magika_python_cli([], extra_cli_options=["-h"]) - stdout_long, stderr_long = run_magika_python_cli([], extra_cli_options=["--help"]) - - for stdout, stderr in zip([stdout_short, stdout_long], [stderr_short, stderr_long]): - assert stdout.find("Magika version") >= 0 - assert stdout.find("Default model") >= 0 - - assert stderr == "" - - -def test_magika_cli_list_content_types() -> None: - test_file_path = utils.get_one_basic_test_file_path() - - stdout, stderr = run_magika_python_cli([], list_output_content_types=True) - - lines = utils.get_lines_from_stream(stdout) - header = lines[0] - assert header.find("Content Type Label") >= 0 - assert header.find("Description") >= 0 - assert stderr == "" - - with pytest.raises(MagikaClientError): - _ = run_magika_python_cli([test_file_path], list_output_content_types=True) - - -def test_magika_cli_performance_statistics_report() -> None: - test_files_paths = utils.get_basic_test_files_paths() - - _, stderr = run_magika_python_cli([test_files_paths[0]]) - assert stderr == "" - - _, stderr = run_magika_python_cli(test_files_paths) - assert stderr == "" - - _, stderr = run_magika_python_cli( - test_files_paths[:10], - batch_size=10, - extra_cli_options=["--dump-performance-stats"], - ) - stderr_lines = stderr.split("\n") - assert stderr_lines[0].startswith("PERFORMANCE STATISTICS REPORT") - assert stderr_lines[1].startswith("Not enough data") - - _, stderr = run_magika_python_cli( - test_files_paths[:10], - batch_size=1, - extra_cli_options=["--dump-performance-stats"], - ) - stderr_lines = stderr.split("\n") - assert stderr_lines[0].startswith("PERFORMANCE STATISTICS REPORT") - assert stderr_lines[1].startswith("KEY") - assert stderr_lines[2].startswith("mean") diff --git a/python/tests/test_magika_python_module.py b/python/tests/test_magika_python_module.py index 0524af06..c61f6a78 100644 --- a/python/tests/test_magika_python_module.py +++ b/python/tests/test_magika_python_module.py @@ -12,13 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. +import signal import tempfile from pathlib import Path +from typing import Any import pytest from magika import Magika, PredictionMode -from magika.content_types import ContentType, ContentTypesManager +from magika.types import ContentTypeLabel, Status from tests import utils @@ -48,17 +50,15 @@ def test_magika_module_with_basic_tests_by_paths() -> None: tests_paths = utils.get_basic_test_files_paths() m = Magika(model_dir=model_dir) - ctm = ContentTypesManager() results = m.identify_paths(tests_paths) for test_path, result in zip(tests_paths, results): - file_ext = test_path.suffix.lstrip(".") - true_cts = ctm.get_cts_by_ext(file_ext) - assert len(true_cts) > 0 - true_cts_labels = [ct.name for ct in true_cts] - assert result.path == str(test_path) - assert result.output.ct_label in true_cts_labels + assert result.ok + expected_ct_label = get_expected_content_type_label_from_test_file_path( + test_path + ) + assert result.value.output.label == expected_ct_label def test_magika_module_with_basic_tests_by_path() -> None: @@ -66,16 +66,14 @@ def test_magika_module_with_basic_tests_by_path() -> None: tests_paths = utils.get_basic_test_files_paths() m = Magika(model_dir=model_dir) - ctm = ContentTypesManager() for test_path in tests_paths: result = m.identify_path(test_path) - file_ext = test_path.suffix.lstrip(".") - true_cts = ctm.get_cts_by_ext(file_ext) - assert len(true_cts) > 0 - true_cts_labels = [ct.name for ct in true_cts] - assert result.path == str(test_path) - assert result.output.ct_label in true_cts_labels + assert result.ok + expected_ct_label = get_expected_content_type_label_from_test_file_path( + test_path + ) + assert result.value.output.label == expected_ct_label def test_magika_module_with_basic_tests_by_bytes() -> None: @@ -83,17 +81,32 @@ def test_magika_module_with_basic_tests_by_bytes() -> None: tests_paths = utils.get_basic_test_files_paths() m = Magika(model_dir=model_dir) - ctm = ContentTypesManager() for test_path in tests_paths: content = test_path.read_bytes() result = m.identify_bytes(content) - file_ext = test_path.suffix.lstrip(".") - true_cts = ctm.get_cts_by_ext(file_ext) - assert len(true_cts) > 0 - true_cts_labels = [ct.name for ct in true_cts] - assert result.path == "-" - assert result.output.ct_label in true_cts_labels + assert result.ok + expected_ct_label = get_expected_content_type_label_from_test_file_path( + test_path + ) + assert result.value.output.label == expected_ct_label + + +def test_magika_module_with_mitra_tests_by_paths() -> None: + model_dir = utils.get_default_model_dir() + tests_paths = utils.get_mitra_test_files_paths() + + m = Magika(model_dir=model_dir) + + results = m.identify_paths(tests_paths) + + for test_path, result in zip(tests_paths, results): + print(f"Test: {test_path}") + assert result.ok + expected_ct_label = get_expected_content_type_label_from_test_file_path( + test_path + ) + assert result.value.output.label == expected_ct_label def test_magika_module_with_empty_content() -> None: @@ -102,18 +115,19 @@ def test_magika_module_with_empty_content() -> None: empty_content = b"" res = m.identify_bytes(empty_content) - assert res.path == "-" - assert res.dl.ct_label is None - assert res.output.ct_label == ContentType.EMPTY - assert res.output.score == 1.0 + assert res.ok + assert res.value.dl.label == ContentTypeLabel.UNDEFINED + assert res.value.output.label == ContentTypeLabel.EMPTY + assert res.value.score == 1.0 with tempfile.TemporaryDirectory() as td: tf_path = Path(td) / "empty.dat" tf_path.write_bytes(empty_content) res = m.identify_path(tf_path) - assert res.path == str(tf_path) - assert res.dl.ct_label is None - assert res.output.score == 1.0 + assert res.ok + assert res.value.dl.label == ContentTypeLabel.UNDEFINED + assert res.value.output.label == ContentTypeLabel.EMPTY + assert res.value.score == 1.0 def test_magika_module_with_short_content() -> None: @@ -122,68 +136,268 @@ def test_magika_module_with_short_content() -> None: text_content = b"asd" binary_content = b"\x80\x80\x80" - res = m.identify_bytes(text_content) - assert res.path == "-" - assert res.dl.ct_label is None - assert res.output.ct_label == ContentType.GENERIC_TEXT - assert res.output.score == 1.0 - - res = m.identify_bytes(binary_content) - assert res.path == "-" - assert res.dl.ct_label is None - assert res.output.ct_label == ContentType.UNKNOWN - assert res.output.score == 1.0 - for content, expected_ct_label in zip( - [text_content, binary_content], [ContentType.GENERIC_TEXT, ContentType.UNKNOWN] + [text_content, binary_content], + [ContentTypeLabel.TXT, ContentTypeLabel.UNKNOWN], ): with tempfile.TemporaryDirectory() as td: + # prediction via path tf_path = Path(td) / "file.txt" tf_path.write_bytes(content) res = m.identify_path(tf_path) - assert res.path == str(tf_path) - assert res.dl.ct_label is None - assert res.output.ct_label == expected_ct_label - assert res.output.score == 1.0 + assert res.ok + assert res.value.dl.label == ContentTypeLabel.UNDEFINED + assert res.value.output.label == expected_ct_label + assert res.value.score == 1.0 + + # prediction via content + res = m.identify_bytes(content) + assert res.ok + assert res.value.dl.label == ContentTypeLabel.UNDEFINED + assert res.value.output.label == expected_ct_label + assert res.value.score == 1.0 + + +def test_magika_module_with_python_and_non_python_content() -> None: + python_content = ( + b"import flask\nimport requests\n\ndef foo(a):\n print(f'Test {a}')\n" + ) + non_python_content = b"xmport asd\nxmport requests" + + m = Magika() + + res = m.identify_bytes(python_content) + assert res.ok + assert res.value.output.label == ContentTypeLabel.PYTHON + + res = m.identify_bytes(non_python_content) + assert res.ok + assert res.value.output.label == ContentTypeLabel.TXT def test_magika_module_with_different_prediction_modes() -> None: model_dir = utils.get_default_model_dir() m = Magika(model_dir=model_dir, prediction_mode=PredictionMode.BEST_GUESS) - assert m._get_output_ct_label_from_dl_result("python", 0.01) == "python" - assert m._get_output_ct_label_from_dl_result("python", 0.40) == "python" - assert m._get_output_ct_label_from_dl_result("python", 0.60) == "python" - assert m._get_output_ct_label_from_dl_result("python", 0.99) == "python" + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.01) + == ContentTypeLabel.PYTHON + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.40) + == ContentTypeLabel.PYTHON + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.60) + == ContentTypeLabel.PYTHON + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.99) + == ContentTypeLabel.PYTHON + ) - # test that the default is HIGH_CONFIDENCE - m = Magika(model_dir=model_dir) - assert m._get_output_ct_label_from_dl_result("python", 0.01) == "txt" + m = Magika(model_dir=model_dir, prediction_mode=PredictionMode.MEDIUM_CONFIDENCE) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.01) + == ContentTypeLabel.TXT + ) + assert ( + m._get_output_ct_label_from_dl_result( + ContentTypeLabel.PYTHON, m._model_config.medium_confidence_threshold - 0.01 + ) + == ContentTypeLabel.TXT + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.60) + == ContentTypeLabel.PYTHON + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.99) + == ContentTypeLabel.PYTHON + ) + + m = Magika(model_dir=model_dir, prediction_mode=PredictionMode.HIGH_CONFIDENCE) + high_confidence_threshold = m._model_config.thresholds.get( + ContentTypeLabel.PYTHON, m._model_config.medium_confidence_threshold + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.01) + == ContentTypeLabel.TXT + ) assert ( m._get_output_ct_label_from_dl_result( - "python", m._medium_confidence_threshold - 0.01 + ContentTypeLabel.PYTHON, high_confidence_threshold - 0.01 ) - == "txt" + == ContentTypeLabel.TXT ) assert ( m._get_output_ct_label_from_dl_result( - "python", m._medium_confidence_threshold + 0.01 + ContentTypeLabel.PYTHON, high_confidence_threshold + 0.01 ) - == "txt" + == ContentTypeLabel.PYTHON + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.99) + == ContentTypeLabel.PYTHON ) - assert m._get_output_ct_label_from_dl_result("python", 0.99) == "python" - m = Magika(model_dir=model_dir, prediction_mode=PredictionMode.MEDIUM_CONFIDENCE) - assert m._get_output_ct_label_from_dl_result("python", 0.01) == "txt" + # test that the default is HIGH_CONFIDENCE + m = Magika(model_dir=model_dir) + high_confidence_threshold = m._model_config.thresholds.get( + ContentTypeLabel.PYTHON, m._model_config.medium_confidence_threshold + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.01) + == ContentTypeLabel.TXT + ) assert ( m._get_output_ct_label_from_dl_result( - "python", m._medium_confidence_threshold - 0.01 + ContentTypeLabel.PYTHON, high_confidence_threshold - 0.01 ) - == "txt" + == ContentTypeLabel.TXT + ) + assert ( + m._get_output_ct_label_from_dl_result( + ContentTypeLabel.PYTHON, high_confidence_threshold + 0.01 + ) + == ContentTypeLabel.PYTHON + ) + assert ( + m._get_output_ct_label_from_dl_result(ContentTypeLabel.PYTHON, 0.99) + == ContentTypeLabel.PYTHON ) - assert m._get_output_ct_label_from_dl_result("python", 0.60) == "python" - assert m._get_output_ct_label_from_dl_result("python", 0.99) == "python" - m = Magika(model_dir=model_dir, prediction_mode=PredictionMode.HIGH_CONFIDENCE) - assert m._get_output_ct_label_from_dl_result("python", 0.01) == "txt" - assert m._get_output_ct_label_from_dl_result("python", 0.60) == "txt" - assert m._get_output_ct_label_from_dl_result("python", 0.99) == "python" + +def test_magika_module_with_directory() -> None: + m = Magika() + + with tempfile.TemporaryDirectory() as td: + td_path = Path(td) + res = m.identify_path(td_path) + assert res.ok + assert res.value.dl.label == ContentTypeLabel.UNDEFINED + assert res.value.output.label == ContentTypeLabel.DIRECTORY + assert res.value.score == 1.0 + + +def test_magika_module_multiple_copies_of_the_same_file() -> None: + with tempfile.TemporaryDirectory() as td: + test_path = Path(td) / "test.txt" + test_path.write_text("test") + + test_paths = [test_path] * 3 + + m = Magika() + results = m.identify_paths(test_paths) + assert len(results) == len(test_paths) + for result in results: + assert result.ok + assert result.value.output.label == ContentTypeLabel.TXT + + +def test_magika_cli_with_many_files() -> None: + test_file_path = utils.get_one_basic_test_file_path() + + m = Magika() + + for n in [10, 100]: + test_files_paths = [test_file_path] * n + results = m.identify_paths(test_files_paths) + for result in results: + assert result.ok + # TODO: check that the result is actually correct + + +def test_magika_module_with_symlink() -> None: + with tempfile.TemporaryDirectory() as td: + test_path = Path(td) / "test.txt" + test_path.write_text("test") + + symlink_path = Path(td) / "symlink-test.txt" + symlink_path.symlink_to(test_path) + + m = Magika() + res = m.identify_path(test_path) + assert res.ok + assert res.value.output.label == ContentTypeLabel.TXT + res = m.identify_path(symlink_path) + assert res.ok + assert res.value.output.label == ContentTypeLabel.TXT + + m = Magika(no_dereference=True) + res = m.identify_path(test_path) + assert res.ok + assert res.value.output.label == ContentTypeLabel.TXT + res = m.identify_path(symlink_path) + assert res.ok + assert res.value.output.label == ContentTypeLabel.SYMLINK + + +def test_magika_module_with_non_existing_file() -> None: + m = Magika() + + with tempfile.TemporaryDirectory() as td: + non_existing_path = Path(td) / "non_existing.txt" + + res = m.identify_path(non_existing_path) + assert not res.ok + assert res.status == Status.FILE_NOT_FOUND_ERROR + + +def test_magika_module_with_permission_error() -> None: + m = Magika() + + with tempfile.TemporaryDirectory() as td: + unreadable_test_path = Path(td) / "test.txt" + unreadable_test_path.write_text("text") + + unreadable_test_path.chmod(0o000) + + res = m.identify_path(unreadable_test_path) + assert not res.ok + assert res.status == Status.PERMISSION_ERROR + + +@pytest.mark.skip +def test_magika_module_with_really_many_files() -> None: + test_file_path = utils.get_one_basic_test_file_path() + + m = Magika() + + for n in [10000]: + test_files_paths = [test_file_path] * n + + results = m.identify_paths(test_files_paths) + for result in results: + assert result.ok + # TODO: add more checks + + +@pytest.mark.slow +def test_magika_module_with_big_file() -> None: + def signal_handler(signum: int, frame: Any) -> None: + raise Exception("Timeout") + + signal.signal(signal.SIGALRM, signal_handler) + + # It should take much less than this, but pytest weird scheduling sometimes + # creates unexpected slow downs. + timeout = 2 + + m = Magika() + + for sample_size in [1000, 10000, 1_000_000, 1_000_000_000, 10_000_000_000]: + with tempfile.TemporaryDirectory() as td: + sample_path = Path(td) / "sample.dat" + utils.write_random_file_with_size(sample_path, sample_size) + print(f"Starting running Magika with a timeout of {timeout}") + signal.alarm(timeout) + res = m.identify_path(sample_path) + assert res.ok + signal.alarm(0) + print("Done running Magika") + + +def get_expected_content_type_label_from_test_file_path( + test_path: Path, +) -> ContentTypeLabel: + return ContentTypeLabel(test_path.parent.name) diff --git a/python/tests/utils.py b/python/tests/utils.py index a249c58e..611474e0 100644 --- a/python/tests/utils.py +++ b/python/tests/utils.py @@ -12,19 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import random import string from pathlib import Path -from typing import Any, Dict, List, Tuple, Union - -from magika.content_types import ContentTypesManager -from magika.types import ( - MagikaOutputFields, - MagikaResult, - ModelFeatures, - ModelOutputFields, -) +from typing import List def get_tests_data_dir() -> Path: @@ -51,13 +42,13 @@ def get_mitra_tests_files_dir() -> Path: def get_basic_test_files_paths() -> List[Path]: tests_files_dir = get_basic_tests_files_dir() - test_files_paths = sorted(filter(lambda p: p.is_file(), tests_files_dir.iterdir())) + test_files_paths = sorted(filter(lambda p: p.is_file(), tests_files_dir.rglob("*"))) return test_files_paths def get_mitra_test_files_paths() -> List[Path]: tests_files_dir = get_mitra_tests_files_dir() - test_files_paths = sorted(filter(lambda p: p.is_file(), tests_files_dir.iterdir())) + test_files_paths = sorted(filter(lambda p: p.is_file(), tests_files_dir.rglob("*"))) return test_files_paths @@ -110,155 +101,3 @@ def get_default_model_dir() -> Path: / Magika.get_default_model_name() ) return model_dir - - -def check_magika_cli_output_matches_expected_by_ext( - samples_paths: List[Path], stdout: str, stderr: str, **kwargs: Any -) -> None: - assert len(samples_paths) > 0 - json_output = kwargs.get("json_output", False) - jsonl_output = kwargs.get("jsonl_output", False) - mime_output = kwargs.get("mime_output", False) - label_output = kwargs.get("label_output", False) - compatibility_mode = kwargs.get("compatibility_mode", False) - cpp_output = kwargs.get("cpp_output", False) - ctm = ContentTypesManager() - predicted_cts = get_magika_cli_output_from_stdout_stderr(stdout, stderr, **kwargs) - assert len(predicted_cts) > 0 - assert len(samples_paths) == len(predicted_cts) - remaining_samples_paths = samples_paths[:] - for file_path, output in predicted_cts: - remaining_samples_paths.remove(file_path) - file_ext = file_path.suffix.lstrip(".") - if file_ext != "": - true_cts = ctm.get_cts_by_ext(file_ext) - else: - # The test file does not have any extension. In this case, we assume - # this is a test file path with the // - # pattern. - true_ct_name = file_path.parent.name - true_cts = [ctm.get_or_raise(true_ct_name)] - assert len(true_cts) > 0, f'File extension: "{file_ext}"' - - true_cts_names = [ct.name for ct in true_cts] - - if json_output or jsonl_output: - # check that each JSON entry satisfies the requirements - assert isinstance(output, dict) - dict_output: Dict[str, Any] = output - assert dict_output["output"]["ct_label"] in true_cts_names - elif cpp_output: - assert isinstance(output, str) - assert output.lower() in true_cts_names - else: - assert isinstance(output, str) - expected_outputs = [] - if mime_output: - expected_outputs = [ctm.get_mime_type(ct.name) for ct in true_cts] - elif label_output: - expected_outputs = true_cts_names - elif compatibility_mode: - expected_outputs = [ctm.get_magic(ct.name) for ct in true_cts] - else: - expected_outputs = [ - f"{ctm.get_description(ct.name)} ({ctm.get_group(ct.name)})" - for ct in true_cts - ] - assert ( - output in expected_outputs - ), f'Output: "{output}", expected output: "{expected_outputs}"' - - # Check that all input samples have been scanned - assert len(remaining_samples_paths) == 0 - - -def get_magika_cli_output_from_stdout_stderr( - stdout: str, stderr: str, **kwargs: Any -) -> List[Tuple[Path, Union[Dict[str, Any], str]]]: - json_output = kwargs.get("json_output", False) - jsonl_output = kwargs.get("jsonl_output", False) - output_score = kwargs.get("output_score", False) - generate_report = kwargs.get("generate_report", False) - cpp_output = kwargs.get("cpp_output", False) - """ - This function returns the output of magika for each input file. In case of - JSON or JSONL, it returns the full information dictionary for - each of them, not just the output content type label. - """ - - predicted_cts = [] - if json_output: - # expect json - entries = json.loads(stdout) - for entry in entries: - predicted_cts.append((Path(entry["path"]), entry)) - elif jsonl_output: - # expect jsonl - lines = get_lines_from_stream(stdout) - for line in lines: - entry = json.loads(line) - predicted_cts.append((Path(entry["path"]), entry)) - elif cpp_output: - # output from magika-cpp client - lines = get_lines_from_stream(stdout) - for line in lines: - file_path_str, output = line.split(": ", 1) - ct_output, score_str = output.split(" ") - score_num = float(score_str) - assert 0 <= score_num <= 1 - predicted_cts.append((Path(file_path_str), ct_output)) - else: - # plain output - lines = get_lines_from_stream(stdout) - for line in lines: - if output_score: - file_path_str, output = line.split(": ", 1) - ct_output, score_str = output.rsplit(" ", 1) - assert score_str.endswith("%") - score_num_str = score_str[:-1] - assert 0 <= int(score_num_str) <= 100 - else: - file_path_str, ct_output = line.split(": ", 1) - predicted_cts.append((Path(file_path_str), ct_output)) - - # check that we output the expected warnings - if generate_report: - stderr_lines = get_lines_from_stream(stderr) - assert len(stderr_lines) >= 1 - if generate_report: - assert stderr_lines[0].startswith("#" * 10) - assert stderr_lines[1].find("REPORT") >= 0 - assert stderr_lines[2].startswith("#" * 10) - assert stderr_lines[-4].startswith("#" * 10) - assert stderr_lines[-3].startswith("Please") - assert stderr_lines[-2].startswith("Please") - assert ( - stderr_lines[-1].startswith("IMPORTANT") - and stderr_lines[-1].find("NOT") >= 0 - and stderr_lines[-1].find("PII") >= 0 - ) - report_info = json.loads(stderr_lines[3]) - assert set(report_info.keys()) == { - "version", - "model_dir_name", - "python_version", - "reports", - } - for report in report_info["reports"]: - assert set(report.keys()) == {"hash", "features", "result"} - assert isinstance(report["hash"], str) - # try to parse "features" as ModelFeatures - _ = ModelFeatures(**json.loads(report["features"])) - # try to parse "result" as MagikaResult - result_dict = report["result"] - mr = MagikaResult( - path=result_dict["path"], - dl=ModelOutputFields(**result_dict["dl"]), - output=MagikaOutputFields( - **result_dict["output"], - ), - ) - assert mr.path == "" - assert isinstance(mr.output.ct_label, str) - - return predicted_cts diff --git a/tests_data/basic/code.asm b/tests_data/basic/asm/code.asm similarity index 100% rename from tests_data/basic/code.asm rename to tests_data/basic/asm/code.asm diff --git a/tests_data/basic/code.c b/tests_data/basic/c/code.c similarity index 100% rename from tests_data/basic/code.c rename to tests_data/basic/c/code.c diff --git a/tests_data/basic/code.css b/tests_data/basic/css/code.css similarity index 100% rename from tests_data/basic/code.css rename to tests_data/basic/css/code.css diff --git a/tests_data/basic/dockerfile/Dockerfile b/tests_data/basic/dockerfile/Dockerfile new file mode 100644 index 00000000..0052488e --- /dev/null +++ b/tests_data/basic/dockerfile/Dockerfile @@ -0,0 +1,14 @@ +# syntax=docker/dockerfile:1 + +ARG PYTHON_VERSION=3.11 +FROM python:${PYTHON_VERSION}-slim as base + +WORKDIR /magika + +# This requires buildx +# RUN --mount=type=cache,target=/root/.cache/pip \ +# pip install magika + +RUN pip install magika + +ENTRYPOINT ["magika"] diff --git a/tests_data/basic/doc.docx b/tests_data/basic/docx/doc.docx similarity index 100% rename from tests_data/basic/doc.docx rename to tests_data/basic/docx/doc.docx diff --git a/tests_data/basic/doc.epub b/tests_data/basic/epub/doc.epub similarity index 100% rename from tests_data/basic/doc.epub rename to tests_data/basic/epub/doc.epub diff --git a/tests_data/basic/doc.html b/tests_data/basic/html/doc.html similarity index 100% rename from tests_data/basic/doc.html rename to tests_data/basic/html/doc.html diff --git a/tests_data/basic/doc.ini b/tests_data/basic/ini/doc.ini similarity index 100% rename from tests_data/basic/doc.ini rename to tests_data/basic/ini/doc.ini diff --git a/tests_data/basic/code.js b/tests_data/basic/javascript/code.js similarity index 100% rename from tests_data/basic/code.js rename to tests_data/basic/javascript/code.js diff --git a/tests_data/basic/doc.json b/tests_data/basic/json/doc.json similarity index 100% rename from tests_data/basic/doc.json rename to tests_data/basic/json/doc.json diff --git a/tests_data/basic/doc.odt b/tests_data/basic/odt/doc.odt similarity index 100% rename from tests_data/basic/doc.odt rename to tests_data/basic/odt/doc.odt diff --git a/tests_data/basic/doc.pem b/tests_data/basic/pem/doc.pem similarity index 100% rename from tests_data/basic/doc.pem rename to tests_data/basic/pem/doc.pem diff --git a/tests_data/basic/doc.pub b/tests_data/basic/pem/doc.pub similarity index 100% rename from tests_data/basic/doc.pub rename to tests_data/basic/pem/doc.pub diff --git a/tests_data/basic/code.py b/tests_data/basic/python/code.py similarity index 100% rename from tests_data/basic/code.py rename to tests_data/basic/python/code.py diff --git a/tests_data/basic/doc.rtf b/tests_data/basic/rtf/doc.rtf similarity index 100% rename from tests_data/basic/doc.rtf rename to tests_data/basic/rtf/doc.rtf diff --git a/tests_data/basic/code.rs b/tests_data/basic/rust/code.rs similarity index 100% rename from tests_data/basic/code.rs rename to tests_data/basic/rust/code.rs diff --git a/tests_data/basic/code.smali b/tests_data/basic/smali/code.smali similarity index 100% rename from tests_data/basic/code.smali rename to tests_data/basic/smali/code.smali diff --git a/tests_data/basic/toml/doc.toml b/tests_data/basic/toml/doc.toml new file mode 100644 index 00000000..228b2da6 --- /dev/null +++ b/tests_data/basic/toml/doc.toml @@ -0,0 +1,40 @@ +[tool.poetry] +name = "magika" +version = "0.6.0-dev" +description = "A tool to determine the content type of a file with deep-learning" +authors = ["Yanick Fratantonio "] +readme = "README.md" +packages = [{include = "magika"}] + +[tool.poetry.dependencies] +python = "^3.8,<3.13" +click = "^8.1.3" +tqdm = "^4.66.2" +onnxruntime = "^1.17.0" +numpy = [ + {version = "^1.24", python = ">=3.8,<3.9"}, + {version = "^1.26", python = ">=3.9,<3.13"} +] +tabulate = "^0.9.0" +python-dotenv = "^1.0.1" + +[tool.poetry.group.dev.dependencies] +pytest = "^8.0.1" +ipython = [ + {version = "^8.12.3", python = ">=3.8,<3.9"}, + {version = "^8.18.1", python = ">=3.9,<3.10"}, + {version = "^8.21.0", python = ">=3.10,<3.13"} +] +ruff = ">=0.2.2,<0.4.0" +mypy = "^1.8.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff.lint] +# Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. +# Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or +# McCabe complexity (`C901`) by default. +select = ["E4", "E7", "E9", "F", "I001"] +ignore = [] diff --git a/tests_data/basic/text.txt b/tests_data/basic/txt/text.txt similarity index 100% rename from tests_data/basic/text.txt rename to tests_data/basic/txt/text.txt diff --git a/tests_data/basic/typescript/code.ts b/tests_data/basic/typescript/code.ts new file mode 100644 index 00000000..8f2076b5 --- /dev/null +++ b/tests_data/basic/typescript/code.ts @@ -0,0 +1,52 @@ +#! /usr/bin/env node +// Command line tool to test MagikaJs. Please use the official command line +// tool (`pip install magika`) for normal use. + +// To run this, you need to install the optional dependencies too. +import {program} from 'commander'; +import {readFile} from 'fs/promises'; +import chalk from 'chalk'; +import {MagikaNode as Magika} from './magika_node.js'; + +program + .description('Magika JS - file type detection with ML. https://google.github.io/magika') + .option('--json-output', 'Format output in JSON') + .option('--model-url ', 'Model URL', Magika.MODEL_URL) + .option( '--model-path ', 'Modle file path') + .option( '--config-url ', 'Config URL', Magika.CONFIG_URL) + .option( '--config-path ', 'Config file path') + .argument('', 'Paths of the files to detect'); + +program.parse(); + +const flags = program.opts(); +const magika = new Magika(); + +(async () => { + await magika.load({ + modelURL: flags.modelUrl, + modelPath: flags.modelPath, + configURL: flags.configUrl, + configPath: flags.configPath + }); + await Promise.all(program.args.map(async (path) => { + let data = null; + try { + data = await readFile(path); + } catch (error) { + console.error('Skipping file', path, error); + } + + if (data != null) { + const prediction = await magika.identifyBytes(data); + if (flags.jsonOutput) { + console.log({path, ...prediction}); + } else { + console.log( + chalk.blue(path), + chalk.green(prediction?.label, chalk.white(prediction?.score)), + ); + } + } + })); +})(); \ No newline at end of file diff --git a/tests_data/basic/yara/rule.yar b/tests_data/basic/yara/rule.yar new file mode 100644 index 00000000..8a6397e8 --- /dev/null +++ b/tests_data/basic/yara/rule.yar @@ -0,0 +1,12 @@ +rule Rule_485729_77379 { + strings: + $s1 = "HKEY_LOCAL_MACHINE\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Run" + $s2 = "Win32_Process" + $s3 = "Create" wide + condition: + $s1 and ($s2 and $s3) + meta: + author = "CyberThreatResearch" + date = "2019-09-23" + tags = "malware, persistence, registry" +} diff --git a/tests_data/mitra/bmp.bmp b/tests_data/mitra/bmp/bmp.bmp similarity index 100% rename from tests_data/mitra/bmp.bmp rename to tests_data/mitra/bmp/bmp.bmp diff --git a/tests_data/mitra/bzip2.bz2 b/tests_data/mitra/bzip/bzip2.bz2 similarity index 100% rename from tests_data/mitra/bzip2.bz2 rename to tests_data/mitra/bzip/bzip2.bz2 diff --git a/tests_data/mitra/cab.cab b/tests_data/mitra/cab/cab.cab similarity index 100% rename from tests_data/mitra/cab.cab rename to tests_data/mitra/cab/cab.cab diff --git a/tests_data/mitra/elf.elf b/tests_data/mitra/elf/elf.elf similarity index 100% rename from tests_data/mitra/elf.elf rename to tests_data/mitra/elf/elf.elf diff --git a/tests_data/mitra/elf64.elf b/tests_data/mitra/elf/elf64.elf similarity index 100% rename from tests_data/mitra/elf64.elf rename to tests_data/mitra/elf/elf64.elf diff --git a/tests_data/mitra/flac.flac b/tests_data/mitra/flac/flac.flac similarity index 100% rename from tests_data/mitra/flac.flac rename to tests_data/mitra/flac/flac.flac diff --git a/tests_data/mitra/tiny.flac b/tests_data/mitra/flac/tiny.flac similarity index 100% rename from tests_data/mitra/tiny.flac rename to tests_data/mitra/flac/tiny.flac diff --git a/tests_data/mitra/gif87.gif b/tests_data/mitra/gif/gif87.gif similarity index 100% rename from tests_data/mitra/gif87.gif rename to tests_data/mitra/gif/gif87.gif diff --git a/tests_data/mitra/gif89.gif b/tests_data/mitra/gif/gif89.gif similarity index 100% rename from tests_data/mitra/gif89.gif rename to tests_data/mitra/gif/gif89.gif diff --git a/tests_data/mitra/gzip.gz b/tests_data/mitra/gzip/gzip.gz similarity index 100% rename from tests_data/mitra/gzip.gz rename to tests_data/mitra/gzip/gzip.gz diff --git a/tests_data/mitra/iso.iso b/tests_data/mitra/iso/iso.iso similarity index 100% rename from tests_data/mitra/iso.iso rename to tests_data/mitra/iso/iso.iso diff --git a/tests_data/mitra/java.class b/tests_data/mitra/javabytecode/java.class similarity index 100% rename from tests_data/mitra/java.class rename to tests_data/mitra/javabytecode/java.class diff --git a/tests_data/mitra/jpg.jpg b/tests_data/mitra/jpeg/jpg.jpg similarity index 100% rename from tests_data/mitra/jpg.jpg rename to tests_data/mitra/jpeg/jpg.jpg diff --git a/tests_data/mitra/id3v1.mp3 b/tests_data/mitra/mp3/id3v1.mp3 similarity index 100% rename from tests_data/mitra/id3v1.mp3 rename to tests_data/mitra/mp3/id3v1.mp3 diff --git a/tests_data/mitra/id3v2.mp3 b/tests_data/mitra/mp3/id3v2.mp3 similarity index 100% rename from tests_data/mitra/id3v2.mp3 rename to tests_data/mitra/mp3/id3v2.mp3 diff --git a/tests_data/mitra/mp4.mp4 b/tests_data/mitra/mp4/mp4.mp4 similarity index 100% rename from tests_data/mitra/mp4.mp4 rename to tests_data/mitra/mp4/mp4.mp4 diff --git a/tests_data/mitra/vorbis.ogg b/tests_data/mitra/ogg/vorbis.ogg similarity index 100% rename from tests_data/mitra/vorbis.ogg rename to tests_data/mitra/ogg/vorbis.ogg diff --git a/tests_data/mitra/pcap.pcap b/tests_data/mitra/pcap/pcap.pcap similarity index 100% rename from tests_data/mitra/pcap.pcap rename to tests_data/mitra/pcap/pcap.pcap diff --git a/tests_data/mitra/pdf.pdf b/tests_data/mitra/pdf/pdf.pdf similarity index 100% rename from tests_data/mitra/pdf.pdf rename to tests_data/mitra/pdf/pdf.pdf diff --git a/tests_data/mitra/pe32.exe b/tests_data/mitra/pebin/pe32.exe similarity index 100% rename from tests_data/mitra/pe32.exe rename to tests_data/mitra/pebin/pe32.exe diff --git a/tests_data/mitra/pe64.exe b/tests_data/mitra/pebin/pe64.exe similarity index 100% rename from tests_data/mitra/pe64.exe rename to tests_data/mitra/pebin/pe64.exe diff --git a/tests_data/mitra/php.php b/tests_data/mitra/php/php.php similarity index 100% rename from tests_data/mitra/php.php rename to tests_data/mitra/php/php.php diff --git a/tests_data/mitra/cgbi.png b/tests_data/mitra/png/cgbi.png similarity index 100% rename from tests_data/mitra/cgbi.png rename to tests_data/mitra/png/cgbi.png diff --git a/tests_data/mitra/png.png b/tests_data/mitra/png/png.png similarity index 100% rename from tests_data/mitra/png.png rename to tests_data/mitra/png/png.png diff --git a/tests_data/mitra/rar4.rar b/tests_data/mitra/rar/rar4.rar similarity index 100% rename from tests_data/mitra/rar4.rar rename to tests_data/mitra/rar/rar4.rar diff --git a/tests_data/mitra/rar5.rar b/tests_data/mitra/rar/rar5.rar similarity index 100% rename from tests_data/mitra/rar5.rar rename to tests_data/mitra/rar/rar5.rar diff --git a/tests_data/mitra/rich.rtf b/tests_data/mitra/rtf/rich.rtf similarity index 100% rename from tests_data/mitra/rich.rtf rename to tests_data/mitra/rtf/rich.rtf diff --git a/tests_data/mitra/7-zip.7z b/tests_data/mitra/sevenzip/7-zip.7z similarity index 100% rename from tests_data/mitra/7-zip.7z rename to tests_data/mitra/sevenzip/7-zip.7z diff --git a/tests_data/mitra/svg.svg b/tests_data/mitra/svg/svg.svg similarity index 100% rename from tests_data/mitra/svg.svg rename to tests_data/mitra/svg/svg.svg diff --git a/tests_data/mitra/hello-gnu.tar b/tests_data/mitra/tar/hello-gnu.tar similarity index 100% rename from tests_data/mitra/hello-gnu.tar rename to tests_data/mitra/tar/hello-gnu.tar diff --git a/tests_data/mitra/hello-pax.tar b/tests_data/mitra/tar/hello-pax.tar similarity index 100% rename from tests_data/mitra/hello-pax.tar rename to tests_data/mitra/tar/hello-pax.tar diff --git a/tests_data/mitra/hello-ustar.tar b/tests_data/mitra/tar/hello-ustar.tar similarity index 100% rename from tests_data/mitra/hello-ustar.tar rename to tests_data/mitra/tar/hello-ustar.tar diff --git a/tests_data/mitra/tar.tar b/tests_data/mitra/tar/tar.tar similarity index 100% rename from tests_data/mitra/tar.tar rename to tests_data/mitra/tar/tar.tar diff --git a/tests_data/mitra/footer.tga b/tests_data/mitra/tga/footer.tga similarity index 100% rename from tests_data/mitra/footer.tga rename to tests_data/mitra/tga/footer.tga diff --git a/tests_data/mitra/tiff-be.tif b/tests_data/mitra/tiff/tiff-be.tif similarity index 100% rename from tests_data/mitra/tiff-be.tif rename to tests_data/mitra/tiff/tiff-be.tif diff --git a/tests_data/mitra/tiff-le.tif b/tests_data/mitra/tiff/tiff-le.tif similarity index 100% rename from tests_data/mitra/tiff-le.tif rename to tests_data/mitra/tiff/tiff-le.tif diff --git a/tests_data/mitra/riff.wav b/tests_data/mitra/wav/riff.wav similarity index 100% rename from tests_data/mitra/riff.wav rename to tests_data/mitra/wav/riff.wav diff --git a/tests_data/mitra/rifx.wav b/tests_data/mitra/wav/rifx.wav similarity index 100% rename from tests_data/mitra/rifx.wav rename to tests_data/mitra/wav/rifx.wav diff --git a/tests_data/mitra/webm.webm b/tests_data/mitra/webm/webm.webm similarity index 100% rename from tests_data/mitra/webm.webm rename to tests_data/mitra/webm/webm.webm diff --git a/tests_data/mitra/webp.webp b/tests_data/mitra/webp/webp.webp similarity index 100% rename from tests_data/mitra/webp.webp rename to tests_data/mitra/webp/webp.webp diff --git a/tests_data/mitra/webpl.webp b/tests_data/mitra/webp/webpl.webp similarity index 100% rename from tests_data/mitra/webpl.webp rename to tests_data/mitra/webp/webpl.webp diff --git a/tests_data/mitra/hello-world.xar b/tests_data/mitra/xar/hello-world.xar similarity index 100% rename from tests_data/mitra/hello-world.xar rename to tests_data/mitra/xar/hello-world.xar diff --git a/tests_data/mitra/mini.xar b/tests_data/mitra/xar/mini.xar similarity index 100% rename from tests_data/mitra/mini.xar rename to tests_data/mitra/xar/mini.xar diff --git a/tests_data/mitra/xz.xz b/tests_data/mitra/xz/xz.xz similarity index 100% rename from tests_data/mitra/xz.xz rename to tests_data/mitra/xz/xz.xz diff --git a/tests_data/mitra/NT.zip b/tests_data/mitra/zip/NT.zip similarity index 100% rename from tests_data/mitra/NT.zip rename to tests_data/mitra/zip/NT.zip diff --git a/tests_data/mitra/NTFS.zip b/tests_data/mitra/zip/NTFS.zip similarity index 100% rename from tests_data/mitra/NTFS.zip rename to tests_data/mitra/zip/NTFS.zip diff --git a/tests_data/mitra/PPMd.zip b/tests_data/mitra/zip/PPMd.zip similarity index 100% rename from tests_data/mitra/PPMd.zip rename to tests_data/mitra/zip/PPMd.zip diff --git a/tests_data/mitra/aes.zip b/tests_data/mitra/zip/aes.zip similarity index 100% rename from tests_data/mitra/aes.zip rename to tests_data/mitra/zip/aes.zip diff --git a/tests_data/mitra/bz2.zip b/tests_data/mitra/zip/bz2.zip similarity index 100% rename from tests_data/mitra/bz2.zip rename to tests_data/mitra/zip/bz2.zip diff --git a/tests_data/mitra/deflate64.zip b/tests_data/mitra/zip/deflate64.zip similarity index 100% rename from tests_data/mitra/deflate64.zip rename to tests_data/mitra/zip/deflate64.zip diff --git a/tests_data/mitra/directory.zip b/tests_data/mitra/zip/directory.zip similarity index 100% rename from tests_data/mitra/directory.zip rename to tests_data/mitra/zip/directory.zip diff --git a/tests_data/mitra/drive.zip b/tests_data/mitra/zip/drive.zip similarity index 100% rename from tests_data/mitra/drive.zip rename to tests_data/mitra/zip/drive.zip diff --git a/tests_data/mitra/dual.zip b/tests_data/mitra/zip/dual.zip similarity index 100% rename from tests_data/mitra/dual.zip rename to tests_data/mitra/zip/dual.zip diff --git a/tests_data/mitra/filecomment.zip b/tests_data/mitra/zip/filecomment.zip similarity index 100% rename from tests_data/mitra/filecomment.zip rename to tests_data/mitra/zip/filecomment.zip diff --git a/tests_data/mitra/implode.zip b/tests_data/mitra/zip/implode.zip similarity index 100% rename from tests_data/mitra/implode.zip rename to tests_data/mitra/zip/implode.zip diff --git a/tests_data/mitra/implodeV3.zip b/tests_data/mitra/zip/implodeV3.zip similarity index 100% rename from tests_data/mitra/implodeV3.zip rename to tests_data/mitra/zip/implodeV3.zip diff --git a/tests_data/mitra/jpeg.zip b/tests_data/mitra/zip/jpeg.zip similarity index 100% rename from tests_data/mitra/jpeg.zip rename to tests_data/mitra/zip/jpeg.zip diff --git a/tests_data/mitra/lzma.zip b/tests_data/mitra/zip/lzma.zip similarity index 100% rename from tests_data/mitra/lzma.zip rename to tests_data/mitra/zip/lzma.zip diff --git a/tests_data/mitra/mini.zip b/tests_data/mitra/zip/mini.zip similarity index 100% rename from tests_data/mitra/mini.zip rename to tests_data/mitra/zip/mini.zip diff --git a/tests_data/mitra/reduced1.zip b/tests_data/mitra/zip/reduced1.zip similarity index 100% rename from tests_data/mitra/reduced1.zip rename to tests_data/mitra/zip/reduced1.zip diff --git a/tests_data/mitra/reduced2.zip b/tests_data/mitra/zip/reduced2.zip similarity index 100% rename from tests_data/mitra/reduced2.zip rename to tests_data/mitra/zip/reduced2.zip diff --git a/tests_data/mitra/reduced3.zip b/tests_data/mitra/zip/reduced3.zip similarity index 100% rename from tests_data/mitra/reduced3.zip rename to tests_data/mitra/zip/reduced3.zip diff --git a/tests_data/mitra/reduced4.zip b/tests_data/mitra/zip/reduced4.zip similarity index 100% rename from tests_data/mitra/reduced4.zip rename to tests_data/mitra/zip/reduced4.zip diff --git a/tests_data/mitra/shrunk.zip b/tests_data/mitra/zip/shrunk.zip similarity index 100% rename from tests_data/mitra/shrunk.zip rename to tests_data/mitra/zip/shrunk.zip diff --git a/tests_data/mitra/simple.zip b/tests_data/mitra/zip/simple.zip similarity index 100% rename from tests_data/mitra/simple.zip rename to tests_data/mitra/zip/simple.zip diff --git a/tests_data/mitra/store.zip b/tests_data/mitra/zip/store.zip similarity index 100% rename from tests_data/mitra/store.zip rename to tests_data/mitra/zip/store.zip diff --git a/tests_data/mitra/unicode.zip b/tests_data/mitra/zip/unicode.zip similarity index 100% rename from tests_data/mitra/unicode.zip rename to tests_data/mitra/zip/unicode.zip diff --git a/tests_data/mitra/unicode2.zip b/tests_data/mitra/zip/unicode2.zip similarity index 100% rename from tests_data/mitra/unicode2.zip rename to tests_data/mitra/zip/unicode2.zip diff --git a/tests_data/mitra/unix.zip b/tests_data/mitra/zip/unix.zip similarity index 100% rename from tests_data/mitra/unix.zip rename to tests_data/mitra/zip/unix.zip diff --git a/tests_data/mitra/unixdesc.zip b/tests_data/mitra/zip/unixdesc.zip similarity index 100% rename from tests_data/mitra/unixdesc.zip rename to tests_data/mitra/zip/unixdesc.zip diff --git a/tests_data/mitra/volumecomment.zip b/tests_data/mitra/zip/volumecomment.zip similarity index 100% rename from tests_data/mitra/volumecomment.zip rename to tests_data/mitra/zip/volumecomment.zip diff --git a/tests_data/mitra/wavpack.zip b/tests_data/mitra/zip/wavpack.zip similarity index 100% rename from tests_data/mitra/wavpack.zip rename to tests_data/mitra/zip/wavpack.zip diff --git a/tests_data/mitra/zip.zip b/tests_data/mitra/zip/zip.zip similarity index 100% rename from tests_data/mitra/zip.zip rename to tests_data/mitra/zip/zip.zip diff --git a/tests_data/mitra/zip64.zip b/tests_data/mitra/zip/zip64.zip similarity index 100% rename from tests_data/mitra/zip64.zip rename to tests_data/mitra/zip/zip64.zip diff --git a/tests_data/mitra/zipcrypto.zip b/tests_data/mitra/zip/zipcrypto.zip similarity index 100% rename from tests_data/mitra/zipcrypto.zip rename to tests_data/mitra/zip/zipcrypto.zip diff --git a/tests_data/mitra/zopfli.zip b/tests_data/mitra/zip/zopfli.zip similarity index 100% rename from tests_data/mitra/zopfli.zip rename to tests_data/mitra/zip/zopfli.zip diff --git a/tests_data/mitra/html.htm b/tests_data/mitra_candidates/html.htm similarity index 100% rename from tests_data/mitra/html.htm rename to tests_data/mitra_candidates/html.htm diff --git a/tests_data/mitra/pcapng.pcapng b/tests_data/mitra_candidates/pcapng.pcapng similarity index 100% rename from tests_data/mitra/pcapng.pcapng rename to tests_data/mitra_candidates/pcapng.pcapng