From 5e22b48669995df7a5bf0d5a704becd2c03d47b0 Mon Sep 17 00:00:00 2001 From: Sergey Okatov Date: Fri, 26 Jul 2024 20:48:33 +0300 Subject: [PATCH] M8l4 ml (#37) * M8l4 mlkotlin (#35) * m8l4 ML in Kotlin project * m8l4 ML in Kotlin project1 * m8l4 ML in Kotlin project3 * m8l4 ML in Kotlin project3 * m8l4 ML in Kotlin project3 (cherry picked from commit a63f224e18f74c2780bff7a2ff6faba585242baf) * M8l4 ML (cherry picked from commit 5d42aa43f041bc360c978a8c0ababa9ce74f4911) --- .gitattributes | 2 + docs/ml-models-list.md | 26 +++ gradle/libs.versions.toml | 3 + ok-marketplace-ml/README.md | 8 + ok-marketplace-ml/build.gradle.kts | 26 +++ ok-marketplace-ml/onnx-model/.gitignore | 3 + ok-marketplace-ml/settings.gradle.kts | 29 ++++ ok-marketplace-ml/src/main/kotlin/Inferrer.kt | 150 ++++++++++++++++++ .../src/main/kotlin/InferringResult.kt | 21 +++ .../src/test/kotlin/OnnxInferTest.kt | 35 ++++ settings.gradle.kts | 1 + 11 files changed, 304 insertions(+) create mode 100644 .gitattributes create mode 100644 docs/ml-models-list.md create mode 100644 ok-marketplace-ml/README.md create mode 100644 ok-marketplace-ml/build.gradle.kts create mode 100644 ok-marketplace-ml/onnx-model/.gitignore create mode 100644 ok-marketplace-ml/settings.gradle.kts create mode 100644 ok-marketplace-ml/src/main/kotlin/Inferrer.kt create mode 100644 ok-marketplace-ml/src/main/kotlin/InferringResult.kt create mode 100644 ok-marketplace-ml/src/test/kotlin/OnnxInferTest.kt diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bfb6df4 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +*.onnx_data filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text diff --git a/docs/ml-models-list.md b/docs/ml-models-list.md new file mode 100644 index 0000000..ef7a61d --- /dev/null +++ b/docs/ml-models-list.md @@ -0,0 +1,26 @@ +# Список моделей машинного обучения в 2023 году + +| Model | Best For | Main Contributor/Author | Languages Supported | Versions | Input 
Parameters | Min GPU for Inference | Min GPU for Learning | Smartness | License | Significant Restrictions | Web Link | +|--------------------------|----------------------------------------------------------|---------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------|-------------------------|------------------------|-------------------------|---------------|--------------------------|-----------------------------------------|---------------------------------------------------------------------------------------------------| +| **LLaMA** | General-purpose NLP tasks | Meta AI, USA | Multiple languages | LLaMA-7B | 7 billion | 16 GB VRAM | 32 GB VRAM | High | Custom | Non-commercial academic use only | [LLaMA on GitHub](https://github.com/facebookresearch/llama) | +| | | | | LLaMA-13B | 13 billion | 24 GB VRAM | 48 GB VRAM | Very High | | | | +| **mT5** | Text-to-text transformation tasks | Google Research, USA | Over 100 languages including English, Spanish, French, Chinese, and many more | mT5-Base | 580 million | 8 GB VRAM | 16 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [mT5 on Hugging Face](https://huggingface.co/google/mt5-base) | +| | | | | mT5-Large | 1.2 billion | 16 GB VRAM | 32 GB VRAM | Very High | | | | +| | | | | mT5-XL | 3.7 billion | 24 GB VRAM | 48 GB VRAM | Very High | | | | +| **XLM-R** | Multilingual understanding tasks | Facebook AI Research (FAIR), USA | 100 languages including major languages like English, Chinese, Spanish, Arabic, etc. 
| XLM-R Base | 270 million | 8 GB VRAM | 16 GB VRAM | High | MIT | Very few restrictions, highly permissive | [XLM-R on Hugging Face](https://huggingface.co/xlm-roberta-large) | +| | | | | XLM-R Large | 550 million | 16 GB VRAM | 32 GB VRAM | Very High | | | | +| | | | | XLM-R-XXL | 3.5 billion | 24 GB VRAM | 48 GB VRAM | Very High | | | | +| **GPT-NeoX** | Text generation and language understanding tasks | EleutherAI, USA | Primarily trained on English data, but adaptable to other languages with fine-tuning | GPT-NeoX-6B | 6 billion | 24 GB VRAM | 48 GB VRAM | Very High | Apache 2.0 | Few restrictions, permissive license | [GPT-NeoX on GitHub](https://github.com/EleutherAI/gpt-neox) | +| **BERT-multilingual** | Text classification, named entity recognition, QA | Google Research, USA | 104 languages including English, Chinese, French, Spanish, Arabic, Hindi, etc. | BERT-Base Multilingual Cased | 110 million | 8 GB VRAM | 16 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [BERT-multilingual on Hugging Face](https://huggingface.co/bert-base-multilingual-cased) | +| | | | | BERT-Large Multilingual Cased (unofficial) | 340 million | 16 GB VRAM | 32 GB VRAM | Very High | | | | +| **Chinese-BERT-wwm** | Chinese NLP tasks | Chinese NLP Group, Harbin Institute of Technology, China | Primarily Chinese | Chinese-BERT-wwm | 110 million | 8 GB VRAM | 16 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [Chinese-BERT-wwm on GitHub](https://github.com/ymcui/Chinese-BERT-wwm) | +| | | | | Chinese-BERT-wwm-ext | 110 million | 8 GB VRAM | 16 GB VRAM | High | | | | +| **MuRIL** | NLP tasks specific to Indian languages | Google Research India, India | 17 Indian languages including Hindi, Bengali, Tamil, Telugu, Marathi, and more | MuRIL Base | 110 million | 8 GB VRAM | 16 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [MuRIL on Hugging Face](https://huggingface.co/google/muril-base-cased) | +| | | | | MuRIL Large | 340 
million | 16 GB VRAM | 32 GB VRAM | Very High | | | | +| **AlephBERT** | Hebrew NLP tasks | AlephBERT team, Israel | Primarily Hebrew | AlephBERT-Base | 110 million | 8 GB VRAM | 16 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [AlephBERT on Hugging Face](https://huggingface.co/avichr/AlephBERTgimmel-Base) | +| **ruBERT** | Russian NLP tasks | SberAI, Russia | Primarily Russian | ruBERT | 110 million | 8 GB VRAM | 16 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [ruBERT on Hugging Face](https://huggingface.co/DeepPavlov/rubert-base-cased) | +| **ruGPT-3** | Text generation and language understanding in Russian | SberAI, Russia | Primarily Russian | ruGPT-3 Small | 760 million | 12 GB VRAM | 24 GB VRAM | High | Apache 2.0 | Few restrictions, permissive license | [ruGPT-3 on GitHub](https://github.com/sberbank-ai/ru-gpts) | +| | | | | ruGPT-3 Medium | 1.3 billion | 16 GB VRAM | 32 GB VRAM | High | | | | +| | | | | ruGPT-3 Large | 2.6 billion | 24 GB VRAM | 48 GB VRAM | Very High | | | | + +This table should provide a comprehensive overview of various open-source, multilingual language models, including those from non-US contributors. 
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index 3718ab1..5f85eec 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -132,6 +132,9 @@ mkpl-cor = { module = "ru.otus.otuskotlin.marketplace.libs:ok-marketplace-lib-co mkpl-state-common = { module = "ru.otus.otuskotlin.marketplace.state:ok-marketplace-states-common", version.ref = "mkpl" } mkpl-state-biz = { module = "ru.otus.otuskotlin.marketplace.state:ok-marketplace-states-biz", version.ref = "mkpl" } +# Machine Learning +ml-tokenizer = "ai.djl.huggingface:tokenizers:0.25.0" +ml-onnx-runtime = "com.microsoft.onnxruntime:onnxruntime:1.16.3" [bundles] kotest = ["kotest-junit5", "kotest-core", "kotest-datatest", "kotest-property"] diff --git a/ok-marketplace-ml/README.md b/ok-marketplace-ml/README.md new file mode 100644 index 0000000..00b15a8 --- /dev/null +++ b/ok-marketplace-ml/README.md @@ -0,0 +1,8 @@ +# Kotlin ONNX ML Sample + +Демонстрация использования [ONNX](https://onnxruntime.ai/docs/get-started/with-java.html) в Kotlin на примере NLP +модели [Roberta NER model](https://huggingface.co/xlm-roberta-large-finetuned-conll03-english). Необходимо скачать файлы модели (`model.onnx`, `model.onnx_data`, `tokenizer.json`) в +папку [onnx-model](onnx-model). 
+
+
+[Ноутбук с python-моделью](./Ml_demo1.ipynb)
diff --git a/ok-marketplace-ml/build.gradle.kts b/ok-marketplace-ml/build.gradle.kts
new file mode 100644
index 0000000..7f8e1e3
--- /dev/null
+++ b/ok-marketplace-ml/build.gradle.kts
@@ -0,0 +1,26 @@
+plugins {
+    id("build-jvm")
+}
+
+group = "ru.otus.otuskotlin.marketplace.ml"
+version = "0.0.1"
+
+dependencies {
+    implementation(libs.ml.onnx.runtime)
+    implementation(libs.ml.tokenizer)
+    implementation(libs.logback)
+
+    testImplementation(kotlin("test-junit5"))
+}
+
+tasks {
+    test {
+        useJUnitPlatform()
+    }
+}
+
+allprojects {
+    repositories {
+        mavenCentral()
+    }
+}
diff --git a/ok-marketplace-ml/onnx-model/.gitignore b/ok-marketplace-ml/onnx-model/.gitignore
new file mode 100644
index 0000000..fada18b
--- /dev/null
+++ b/ok-marketplace-ml/onnx-model/.gitignore
@@ -0,0 +1,3 @@
+*
+!/.gitattributes
+!/.gitignore
diff --git a/ok-marketplace-ml/settings.gradle.kts b/ok-marketplace-ml/settings.gradle.kts
new file mode 100644
index 0000000..2dffdb2
--- /dev/null
+++ b/ok-marketplace-ml/settings.gradle.kts
@@ -0,0 +1,29 @@
+rootProject.name = "ok-marketplace-ml"
+
+dependencyResolutionManagement {
+    versionCatalogs {
+        create("libs") {
+            from(files("../gradle/libs.versions.toml"))
+        }
+    }
+}
+
+pluginManagement {
+    includeBuild("../build-plugin")
+    plugins {
+        id("build-jvm") apply false
+        id("build-kmp") apply false
+    }
+    repositories {
+        mavenCentral()
+        gradlePluginPortal()
+    }
+}
+
+plugins {
+    id("org.gradle.toolchains.foojay-resolver-convention") version "0.5.0"
+}
+
+// Enables constructs like this:
+//implementation(projects.m2l5Gradle.sub1.ssub1)
+enableFeaturePreview("TYPESAFE_PROJECT_ACCESSORS")
diff --git a/ok-marketplace-ml/src/main/kotlin/Inferrer.kt b/ok-marketplace-ml/src/main/kotlin/Inferrer.kt
new file mode 100644
index 0000000..b4ca38e
--- /dev/null
+++ b/ok-marketplace-ml/src/main/kotlin/Inferrer.kt
@@ -0,0 +1,150 @@
+import ai.djl.huggingface.tokenizers.HuggingFaceTokenizer
+import ai.onnxruntime.OnnxTensor
+import ai.onnxruntime.OrtEnvironment
+import ai.onnxruntime.OrtException
+import ai.onnxruntime.OrtSession
+import ai.onnxruntime.OrtSession.SessionOptions
+import java.io.IOException
+import java.nio.file.Paths
+
+/**
+ * Runs named-entity recognition on text with an ONNX token-classification model.
+ *
+ * The tokenizer, the ONNX Runtime environment and the session are created lazily on first use.
+ *
+ * @param modelPath path to the ONNX model file
+ * @property tokenizerJson path to the HuggingFace `tokenizer.json` file
+ */
+class Inferrer(
+    modelPath: String = "model.onnx",
+    private val tokenizerJson: String = "tokenizer.json",
+) {
+    /**
+     * Tokenizer that turns text into tokens; a loading failure is printed and then rethrown.
+     */
+    private val tokenizer: HuggingFaceTokenizer by lazy {
+        runCatching { HuggingFaceTokenizer.newInstance(Paths.get(tokenizerJson)) }
+            .onFailure { e -> e.printStackTrace() }
+            .getOrThrow()
+    }
+
+    /**
+     * ONNX Runtime environment the model is executed in.
+     */
+    private val env: OrtEnvironment by lazy {
+        OrtEnvironment.getEnvironment() ?: error("Failed to get ORT environment")
+    }
+
+    /**
+     * ONNX Runtime session for the model; prints the model input/output metadata when created.
+     */
+    private val session: OrtSession by lazy {
+        val s = env.createSession(modelPath, SessionOptions()) ?: error("Failed to get session")
+        println(
+            """
+            Model Input Names: ${s.inputNames.joinToString()}
+            Model Input info: ${s.inputInfo.entries.joinToString { "${it.key}=${it.value}" }}
+            Model Output Names: ${s.outputNames.joinToString()}
+            Model Output info: ${s.outputInfo.entries.joinToString { "${it.key}=${it.value}" }}
+        """.trimIndent()
+        )
+        s
+    }
+
+    /**
+     * Appends [token] to the result field selected by the predicted class id.
+     *
+     * Class id to label mapping ("id2label") of the model:
+     * 0 = B-LOC, 1 = B-MISC, 2 = B-ORG, 3 = I-LOC, 4 = I-MISC, 5 = I-ORG, 6 = I-PER, 7 = O.
+     * Any id outside 0..6 is ignored.
+     */
+    private fun InferringResult.post(clazz: Int, token: String) {
+        when (clazz) {
+            6 -> persons += token
+            2, 5 -> organizations += token
+            3, 0 -> locations += token
+            1, 4 -> misc += token
+            else -> Unit
+        }
+    }
+
+    /**
+     * Returns the index of the largest value in [arr].
+     * Throws [NoSuchElementException] when [arr] is empty.
+     */
+    private fun findMaxIndex(arr: FloatArray): Int = arr.indices.maxBy { arr[it] }
+
+    /**
+     * Runs inference on [inputText] and prints the recognised entities to stdout.
+     *
+     * ONNX Runtime failures ([OrtException]) are printed and not rethrown.
+     *
+     * @throws IllegalStateException if the tokenizer or the model yields no usable data
+     */
+    fun infer(inputText: String) = try {
+        // Encode the text into token strings, token ids and an attention mask
+        val encoding = try {
+            tokenizer.encode(inputText)
+        } catch (ioException: IOException) {
+            ioException.printStackTrace()
+            throw ioException
+        }
+        val tokens = encoding.tokens ?: error("No tokens detected")
+        val ids = encoding.ids ?: error("Empty ids")
+        val mask = encoding.attentionMask ?: error("Empty attention mask")
+
+        // Collects the tokens of every entity category
+        val inferringResult = InferringResult()
+
+        // Tensors and the run result are AutoCloseable; close each one once it is no longer needed
+        OnnxTensor.createTensor(env, arrayOf(ids)).use { inputIds ->
+            OnnxTensor.createTensor(env, arrayOf(mask)).use { attentionMask ->
+                val modelInputs = mapOf("input_ids" to inputIds, "attention_mask" to attentionMask)
+                session.run(modelInputs).use { output ->
+                    // The first output is read as logits indexed [batch][token][class]; batch 0 only
+                    val logits = output.firstOrNull()?.value?.value
+                        ?.let {
+                            @Suppress("UNCHECKED_CAST")
+                            it as? Array<Array<FloatArray>>
+                        }
+                        ?.firstOrNull()
+                        ?: error("Empty result")
+                    logits.forEachIndexed { i, tokenLogits ->
+                        try {
+                            inferringResult.post(findMaxIndex(tokenLogits), tokens[i])
+                        } catch (exception: Exception) {
+                            // Best effort: a token that cannot be classified is reported and skipped
+                            exception.printStackTrace()
+                        }
+                    }
+                }
+            }
+        }
+
+        // Print the collected entities
+        inferringResult.displayResult(tokens)
+    } catch (e: OrtException) {
+        e.printStackTrace()
+    }
+
+    /**
+     * Prints every entity category to stdout.
+     */
+    private fun InferringResult.displayResult(tokens: Array<String>) {
+        // Word-boundary marker, taken from the first character of the second token (the first token
+        // is presumably a leading special token - TODO confirm). Falls back to the SentencePiece
+        // marker U+2581 instead of failing when there are fewer than two tokens or it is empty.
+        val tokensSpecialChar = tokens.getOrNull(1)?.firstOrNull()?.toString() ?: "\u2581"
+        println("All persons in the text: ${persons.cleanResult(tokensSpecialChar)}")
+        println("All Organizations in the text: ${organizations.cleanResult(tokensSpecialChar)}")
+        println("All Locations in the text: ${locations.cleanResult(tokensSpecialChar)}")
+        println("All Miscellanous entities in the text: ${misc.cleanResult(tokensSpecialChar)}")
+    }
+
+    /**
+     * Splits the receiver on the literal [tokensSpecialChar] and joins the non-blank parts with ", ".
+     */
+    private fun String.cleanResult(tokensSpecialChar: String) = split(tokensSpecialChar)
+        .filter { it.isNotBlank() }
+        .joinToString()
+}
diff --git a/ok-marketplace-ml/src/main/kotlin/InferringResult.kt b/ok-marketplace-ml/src/main/kotlin/InferringResult.kt
new file mode 100644
index 0000000..da8fbca
--- /dev/null
+++ b/ok-marketplace-ml/src/main/kotlin/InferringResult.kt
@@ -0,0 +1,21 @@
+/**
+ * Model holding the inference results
+ */
+data class InferringResult(
+    /**
+     * Persons found in the text
+     */
+    var persons: String = "",
+    /**
+     * Locations found in the text
+     */
+    var locations: String = "",
+    /**
+     * Organizations found in the text
+     */
+    var organizations: String = "",
+    /**
+     * Other significant entities found in the text
+     */
+    var misc: String = "",
+)
diff --git a/ok-marketplace-ml/src/test/kotlin/OnnxInferTest.kt b/ok-marketplace-ml/src/test/kotlin/OnnxInferTest.kt
new file mode 100644
index 0000000..6de0b9a
--- /dev/null
+++ 
b/ok-marketplace-ml/src/test/kotlin/OnnxInferTest.kt
@@ -0,0 +1,35 @@
+import kotlin.test.Test
+
+/** Smoke test: feeds sample texts to [Inferrer]; reads the model files from `onnx-model`. */
+class OnnxInferTest {
+    @Test
+    fun onnxinferTest() {
+        val inferrer =
+            Inferrer(modelPath = "onnx-model/model.onnx", tokenizerJson = "onnx-model/tokenizer.json")
+        for (text in inputTexts) {
+            println("========================================")
+            println("TEXT: $text")
+            inferrer.infer(text)
+        }
+    }
+
+    companion object {
+        /** Sample inputs: two English texts and one Russian text. */
+        val inputTexts = listOf(
+            "Ahwar wants to work at Google in london. EU rejected German call to boycott British lamb.",
+            """
+KotlinDL is a high-level Deep Learning API written in Kotlin and inspired by Keras. Under the hood, it uses TensorFlow Java API and ONNX Runtime API for Java. KotlinDL offers simple APIs for training deep learning models from scratch, importing existing Keras and ONNX models for inference, and leveraging transfer learning for tailoring existing pre-trained models to your tasks.
+This project aims to make Deep Learning easier for JVM and Android developers and simplify deploying deep learning models in production environments.
+Here's an example of what a classic convolutional neural network LeNet would look like in KotlinDL:
+
+            """.trimIndent(),
+            """
+«Я́ндекс» — российская транснациональная компания в отрасли информационных технологий, чьё головное юридическое лицо зарегистрировано в Нидерландах, владеющая одноимённой системой поиска в интернете, интернет-порталом и веб-службами в нескольких странах. Наиболее заметное положение занимает на рынках России, Белоруссии и Казахстана[5].
+
+Поисковая система Yandex.ru была официально анонсирована 23 сентября 1997 года и первое время развивалась в рамках компании CompTek International. Как отдельная компания «Яндекс» образовалась в 2000 году.
+
+В мае 2011 года «Яндекс» провёл первичное размещение акций, заработав на этом больше, чем какая-либо из интернет-компаний со времён IPO-поисковика Google в 2004 году[6][7].
+            """.trimIndent(),
+        )
+    }
+}
diff --git a/settings.gradle.kts b/settings.gradle.kts
index b3e680a..e75c637 100644
--- a/settings.gradle.kts
+++ b/settings.gradle.kts
@@ -16,4 +16,5 @@ includeBuild("ok-marketplace-states")
 includeBuild("ok-marketplace-libs")
 includeBuild("ok-marketplace-tests")
 
+includeBuild("ok-marketplace-ml")
 includeBuild("pgkn")