diff --git a/.fdignore b/.fdignore new file mode 100644 index 00000000..09cffa8a --- /dev/null +++ b/.fdignore @@ -0,0 +1,4 @@ +.* +*.{png,cff,json} +LICENSE +docs \ No newline at end of file diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..57f9f818 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,6 @@ +!.gitignore +!* +!*/* +cache_db.json +cache_tree.json +vector_cache diff --git a/docs/cache_title.json b/docs/cache_title.json new file mode 100644 index 00000000..313f1abc --- /dev/null +++ b/docs/cache_title.json @@ -0,0 +1 @@ +{"_default": {"1": {"path": "/README.md", "hash": "a68c63f87ef3416bbd81e2445b7f8892", "title": "RWKV Language Model: Enhanced Features and Efficient Sampling"}, "2": {"path": "/README.md:1-26", "hash": "85e61b89290079e1197f6391edb8a301", "title": "Install, Prepare, Train RWKV Language Model"}, "3": {"path": "/README.md:28-38", "hash": "42de55360a644b028367107f6d23d19f", "title": "RWKV: Efficient Transformer-RNN Hybrid"}, "4": {"path": "/README.md:40-61", "hash": "1edbdb5736412b260f3be9dd6977d5c6", "title": "RWKV Model Setup and Forward Passes"}, "5": {"path": "/README.md:62-90", "hash": "9a5e57a1ed62a6300991439e548c33da", "title": "Explore RWKV Projects and Resources"}, "6": {"path": "/README.md:92-114", "hash": "fe60567dc181cdc0d2d56f508e5985d1", "title": "RWKV Model Resources and Community"}, "7": {"path": "/README.md:114-128", "hash": "9d2d95455097d5c40df8ac0268642205", "title": "RWKV vs GPT2: Performance, Locality & Training"}, "8": {"path": "/README.md:128-141", "hash": "f70e538fedde1eba292039ce68fcb73b", "title": "RWKV: Efficient Language Model on Various Hardware"}, "9": {"path": "/README.md:141-150", "hash": "48b7083aca0041553075fb89abdcae34", "title": "Adaptable RNN Model for Edge Devices"}, "10": {"path": "/README.md:150-161", "hash": "d08f8bfd48804a53d451a4322eaf98b1", "title": "RWKV-4a Improvements and Results"}, "11": {"path": "/README.md:163-184", "hash": "c072b1bb1d079fc283b3ff7a68f699f0", 
"title": "Quick Start Guide for RWKV-LM"}, "12": {"path": "/README.md:186-196", "hash": "4b814fdf295d386620aa1f4d8ffbe101", "title": "Training RWKV-4 with Script and Dataset"}, "13": {"path": "/README.md:198-218", "hash": "06a8ad4604afc6c44aae48d6975e00d0", "title": "RWKV-4 Pile Fine-Tuning Guide"}, "14": {"path": "/README.md:220-237", "hash": "acc7dd80234efc86b2eb22fe57448aa4", "title": "RWKV Embeddings: Statistic Collection & Classifier Training"}, "15": {"path": "/README.md:238-245", "hash": "61e843ea1015c8ab1235b8e0bbfdb749", "title": "Matrix Equation with Three Rows and Variable Weights"}, "16": {"path": "/README.md:246-265", "hash": "30d0f7e421dc9db25ae2a367d7629d49", "title": "Matrix Vector Calculation Algorithm"}, "17": {"path": "/README.md:266-284", "hash": "6344b4449422fe7f7ab5483c9ca4c678", "title": "Learnable Parameters for RWKV-6 Model Mixing"}, "18": {"path": "/README.md:285-306", "hash": "ad226f52e607898d4a26b66e3e6c2d7c", "title": "Time-Dependent Mixing and Shifting in RWKV-7"}, "19": {"path": "/README.md:306-325", "hash": "e7fd21b41f21ab9cb4358deea62e4bc4", "title": "Exploring Decay, Lie Groups, and Optimization Techniques in Transformers"}, "20": {"path": "/README.md:326-342", "hash": "eae34ef92d0403cefcb3f92d4888c866", "title": "Image-Language Model: Position Embedding and Token Shift"}, "21": {"path": "/README.md:344-375", "hash": "eeb355f9dc8ab06a59f8af9426a2495b", "title": "Improved Tokenization via Hardcoded Channels"}, "22": {"path": "/README.md:377-401", "hash": "99fa1ceeba165678df95d005ef5ccf45", "title": "Enhancing RWKV Initial States for Model Discovery"}, "23": {"path": "/README.md:401-413", "hash": "9b4d05887bcf4e0cebc150baf1e59257", "title": "Transformer Performance Optimizations"}, "24": {"path": "/README.md:415-441", "hash": "551c586e0228ba2399de4b50ece1c4c6", "title": "RWKV-v2-RNN Architecture: Time-Decay Curve Improvements"}, "25": {"path": "/README.md:442-469", "hash": "a7a2c06ffe7dd570aa946a13cd4b8026", "title": "RWKV-3 GPT 
Model: LN After Embedding"}, "26": {"path": "/README.md:471-496", "hash": "849e5395bdfe1fe9a1bacb750d13271e", "title": "Implementing ATT Mechanism in RWKV-3 Model"}, "27": {"path": "/README.md:497-528", "hash": "ee21eb93bfe8d56581357b1f8d8816f1", "title": "Time-Mixing Convolution Operations"}, "28": {"path": "/README.md:529-557", "hash": "19e89ab01d86000081e758029cfcfb0e", "title": "Efficient FFN Operation for GPT"}, "29": {"path": "/README.md:557-563", "hash": "cb03403644223d6c095629264eaf1ffb", "title": "Time-Series Prediction Formula for RWKV"}, "30": {"path": "/README.md:563-567", "hash": "a9eda783569ae0aadce1392c2df5216d", "title": "Weighted Sum Calculation in GPT"}, "31": {"path": "/README.md:567-572", "hash": "9f104d3f7c3822011d12b813479a3996", "title": "Sigmoid-weighted RWKV Contribution Formula"}, "32": {"path": "/README.md:572-578", "hash": "501c60a87d0549a78e518ce250c1308a", "title": "Time-Decay Exponential RNN Formula"}, "33": {"path": "/README.md:578-582", "hash": "cf381cc4c025e5fe305bf4afdd42a993", "title": "Sigmoid-Exponentiated Matrix Calculation"}, "34": {"path": "/README.md:582-596", "hash": "307c8f7c2b8765a49bb8fb3b0b0cf970", "title": "Matrix-Exponential-Based RWKV"}, "35": {"path": "/README.md:598-609", "hash": "02c4c665293dbf5d918810464a71425a", "title": "Efficient Image Processing with LM Loss"}, "36": {"path": "/README.md:610-627", "hash": "290b4b8873397558e70cfb8c46ef9c86", "title": "Prime Sampling for Deterministic Randomness"}, "37": {"path": "/README.md:629-645", "hash": "2ad6a3a822b9f44d1cec815a46f17db2", "title": "Efficient Learning Rate Scheduling with Top-p Sampling"}, "38": {"path": "/README.md:647-656", "hash": "886ba3bdef881550c4eebb278583f6d6", "title": "RWKV v1 Architecture: Time-Mix and Channel-Mix Layers"}, "39": {"path": "/README.md:656-668", "hash": "e9adba1ec9d4be74cdd9a14076fee6b5", "title": "Calculating RWKV's TM and CM Components"}, "40": {"path": "/README.md:669-685", "hash": "ec934bc682f084128fb0581e8876ca8d", "title": 
"Fast & Stable Convergence with Zero Initialization"}, "41": {"path": "/README.md:687-711", "hash": "0a7c79addef3c008f3a10737a99f4f30", "title": "Token-Shift Enhancement for LMs"}, "42": {"path": "/README.md:711-724", "hash": "e0477a43e9e20b6683a68bb7063c8024", "title": "Head-QK Trick in Transformer Model"}, "43": {"path": "/README.md:725-754", "hash": "5bc8c0d8d87457d50e50092a34a2b5b7", "title": "One-hot multiplication and sampling improvement"}, "44": {"path": "/README.md:756-785", "hash": "0d6b5e12c5b343be9ad8cd6fc283726a", "title": "Attention Mechanisms: RWKV Outperforms VRAM"}, "45": {"path": "/RWKV-v1/src/model.py", "hash": "b48fe23dc79fc204c54f040a06275928", "title": "Rotary Transformer-Based RWKV Model Initialization"}, "46": {"path": "/RWKV-v1/src/model.py:1-22", "hash": "ec0a89effdcb74cc908580c9d46991e4", "title": "Fancy Initialization for Linear and Embedding Layers"}, "47": {"path": "/RWKV-v1/src/model.py:23-46", "hash": "365ba8a56c25eee1ad6e3f1c7a025c5f", "title": "Neural Network Gain and Scale Initialization"}, "48": {"path": "/RWKV-v1/src/model.py:48-70", "hash": "05991257d06b755c4be862fe2949a353", "title": "RWKV TimeMix Initialization"}, "49": {"path": "/RWKV-v1/src/model.py:71-94", "hash": "b74a7d1d45b0725968979186cf3a429c", "title": "Multi-Head Attention Layer Initialization"}, "50": {"path": "/RWKV-v1/src/model.py:96-125", "hash": "000f349a364ef0114951c90babec94bb", "title": "Forward Pass: Element-wise Operations and Layer Normalization"}, "51": {"path": "/RWKV-v1/src/model.py:127-158", "hash": "e280c971581d3c4706e7635482d73f62", "title": "Attention Mechanisms for RWKV Model"}, "52": {"path": "/RWKV-v1/src/model.py:159-180", "hash": "35c7f5320bb5219df882c8af5b017216", "title": "RWKV-v1 Attention Model Implementation"}, "53": {"path": "/RWKV-v1/src/model.py:181-203", "hash": "9ceea8a5bb1a0a0a22c8f4fcf52ce4be", "title": "Multi-Head Attention with Rotary Embedding"}, "54": {"path": "/RWKV-v1/src/model.py:204-230", "hash": 
"8b86de2219807e48cc05d1219fbed1b4", "title": "Rotary Multi-Head Attention in RWKV"}, "55": {"path": "/RWKV-v1/src/model.py:232-253", "hash": "80e56a8366bbbe62adbae9160791fec1", "title": "Initializing Transformer Model"}, "56": {"path": "/RWKV-v1/src/model.py:254-270", "hash": "73d9796248e98f16e65d89a6f6b9aacf", "title": "Rotary Attention Encoding"}, "57": {"path": "/RWKV-v1/src/model.py:271-299", "hash": "930efd54da331b55eda9816e3f1a29a5", "title": "Multi-Head Attention Module Definition"}, "58": {"path": "/RWKV-v1/src/model.py:300-320", "hash": "1108b363b0c36130bf1ea5fabc986844", "title": "Transformer Attention Mechanism Initialization"}, "59": {"path": "/RWKV-v1/src/model.py:322-339", "hash": "2a479ab940f13d1fbb1168345ca0924b", "title": "Attention Model in Transformer Architecture"}, "60": {"path": "/RWKV-v1/src/model.py:340-352", "hash": "6f2f068fc0d471c36fec8d56ffa6162c", "title": "Rotary Positional Encoding Attention"}, "61": {"path": "/RWKV-v1/src/model.py:354-385", "hash": "2b11799f49fbfb8152edcc481f01fed6", "title": "GPT Normalization Layers: RMSNorm and FixedNorm"}, "62": {"path": "/RWKV-v1/src/model.py:386-414", "hash": "0d3d4d618f3939f2dcecc05006d07b36", "title": "Dynamic Transformer Blocks for RWKV-v1"}, "63": {"path": "/RWKV-v1/src/model.py:415-445", "hash": "65bcc7f513c5e97378e03070ea3c853e", "title": "RWKV Model Initialization"}, "64": {"path": "/RWKV-v1/src/model.py:446-472", "hash": "b56aadabc2a0e8faa58fc507705598b2", "title": "RWKV Model Initialization"}, "65": {"path": "/RWKV-v1/src/model.py:473-489", "hash": "253a90c904f7626382857b31f32f5625", "title": "Organizing Model Parameters for Weight Decay"}, "66": {"path": "/RWKV-v1/src/model.py:490-517", "hash": "366cf6f73cae201f4295f0c4fbc01d06", "title": "Attention Mechanism in RWKV Model"}, "67": {"path": "/RWKV-v1/src/trainer.py", "hash": "45905cd48cc77fd705c01ff23e2c00ca", "title": "Comprehensive Trainer for RWKV-v1"}, "68": {"path": "/RWKV-v1/src/trainer.py:1-35", "hash": 
"46762d1169e97822a45dd63dd665d2c3", "title": "Trainer Class Initialization"}, "69": {"path": "/RWKV-v1/src/trainer.py:36-57", "hash": "40423e66945c50898b2a64f61f9e923d", "title": "Trainer Class Initialization"}, "70": {"path": "/RWKV-v1/src/trainer.py:58-81", "hash": "5b74a2c7599a94745152dbe4f443f9b7", "title": "Training Model with Optimizer and Calculating Average Loss"}, "71": {"path": "/RWKV-v1/src/trainer.py:83-99", "hash": "c9b846f18539b66d552282ee3c2ad7c0", "title": "Trainer: Backpropagation and Learning Rate Decay"}, "72": {"path": "/RWKV-v1/src/trainer.py:100-119", "hash": "c01234fbe7d90bb46e2cd692f99b9168", "title": "Dynamic Learning Rate Adjustment"}, "73": {"path": "/RWKV-v1/src/trainer.py:120-130", "hash": "856f4989c57c7f3ea2baf022d62c5be1", "title": "Progress Bar Initialization and Training Epochs"}, "74": {"path": "/RWKV-v1/src/utils.py", "hash": "6a02d00db51229f695b24fdd2cf91f34", "title": "Text Generation with Top-k, Top-p, and Temperature Scaling"}, "75": {"path": "/RWKV-v1/src/utils.py:1-32", "hash": "ca4ea90121d862c5089159b29e92a3f8", "title": "Top-k/p Sampling Functions"}, "76": {"path": "/RWKV-v1/src/utils.py:33-50", "hash": "58b20d1723005627778255b7caec3f02", "title": "Model Sampling with Logits and Top-K/P"}, "77": {"path": "/RWKV-v1/train.py", "hash": "713cde0a611c6430804a6ed26daa6980", "title": "Training RWKV-v1 Language Model"}, "78": {"path": "/RWKV-v1/train.py:1-21", "hash": "31d9b5b16f30c77d1fc405f8744e0839", "title": "RWKV Model Setup & Configuration"}, "79": {"path": "/RWKV-v1/train.py:23-43", "hash": "a45a51ceea4e256656499e5550779316", "title": "RWKV-v1 Training Settings"}, "80": {"path": "/RWKV-v1/train.py:45-68", "hash": "a9666fa19c2332db145c27012804e063", "title": "RWKV-v1 Hyperparameter Setup"}, "81": {"path": "/RWKV-v1/train.py:69-87", "hash": "8986ef2d8712a91c6078825e10c6f2d4", "title": "Initialize RWKV Model Parameters"}, "82": {"path": "/RWKV-v1/train.py:88-117", "hash": "d8367d0d7fef0b224e1751e52ab42016", "title": "Train 
RWKV-LM: Vocab JSON and Dictionaries"}, "83": {"path": "/RWKV-v1/train.py:118-135", "hash": "f34b2ca2e747fffb8f454351fa27e4b8", "title": "GPT Model Initialization and Training"}, "84": {"path": "/RWKV-v1/train.py:136-142", "hash": "63cbffd54137ed6e16a07d2abcc1ce19", "title": "Trainer Initialization and Training Savings"}, "85": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu", "hash": "b9b23670c3f9b21a2ffa7eafb6606283", "title": "CUDA-Optimized RWKV-v2 RNN Dot Products"}, "86": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu:1-33", "hash": "79e81e08c51ab3f6d4d442052ae0617b", "title": "CUDA-Optimized RWKV-v2-RNN Forward Pass"}, "87": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu:34-63", "hash": "54c61921e4cf6143aa1b1bb5b93cb42f", "title": "CUDA Dot Product Update Kernel"}, "88": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu:64-93", "hash": "7ea45eb1ff4392c41c86eae2daae4993", "title": "CUDA RNN Forward Pass Calculation"}, "89": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu:94-130", "hash": "4d3f91854bd36b6144a6b97006c55f51", "title": "Shared Memory Optimized RNN CUDA Calculation"}, "90": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu:131-163", "hash": "68996f23889e4635b85e5d732254c59d", "title": "Matrix Variable Updating with Unroll Optimization"}, "91": {"path": "/RWKV-v2-RNN/cuda/timex_cuda.cu:164-172", "hash": "ea6832a9b006e6e119c1df269f633cd5", "title": "CUDA Time Step Calculation"}, "92": {"path": "/RWKV-v2-RNN/cuda/timex_op.cpp", "hash": "6ba70b0ebd38e2ac06bc4e5d1df55b51", "title": "CUDA Kernel for Timex Operations in PyTorch"}, "93": {"path": "/RWKV-v2-RNN/cuda/timex_op.cpp:1-20", "hash": "b9e9b57702814926c91dc41b85d55d55", "title": "CUDA-Accelerated TimeX Operations for PyTorch"}, "94": {"path": "/RWKV-v2-RNN/cuda/timex_op.cpp:21-21", "hash": "9290d056c93b36c6e6cd61ca59948350", "title": "Closing Brace in Function/Class Definitions"}, "95": {"path": "/RWKV-v2-RNN/run.py", "hash": "0b483420e3f9c2e80e4cf838413760e4", "title": "RWKV Model Training Script"}, "96": {"path": 
"/RWKV-v2-RNN/run.py:1-31", "hash": "b1a7baac478d94108209f9c8641a14a4", "title": "RWKV-v2 RNN Initialization"}, "97": {"path": "/RWKV-v2-RNN/run.py:32-58", "hash": "2b9cc5d4fc73a0eb7779a10e798490d9", "title": "Initializing RWKV Model Parameters"}, "98": {"path": "/RWKV-v2-RNN/run.py:59-89", "hash": "c999f80e7dcd2bbb43f4456c977ebb5e", "title": "Evaluating Model on Dataset with Random Samples"}, "99": {"path": "/RWKV-v2-RNN/run.py:91-120", "hash": "92ab459c3dd59d3ae56b301cb8923171", "title": "RWKV-v2 RNN: First Run and Saving/Loading States"}, "100": {"path": "/RWKV-v2-RNN/run.py:121-133", "hash": "7121d4731065c7e621546cd58f9393e3", "title": "Model Load or Train and Evaluate Script"}, "101": {"path": "/RWKV-v2-RNN/src/model.py", "hash": "ca539d308097c4df0f1e01c6b5238eb4", "title": "RWKV v2-RNN Optimization with GPT"}, "102": {"path": "/RWKV-v2-RNN/src/model.py:1-23", "hash": "8d4cffca76a2731223b7087f8f1e9d93", "title": "RWKV v2-RNN Model Setup"}, "103": {"path": "/RWKV-v2-RNN/src/model.py:23-47", "hash": "ed04bab3a5b8ede26054a180865639d9", "title": "TimeX CUDA Implementation"}, "104": {"path": "/RWKV-v2-RNN/src/model.py:48-73", "hash": "97611ae847d7def4be2f5227beba9aeb", "title": "RWKV Model Initialization"}, "105": {"path": "/RWKV-v2-RNN/src/model.py:74-101", "hash": "cc79240c20920b5c732b2b527747799d", "title": "Dynamic Layer Initialization in RWKV-v2 RNN"}, "106": {"path": "/RWKV-v2-RNN/src/model.py:102-129", "hash": "edf6f29b91729f650df8746382672755", "title": "RWKV TimeMix Module Initialization"}, "107": {"path": "/RWKV-v2-RNN/src/model.py:130-147", "hash": "6effa5b2a2e381c5ccc685e2f8d77cf9", "title": "Time-Decay Transformer Initialization"}, "108": {"path": "/RWKV-v2-RNN/src/model.py:148-177", "hash": "16f06c81732e9a6a605789a0d8016210", "title": "RWKV-v2 Model Initialization"}, "109": {"path": "/RWKV-v2-RNN/src/model.py:178-207", "hash": "3cde2b4438c4092f09cd40c748c7f6fe", "title": "RWKV Channel Mix Module"}, "110": {"path": "/RWKV-v2-RNN/src/model.py:208-242", 
"hash": "5f366c45a602d61b6a5b8f3bff9e70ef", "title": "Defining GPT Model with RWKV Blocks"}, "111": {"path": "/RWKV-v2-RNN/src/model.py:243-275", "hash": "37b79d698fc2f872af62c1ed74711d7c", "title": "GPT Model: RWKV-ffnPre Implementation"}, "112": {"path": "/RWKV-v2-RNN/src/model.py:276-304", "hash": "0d9f9e9cad455253c2b40b641edf13bd", "title": "RWKV Model Configuration and Optimization"}, "113": {"path": "/RWKV-v2-RNN/src/model.py:305-329", "hash": "c0c1e769ddf2c943d48d3be3febacad7", "title": "Optimizing Model Parameters with Weight Decay"}, "114": {"path": "/RWKV-v2-RNN/src/model.py:330-349", "hash": "3436fcb9aa53187e23dbd2e7e1cf6f59", "title": "Model Forward Pass Code"}, "115": {"path": "/RWKV-v2-RNN/src/model_run.py", "hash": "880a07667d34c1f789c435d606d49d05", "title": "RWKV-v2 RNN Model Initialization"}, "116": {"path": "/RWKV-v2-RNN/src/model_run.py:1-37", "hash": "fb0dae8efb3944ea7eb80251e73674ba", "title": "RWKV-RNN Model Initialization"}, "117": {"path": "/RWKV-v2-RNN/src/model_run.py:38-69", "hash": "9d405c687d963d848decd27522bb9796", "title": "Nested Dictionary Storage with Clear, Save, Load Functions"}, "118": {"path": "/RWKV-v2-RNN/src/model_run.py:70-98", "hash": "b5573a9c2009e527b1adb72ab8442f7e", "title": "Neural Network Model Methods"}, "119": {"path": "/RWKV-v2-RNN/src/model_run.py:99-130", "hash": "0af3e210b0ec77a5c20a166b618154a8", "title": "RWKV Model Language Generation"}, "120": {"path": "/RWKV-v2-RNN/src/model_run.py:131-143", "hash": "1158655d5721d6fd4d79e65cc8c32c0c", "title": "Reshape and Multiply for Context"}, "121": {"path": "/RWKV-v2-RNN/src/trainer.py", "hash": "75d492cb5701b4f92d43a21253dfc1f3", "title": "Trainer Config and Training Functionality"}, "122": {"path": "/RWKV-v2-RNN/src/trainer.py:1-36", "hash": "e53840a8868b02c6be6c049f4fc5470c", "title": "TrainerConfig: Params & Efficient GPU Setup"}, "123": {"path": "/RWKV-v2-RNN/src/trainer.py:37-67", "hash": "32d5e871bf66e4fdeb50f6102ea81c99", "title": "Wandb-Assisted Trainer 
Initialization"}, "124": {"path": "/RWKV-v2-RNN/src/trainer.py:68-92", "hash": "a370b841580e36501b68498e5a2e1599", "title": "Model Training Initialization and Configuration"}, "125": {"path": "/RWKV-v2-RNN/src/trainer.py:93-117", "hash": "78d7a55c4238a1af9429d28a3c5a225c", "title": "Training DataLoader with Disabled Shuffling"}, "126": {"path": "/RWKV-v2-RNN/src/trainer.py:118-132", "hash": "309f1a59c4900f0693fde0d76ae76daa", "title": "Learning Rate Multiplier Calculation"}, "127": {"path": "/RWKV-v2-RNN/src/trainer.py:133-154", "hash": "62acd61dee85f69218ec6df3c0077088", "title": "Learning Rate Config and Logging"}, "128": {"path": "/RWKV-v2-RNN/src/trainer.py:156-170", "hash": "2431b0a2b45542decbbf29ad8112a11e", "title": "Epoch-wise Token Counter and Training"}, "129": {"path": "/RWKV-v2-RNN/src/utils.py", "hash": "1ce7852796f79ccb2338e18cb56a727f", "title": "RWKV v2-RNN Dataset Class"}, "130": {"path": "/RWKV-v2-RNN/src/utils.py:1-33", "hash": "42c2ee15b6ce19d23241ec56a22850b4", "title": "Custom Dataset Class for RWKV v2-RNN"}, "131": {"path": "/RWKV-v2-RNN/src/utils.py:34-59", "hash": "ed925c99164299f05af2b152089f7d47", "title": "RWKV Tokenizer Class"}, "132": {"path": "/RWKV-v2-RNN/src/utils.py:60-90", "hash": "32d2e5f57e96dcbf7097bf1cc71188f0", "title": "RWKV-v2 RNN Word Table Initialization"}, "133": {"path": "/RWKV-v2-RNN/src/utils.py:92-122", "hash": "e55ab2142af0e5cfd5292b1c2bf67ab1", "title": "Sample from Probabilities"}, "134": {"path": "/RWKV-v2-RNN/train.py", "hash": "b0ef758c96c818cf66927d4567d552a6", "title": "RWKV-v2 RNN/GPT Model Training and Optimization"}, "135": {"path": "/RWKV-v2-RNN/train.py:1-29", "hash": "857cc46c36e38cf1527abbed85226be2", "title": "RWKV-v2 RNN Training Script"}, "136": {"path": "/RWKV-v2-RNN/train.py:30-56", "hash": "dea972b53988925c4a2f6ec57304b80a", "title": "Training RWKV Model Parameters"}, "137": {"path": "/RWKV-v2-RNN/train.py:57-82", "hash": "b9deaf3e19fd799c745b6b246d4fb60c", "title": "GPT Model Training with 
RWKV-v2-RNN"}, "138": {"path": "/RWKV-v2-RNN/train.py:84-98", "hash": "44dc89876058c68fdb4c36e505afe46f", "title": "Train and Save RWKV-v2 Model"}, "139": {"path": "/RWKV-v3/cuda/timex_cuda.cu", "hash": "b9b23670c3f9b21a2ffa7eafb6606283", "title": "CUDA RNN Dot Product Optimization"}, "140": {"path": "/RWKV-v3/cuda/timex_op.cpp", "hash": "6ba70b0ebd38e2ac06bc4e5d1df55b51", "title": "Timex Operations CUDA Kernel"}, "141": {"path": "/RWKV-v3/run.py", "hash": "dbd4f328626af744186c143c9479d89f", "title": "RWKV-v3 Text Generation & Tokenization"}, "142": {"path": "/RWKV-v3/run.py:1-30", "hash": "e2291f9e732ab793bcaeb50c383f7ad5", "title": "RWKV Model Setup"}, "143": {"path": "/RWKV-v3/run.py:31-58", "hash": "e7674a9d654284f5a384545e664b0868", "title": "Loading and Tokenizing RWKV Model"}, "144": {"path": "/RWKV-v3/run.py:58-89", "hash": "af782446ca6ae365090eb2aa2257d890", "title": "Model Preparation and Processing"}, "145": {"path": "/RWKV-v3/run.py:90-98", "hash": "9b243ffd5f2de482894da8ffc5e76f5a", "title": "RWKV Text Generation and Timing"}, "146": {"path": "/RWKV-v3/src/model.py", "hash": "9fa86b5b3cb1b596a96df21add3e04d0", "title": "Efficient Time-based RWKV LM with GPT Model"}, "147": {"path": "/RWKV-v3/src/model.py:1-25", "hash": "72de1fb85ed4cefcb85dcd27f31a220e", "title": "RWKV Language Model: PyTorch CUDA Implementation"}, "148": {"path": "/RWKV-v3/src/model.py:27-50", "hash": "455b4366d82ba2217e08dd7d2027b301", "title": "TimeX Operation Class"}, "149": {"path": "/RWKV-v3/src/model.py:51-69", "hash": "f780f291d93714738707a2d4c96506fa", "title": "RWKV-v3 Weight Initialization"}, "150": {"path": "/RWKV-v3/src/model.py:70-98", "hash": "776cb9ddee0dc11b4cb3498f8a507573", "title": "Dynamic Weight Initialization for Neural Network Layers"}, "151": {"path": "/RWKV-v3/src/model.py:99-127", "hash": "12a7547b51794b068d0aa37bf168c990", "title": "RWKV TimeMix: Orthogonal/Normal Initialization"}, "152": {"path": "/RWKV-v3/src/model.py:128-148", "hash": 
"6bf39020d3e24ec703790d39468c66be", "title": "Attention Mechanism Initialization in RWKV-v3 Model"}, "153": {"path": "/RWKV-v3/src/model.py:150-175", "hash": "7a3951c9646b8468097a51129421cb90", "title": "Time-Mixing Attention Model Initialization"}, "154": {"path": "/RWKV-v3/src/model.py:176-204", "hash": "971f2cf99443d75714498e856ebfae01", "title": "RWKV Channel Mix Module"}, "155": {"path": "/RWKV-v3/src/model.py:205-233", "hash": "061dc85cd3bde3be7925a4abc9dc1a1f", "title": "RWKV-v3 Time-based Mixing GPT Model"}, "156": {"path": "/RWKV-v3/src/model.py:234-266", "hash": "806f091dba0f4c22bfaa74a4618923b8", "title": "RWKV Model Block Definition"}, "157": {"path": "/RWKV-v3/src/model.py:267-297", "hash": "a5cfda9eb726b5054bf8005d75d22f47", "title": "GPT Model Initialization"}, "158": {"path": "/RWKV-v3/src/model.py:299-324", "hash": "1e7140801540ecec6a13be3bd91aa7d9", "title": "Weight Decay and Optimizer Configuring in Model"}, "159": {"path": "/RWKV-v3/src/model.py:325-354", "hash": "e1a2a335217c928b82fb77862e39396c", "title": "RWKV Model and Optimization"}, "160": {"path": "/RWKV-v3/src/model.py:355-363", "hash": "b2c027107f35cb4895d540b5e6ac71ce", "title": "Model Output and Loss Calculation"}, "161": {"path": "/RWKV-v3/src/model_run.py", "hash": "cd9981b4f706de47777695be84237de9", "title": "RWKV-v3 Transformer Model Initialization"}, "162": {"path": "/RWKV-v3/src/model_run.py:1-30", "hash": "64bf6499f0c67f9c11f0e4363a32e34e", "title": "RWKV Channel Mixing Module"}, "163": {"path": "/RWKV-v3/src/model_run.py:32-57", "hash": "d0a7ea11287916ed67338499af3c44c4", "title": "RWKV-v3 Model: Forward and TimeMix Initialization"}, "164": {"path": "/RWKV-v3/src/model_run.py:58-85", "hash": "06e288b1692d6ede6fed2f8677adbd6d", "title": "Transformer Model Initialization and Forward Pass"}, "165": {"path": "/RWKV-v3/src/model_run.py:85-115", "hash": "f8dde90299c5c3c07f6f822e90fbf2e2", "title": "Transformer Block with Time-Mix and Channel-Mix"}, "166": {"path": 
"/RWKV-v3/src/model_run.py:116-146", "hash": "86e23baf496867012bb4329e7fbf14e4", "title": "RWKV-GPT Model Architecture"}, "167": {"path": "/RWKV-v3/src/model_run.py:147-177", "hash": "cc90922e7eebbccca789734c13e31e92", "title": "RWKV-v3 Model Initialization"}, "168": {"path": "/RWKV-v3/src/model_run.py:179-207", "hash": "505a2ef874b7fe36ef7c7d2c53deba93", "title": "Loading and Initializing RWKV_RNN Model"}, "169": {"path": "/RWKV-v3/src/model_run.py:208-238", "hash": "ff9924e20003086c3654c696d5047781", "title": "Hierarchical Data Storage Class"}, "170": {"path": "/RWKV-v3/src/model_run.py:239-264", "hash": "6f2710fa2bad896f1ae51978781c5c67", "title": "Functions for RWKV-v3 Model"}, "171": {"path": "/RWKV-v3/src/model_run.py:265-294", "hash": "1bd358ecc1c3b1b80e5205332fd41314", "title": "Layered LN, SA, FF Model Run"}, "172": {"path": "/RWKV-v3/src/model_run.py:296-319", "hash": "39ab066a1bd5eacf67f3c6143fd455ca", "title": "Context-Aware Attention Calculations"}, "173": {"path": "/RWKV-v3/src/trainer.py", "hash": "6e658e0cb994abc67ba33c3924a09fdd", "title": "Trainer: CUDA Optimizations and Data Handling"}, "174": {"path": "/RWKV-v3/src/trainer.py:1-36", "hash": "87dc63f30679a5c2730c47f02e00aed8", "title": "Training Language Model with Trainer Config"}, "175": {"path": "/RWKV-v3/src/trainer.py:37-67", "hash": "892ca0cc67eae42692e83f59535963d2", "title": "Trainer Class: WandB, Learning Rate Decay"}, "176": {"path": "/RWKV-v3/src/trainer.py:68-92", "hash": "68d2b15db0d6dda4caf971e7e784c473", "title": "CUDA-Powered Model Trainer Initialization"}, "177": {"path": "/RWKV-v3/src/trainer.py:93-117", "hash": "c0c92cdf33009ec152dc60ed04c9fa26", "title": "DataLoader Iteration and Training Process"}, "178": {"path": "/RWKV-v3/src/trainer.py:118-133", "hash": "2656b825330421be3ebcbc0a0348a461", "title": "Warmup and Exponential Learning Rate Calculation"}, "179": {"path": "/RWKV-v3/src/trainer.py:134-155", "hash": "1529f78592fa5c15f3bff9e799b58b78", "title": "Dynamic Learning 
Rate Adjustment"}, "180": {"path": "/RWKV-v3/src/trainer.py:157-171", "hash": "1b5ebfc5af621aa6d9ba31583dda0549", "title": "Train and Save RWKV-v3 Model with Decaying Learning Rate"}, "181": {"path": "/RWKV-v3/src/utils.py", "hash": "969ae4f2bef156a6557db5c3ff231e01", "title": "RWKV Dataset and Tokenizer"}, "182": {"path": "/RWKV-v3/src/utils.py:1-34", "hash": "665d9c8e2d69c4551707af5f0ad6e4b7", "title": "Custom Dataset Class for RWKV Model"}, "183": {"path": "/RWKV-v3/src/utils.py:35-62", "hash": "03a44adfee87fcf707dd877a2d12a5c4", "title": "Tokenizer: Converting Text to Numerical Reps"}, "184": {"path": "/RWKV-v3/src/utils.py:64-95", "hash": "da5e781bf57d49d4f54ab9c8cc6159d3", "title": "Refine Context and Calculate Probs"}, "185": {"path": "/RWKV-v3/src/utils.py:96-122", "hash": "fdafaded7f67b8f3cec5588f6ff5be35", "title": "Top-P Sampling in RWKV-LM Utils"}, "186": {"path": "/RWKV-v3/train.py", "hash": "ee9d90e774f220c037d8b34977d057ea", "title": "Training RWKV-LM and GPT Models Efficiently"}, "187": {"path": "/RWKV-v3/train.py:1-22", "hash": "9e11bb84c876648b7cf771f912122e48", "title": "Training RWKV-LM with GPT Model"}, "188": {"path": "/RWKV-v3/train.py:23-48", "hash": "50ab92be8286a4728b60940ed22d1c34", "title": "Optimizing RWKV-v3 Training Parameters"}, "189": {"path": "/RWKV-v3/train.py:50-65", "hash": "5635e26a763f993565541e26d2b2299c", "title": "Optimizing RWKV-v3 Training Parameters"}, "190": {"path": "/RWKV-v3/train.py:67-96", "hash": "c8d7bc4980787cf242ac6e458e0297be", "title": "RWKV Language Model Training Parameters"}, "191": {"path": "/RWKV-v3/train.py:97-112", "hash": "826909d707f1d72f5101a962c53af626", "title": "GPT Model Training with Hyperparameters"}, "192": {"path": "/RWKV-v3/train.py:113-118", "hash": "98f43ffec904dc8eadde09b0329b7dc0", "title": "Saving Model State with Timestamp"}, "193": {"path": "/RWKV-v3/verify.py", "hash": "bbf963c1b40c52168350b6fe5e1a1ea8", "title": "RWKV-GPT Verification Script"}, "194": {"path": 
"/RWKV-v3/verify.py:1-31", "hash": "bcf21b4a4ee951d0d7e8b3e572361ce8", "title": "RWKV-LM Verification with GPT Architecture"}, "195": {"path": "/RWKV-v3/verify.py:32-61", "hash": "1c8e033754d1ef416c5c7a0b12b4a400", "title": "RWKV-GPT & RNN Model Outputs from Checkpoint"}, "196": {"path": "/RWKV-v3/verify.py:62-65", "hash": "6327db2ac34a094b42c84c30a13acbf5", "title": "Padding and Training Model with Detach and Print"}, "197": {"path": "/RWKV-v4/cuda/wkv_cuda.cu", "hash": "2ed69020d792275be2b23ee84c2536e3", "title": "Efficient RWKV-v4 CUDA Kernel with Optimized Execution"}, "198": {"path": "/RWKV-v4/cuda/wkv_cuda.cu:1-29", "hash": "0dd888793a9f4f8de001f8ab0e40ee48", "title": "CUDA RNN Forward Pass Kernel Function"}, "199": {"path": "/RWKV-v4/cuda/wkv_cuda.cu:31-56", "hash": "90b70954c814ea4df1c4c7876361ddc4", "title": "Backward Propagation Kernel"}, "200": {"path": "/RWKV-v4/cuda/wkv_cuda.cu:58-97", "hash": "7fe2dd340bea96aa88925f3ebba800dc", "title": "Calculating Gradients for WKV Model"}, "201": {"path": "/RWKV-v4/cuda/wkv_cuda.cu:99-121", "hash": "8bf510261a7e673ef0af411bda386b46", "title": "CUDA Matrix Operations with RWKV-v4"}, "202": {"path": "/RWKV-v4/cuda/wkv_cuda.cu:122-125", "hash": "d1e66f3c8336e595b2241f26c625e9fd", "title": "Efficient GPU Kernel Execution"}, "203": {"path": "/RWKV-v4/cuda/wkv_op.cpp", "hash": "e274a6e3b477300222307de60ac74fc6", "title": "RWKV-v4 GPU WKV Operations"}, "204": {"path": "/RWKV-v4/cuda/wkv_op.cpp:1-14", "hash": "7f53f9273c0565f4f144c14919c42292", "title": "WaveGrad RWKV Model C++ Functions"}, "205": {"path": "/RWKV-v4/cuda/wkv_op.cpp:15-21", "hash": "3c8b46de8107e683f3109e10a4b34a05", "title": "Wavelet Quantized Variational Kalman Filter Implementation"}, "206": {"path": "/RWKV-v4/run.py", "hash": "10fff2de9d519877297c38d3053c0f57", "title": "RWKV Language Model Text Generation"}, "207": {"path": "/RWKV-v4/run.py:1-26", "hash": "aa3f3be0df9ce8208d41ed062d94ff53", "title": "RWKV Model Setup and Configuration"}, "208": 
{"path": "/RWKV-v4/run.py:28-60", "hash": "74a20ddff11207b817629d169b1ddf13", "title": "Model Parameter Setter"}, "209": {"path": "/RWKV-v4/run.py:61-87", "hash": "8965f153c393903a76fd0b0fe24092f1", "title": "RWKV Model Setup and Inference"}, "210": {"path": "/RWKV-v4/run.py:87-109", "hash": "c2eef5b717d304189243d571ecb5808c", "title": "RWKV Tokenizer & Processing Efficiency"}, "211": {"path": "/RWKV-v4/run.py:110-142", "hash": "6517219a00af38a8ee81639fdec02bee", "title": "RWKV-v4 Text Generation and Model Saving"}, "212": {"path": "/RWKV-v4/run.py:143-149", "hash": "6f274f4a12cf61ba4047cad017ba2417", "title": "Tokenizing and Timing"}, "213": {"path": "/RWKV-v4/src/binidx.py", "hash": "ec12e41bb362274096e9fda7e1a98bee", "title": "Binary Indexed Dataset Class"}, "214": {"path": "/RWKV-v4/src/binidx.py:1-48", "hash": "c04f7063ca51f57c5afd3269adb0f26e", "title": "Binary Indexing in RWKV-v4"}, "215": {"path": "/RWKV-v4/src/binidx.py:49-72", "hash": "c188645df91f92d5a4c0755e219a0719", "title": "Index File Reader and Validator"}, "216": {"path": "/RWKV-v4/src/binidx.py:73-98", "hash": "48207b8ecdeb0e6f0d26dd9bc5557eb4", "title": "Binary File Loading with Numpy Memmap"}, "217": {"path": "/RWKV-v4/src/binidx.py:100-141", "hash": "7d25cb623c75c1ee4f3252f4c0f32f43", "title": "Binary Data File Index Class"}, "218": {"path": "/RWKV-v4/src/binidx.py:142-170", "hash": "4bdb0c19d8cbafa21195740997facd39", "title": "Numpy Buffer Memory View and Indexing"}, "219": {"path": "/RWKV-v4/src/binidx.py:171-203", "hash": "188e693482303ee8ecc34355490c69df", "title": "Binary Indexing Functions"}, "220": {"path": "/RWKV-v4/src/binidx.py:205-216", "hash": "23690ca1da27870918044cff6bfd053b", "title": "Document Index and Data File Existence Check"}, "221": {"path": "/RWKV-v4/src/model.py", "hash": "eaa5cb06f49b0063db40f87b8718ba40", "title": "Time-Mixing Channel Model for RWKV v4"}, "222": {"path": "/RWKV-v4/src/model.py:1-32", "hash": "0b9e95d2d3c75a00daca92ff49ebe176", "title": "L2Wrap Loss 
Calculation"}, "223": {"path": "/RWKV-v4/src/model.py:33-55", "hash": "1031e261b4a78d5b448a580b81c7e524", "title": "WKV Class and Computations"}, "224": {"path": "/RWKV-v4/src/model.py:56-84", "hash": "c2a0d9ec8c93497e6fbef30a946ea405", "title": "Tensor Mode Initialization and Backward Pass"}, "225": {"path": "/RWKV-v4/src/model.py:85-103", "hash": "d8da46c1cdfbda7549079d94dfe253f0", "title": "CUDA Backward Pass for RWKV Model"}, "226": {"path": "/RWKV-v4/src/model.py:104-126", "hash": "4ab62c59ee818de7a2ed844554d7333a", "title": "Fancy Initialization for Model Layers"}, "227": {"path": "/RWKV-v4/src/model.py:127-156", "hash": "a499d5dc8da5a8815ecc7b28a02c16e2", "title": "Efficient Weight Matrix Initialization for RWKV Models"}, "228": {"path": "/RWKV-v4/src/model.py:157-184", "hash": "0c2b45b304b1ac8c76bd2f6aef9cbf5e", "title": "RWKV TimeMix Layer Initialization"}, "229": {"path": "/RWKV-v4/src/model.py:185-206", "hash": "95ae25fded0c5d88679eeb892a8a989b", "title": "Attention Layer Initialization in Transformer Model"}, "230": {"path": "/RWKV-v4/src/model.py:208-242", "hash": "abde3c3a2ceba0b22e594439359ffdbe", "title": "Time-Shifting Channel Mixing Model"}, "231": {"path": "/RWKV-v4/src/model.py:243-270", "hash": "2b2a410f0a2ab48e5e448a02bb4cbd8e", "title": "Time-Mixing Transformer Layer Initialization"}, "232": {"path": "/RWKV-v4/src/model.py:271-303", "hash": "016e86404ca5681a93ad2fb194249eae", "title": "Customizable GPT Model with Blocks"}, "233": {"path": "/RWKV-v4/src/model.py:305-333", "hash": "6bf9e46158ee7ba1b2b7d74f9b012674", "title": "GPT Model Class with Layer Normalization"}, "234": {"path": "/RWKV-v4/src/model.py:334-364", "hash": "0fee6d895dd2e596e7f7476e25eec915", "title": "Model Initialization and Configuration"}, "235": {"path": "/RWKV-v4/src/model.py:365-387", "hash": "0cd16c99fc2cf979ff8eb504ca4878c4", "title": "Optimizer Initialization for Models"}, "236": {"path": "/RWKV-v4/src/model.py:389-414", "hash": "713d586a28f388add927d160e0b8d25a", 
"title": "Multi-Head Attention with Cross-Entropy Loss"}, "237": {"path": "/RWKV-v4/src/model_run.py", "hash": "b91c918a7d1be2cbb5da4b28ae5d0350", "title": "Efficient Transformer Model Execution with RWKV-v4"}, "238": {"path": "/RWKV-v4/src/model_run.py:1-25", "hash": "b5e9a5100995e385aed67fd3abe6d545", "title": "RWKV Language Model: CUDA-Friendly NLP"}, "239": {"path": "/RWKV-v4/src/model_run.py:26-48", "hash": "aa2505fe5dc78b2fd47837e2645140e2", "title": "RWKV Model Initialization"}, "240": {"path": "/RWKV-v4/src/model_run.py:49-72", "hash": "ba1031e583c5a58620c61f9d4d5a86b0", "title": "Forward and Backward Functions for RWKV Model"}, "241": {"path": "/RWKV-v4/src/model_run.py:73-96", "hash": "569effdebfc0a5d258ae1cf40f48c510", "title": "CUDA-powered RWKV Model"}, "242": {"path": "/RWKV-v4/src/model_run.py:98-124", "hash": "a0b534195c94dc0a8217ff36716f705f", "title": "RWKV TimeMix: Time Decay & Mixing Model"}, "243": {"path": "/RWKV-v4/src/model_run.py:125-155", "hash": "fd6651f8d58e53e6c70fd45152dfb1a2", "title": "Time-Based Mixing Layer in RWKV Model"}, "244": {"path": "/RWKV-v4/src/model_run.py:156-186", "hash": "e472b392a3348d1c450193322481b1dc", "title": "RWKV-GPT Model Initialization and Forward Pass"}, "245": {"path": "/RWKV-v4/src/model_run.py:187-217", "hash": "d1bf53220251bd75f97458ba762adb37", "title": "RWKV Model Initialization and Forward Pass"}, "246": {"path": "/RWKV-v4/src/model_run.py:219-242", "hash": "f12a5b7881d9fb47b802b8bea2c2c92a", "title": "Head-Multihead Attention Model Initialization"}, "247": {"path": "/RWKV-v4/src/model_run.py:243-272", "hash": "a0e3abf5821be9bd850e45fccaa7dd74", "title": "RWKV-v4 Model Weights Loading and Processing"}, "248": {"path": "/RWKV-v4/src/model_run.py:273-306", "hash": "55be87ce4ec48dae47dd9169af8b2729", "title": "Dynamic Variable Class with LN and FF Operations"}, "249": {"path": "/RWKV-v4/src/model_run.py:307-330", "hash": "f823c908d08f9d15fcf4ac5774896b18", "title": "Attention Mechanism in Transformer 
Models"}, "250": {"path": "/RWKV-v4/src/model_run.py:332-366", "hash": "0917777d064d65352fa1ca1052ed2b1a", "title": "RWKV Model Function: LN-FFN-SA Layers"}, "251": {"path": "/RWKV-v4/src/model_run.py:367-392", "hash": "e5a62b156789a3cd9980ab07ae2f359b", "title": "Feed-Forward Network and Layer Normalization in RWKV-v4"}, "252": {"path": "/RWKV-v4/src/trainer.py", "hash": "687e6e7fe9bb537d89a85d44a126c369", "title": "RWKV Trainer: Training and Evaluation"}, "253": {"path": "/RWKV-v4/src/trainer.py:1-34", "hash": "1227338bcda6fc7f2a71f5c296a7fb7a", "title": "RWKV Trainer Configuration"}, "254": {"path": "/RWKV-v4/src/trainer.py:35-63", "hash": "157eeccca64291118af9ab8c14d3bf89", "title": "Trainer Class and Model Training"}, "255": {"path": "/RWKV-v4/src/trainer.py:64-87", "hash": "a12b37c06a25c576217d894644334b87", "title": "Loading Model and Initializing Logging"}, "256": {"path": "/RWKV-v4/src/trainer.py:89-109", "hash": "0c31ba4271f1d8699cddce3c4367ffdc", "title": "Configure and Load Data for Training or Testing"}, "257": {"path": "/RWKV-v4/src/trainer.py:111-136", "hash": "de6283d0f935267056eb7d08107071fe", "title": "RWKV Model Training with Dataloader"}, "258": {"path": "/RWKV-v4/src/trainer.py:137-156", "hash": "2675e4a5613af8e2715f3cd119f611f3", "title": "RWKV Learning Rate Calculation"}, "259": {"path": "/RWKV-v4/src/trainer.py:157-179", "hash": "0054f17a4ee1acfc77c30f0375283fac", "title": "Average Loss Calculation and Updates"}, "260": {"path": "/RWKV-v4/src/trainer.py:181-187", "hash": "8dc6b612226e174d68d98cf6fbd0b3bc", "title": "Epoch-based Model Saving and Logging"}, "261": {"path": "/RWKV-v4/src/utils.py", "hash": "f07d4ad3a0bf0f148bbf7c018a28b0d9", "title": "RWKV Dataset Generation"}, "262": {"path": "/RWKV-v4/src/utils.py:1-29", "hash": "7470669604ba4f37ab8295e626d88f34", "title": "Initializing Dataset in RWKV-v4"}, "263": {"path": "/RWKV-v4/src/utils.py:30-53", "hash": "6b52536e4b4ab906560f89fcdd3b6937", "title": "Vocab Size Setter"}, "264": 
{"path": "/RWKV-v4/src/utils.py:55-81", "hash": "627c2c3d170c7369684ad5065d09e9d0", "title": "RWKV Tokenizer Class"}, "265": {"path": "/RWKV-v4/src/utils.py:82-106", "hash": "a9d0403d5adbb5691a1c923ce6649372", "title": "Tokenizer and Word Table Initialization"}, "266": {"path": "/RWKV-v4/src/utils.py:108-137", "hash": "79c184022b40696b07cf2b01c37e8d9d", "title": "Softmax Sampling with Top-P Control"}, "267": {"path": "/RWKV-v4/src/utils.py:139-153", "hash": "1c7411b555363b5cc882542f43fcb687", "title": "Utility Functions"}, "268": {"path": "/RWKV-v4/train.py", "hash": "219c5b9902b3d4b566efb10ed92551c2", "title": "Efficient RWKV-v4 Training with DeepSpeed"}, "269": {"path": "/RWKV-v4/train.py:1-19", "hash": "b397bc5134311580bd4a3ae86ae8a841", "title": "RWKV-v4 Import and Setup"}, "270": {"path": "/RWKV-v4/train.py:21-42", "hash": "39335ccec206eb3580275c7bc646054c", "title": "Train RWKV Text Model: Data and Config"}, "271": {"path": "/RWKV-v4/train.py:43-65", "hash": "7172035dd48983f841bffd20e4ba7011", "title": "RWKV-v4 Training Setup"}, "272": {"path": "/RWKV-v4/train.py:66-88", "hash": "6d202ba48e2a389ce06a3296db8e4b91", "title": "Environment Variable Configuration"}, "273": {"path": "/RWKV-v4/train.py:90-114", "hash": "1b7c50e7acf9221ec977c4a48844e8a0", "title": "RWKV-v4 Hyperparameter Configuration"}, "274": {"path": "/RWKV-v4/train.py:115-135", "hash": "75d4693c8ea68568e232942d432a4db7", "title": "Optimizing RWKV-v4 Training Parameters"}, "275": {"path": "/RWKV-v4/train.py:136-168", "hash": "ae3ca1d1ff1c5d3364fa7c718e14c49e", "title": "Model Training Parameters Configuration"}, "276": {"path": "/RWKV-v4/train.py:169-190", "hash": "97247378c6014a747e41191f5e9a71ce", "title": "Data Loading and Preparation"}, "277": {"path": "/RWKV-v4/train.py:191-209", "hash": "068e91e052f955bc1c0ffe266653a760", "title": "RWKV Trainer Setup and Config"}, "278": {"path": "/RWKV-v4/train.py:210-233", "hash": "258301cf9d1251d8e40f1ba7727371e0", "title": "Trainer Type Determination: 
RWKV-v4/train.py"}, "279": {"path": "/RWKV-v4/train.py:234-263", "hash": "9776f1bc6df83c0410329b131fa949dd", "title": "DeepSpeed RWKV-v4 Training Config"}, "280": {"path": "/RWKV-v4/train.py:264-280", "hash": "35c6095492ba729cc6540c91fbe96514", "title": "Environment-Guided DeepSpeed Strategy"}, "281": {"path": "/RWKV-v4/verify.py", "hash": "e58242c6e2ba2d6fd53edf243249b616", "title": "Initialize and Compare RWKV Models"}, "282": {"path": "/RWKV-v4/verify.py:1-26", "hash": "67cc9ece79946c575c6d76d0e838fe35", "title": "Consistent Model Verification"}, "283": {"path": "/RWKV-v4/verify.py:27-57", "hash": "2bee48140f5f1578cb827266450a1dcf", "title": "RWKV-v4 Model Initialization"}, "284": {"path": "/RWKV-v4/verify.py:58-81", "hash": "2a8719c186b981b9d9e18e50debd7248", "title": "RWKV-GPT Tokenization and Model Output"}, "285": {"path": "/RWKV-v4/verify.py:82-90", "hash": "a0259dcf8ef2f4155c541da3c75aa245", "title": "RWKV-v4 Model Output Check"}, "286": {"path": "/RWKV-v4neo/chat.py", "hash": "dc26c2666fb59f81e28cd7b5b0220b02", "title": "Multilingual RWKV Chatbot Environment"}, "287": {"path": "/RWKV-v4neo/chat.py:1-31", "hash": "170b837dabc1392e64e0e28402f07bbd", "title": "RWKV Language Model Setup and Configuration"}, "288": {"path": "/RWKV-v4neo/chat.py:32-59", "hash": "7d5436318975eb33da911294eb383a08", "title": "RWKV-v4neo Hyperparameter Configuration"}, "289": {"path": "/RWKV-v4neo/chat.py:59-86", "hash": "956d2887b62af9e40c042611cef9ea01", "title": "AI Conversation Samples and Commands"}, "290": {"path": "/RWKV-v4neo/chat.py:87-118", "hash": "3b7619a378d15e92dc3ae10b2fb971dc", "title": "Multi-Language RWKV Chatbot"}, "291": {"path": "/RWKV-v4neo/chat.py:119-156", "hash": "60d9200eb0c37837df5daf80eb858301", "title": "RWKV-v4neo Chatbot Code"}, "292": {"path": "/RWKV-v4neo/chat.py:157-194", "hash": "d07d50c97c54ff6089498584f9b24f06", "title": "RNN Inference and Reply Function"}, "293": {"path": "/RWKV-v4neo/chat.py:195-230", "hash": 
"fb263555d07a50ee71d6ef1736d8f409", "title": "Chatbot Processing Function"}, "294": {"path": "/RWKV-v4neo/chat.py:232-255", "hash": "41c75ec8d5f15f4030067ce7693fe10b", "title": "Chatbot Response Generation with RWKV-v4neo"}, "295": {"path": "/RWKV-v4neo/chat.py:256-289", "hash": "c86f01050febf6f6bee3a28408e421dc", "title": "RWKV Chat: Token Generation and Error Handling"}, "296": {"path": "/RWKV-v4neo/chat.py:290-319", "hash": "08bd875ec6ccb8a7b3f8c96c6cf37fdd", "title": "Chatbot RWKV-v4neo Model Interaction"}, "297": {"path": "/RWKV-v4neo/chat.py:320-344", "hash": "9794762eb7aab885dd69a6e55bdebdc1", "title": "RWKV Text Generator with Tokenization"}, "298": {"path": "/RWKV-v4neo/chat.py:345-361", "hash": "0f08e3067762f1a3b5e6d776b11b75ce", "title": "Chat Data Saving and User Input Handling"}, "299": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu", "hash": "d4b9e074d60eb7f065e9a7ac93064ad8", "title": "CUDA-Optimized Convolutional Neural Network"}, "300": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu:1-36", "hash": "06e3166ac4e161db937654a439f7f72e", "title": "CUDA Kernel for Neural Network Layer Forward Pass"}, "301": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu:37-66", "hash": "aae5b35273d7df60a4f8661776259c59", "title": "Matrix Multiplication CUDA Implementation"}, "302": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu:67-104", "hash": "a50fdd69a7c1ca660a83cd369207001d", "title": "CUDA Parallel Recurrent Weight Updates"}, "303": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu:105-149", "hash": "6c0e7d802de9b76c11a5879ebfdf356e", "title": "Cuda Array Manipulation & Synchronization"}, "304": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu:150-197", "hash": "22902a0e098d92c642f45ec8e26951de", "title": "CUDA Convolution: Kernel Functions and Shared Memory"}, "305": {"path": "/RWKV-v4neo/cuda/wkv5_cuda.cu:197-202", "hash": "5c9b7979aced41a0698844e0e9469480", "title": "CUDA Kernel Launcher: cuda_backward"}, "306": {"path": "/RWKV-v4neo/cuda/wkv5_op.cpp", "hash": "4a2a9facaa426e74897949437487da38", "title": 
"CUDA-Optimized PyTorch Neural Network Operations"}, "307": {"path": "/RWKV-v4neo/cuda/wkv5_op.cpp:1-12", "hash": "15a40c364cf25b9cb0469e48c523fb67", "title": "Neural Network CUDA Implementation"}, "308": {"path": "/RWKV-v4neo/cuda/wkv5_op.cpp:12-22", "hash": "72d1b2c91c8941b191f50d0170d9b42b", "title": "WKV5 Python Operations"}, "309": {"path": "/RWKV-v4neo/cuda/wkv_cuda.cu", "hash": "c784e6f7945631500afa0cfa39517836", "title": "Efficient CUDA Implementation of RWKV Model"}, "310": {"path": "/RWKV-v4neo/cuda/wkv_cuda.cu:1-31", "hash": "7a271a666fc9b28e6d306820c0905a73", "title": "Matrix Multiplication with Kernel Function"}, "311": {"path": "/RWKV-v4neo/cuda/wkv_cuda.cu:32-58", "hash": "c44d718c2c9d9164edac4bf8c2b7df41", "title": "Matrix Multiplication Kernel"}, "312": {"path": "/RWKV-v4neo/cuda/wkv_cuda.cu:59-94", "hash": "f5b72fb46097667c3d5022f4bbc68825", "title": "Variable Initialization and Iteration for WKV-Cuda"}, "313": {"path": "/RWKV-v4neo/cuda/wkv_cuda.cu:96-128", "hash": "f72d4349fd1f368c7b73c7bcdaa6e8d4", "title": "CUDA Implementation of RWKV Model Passes"}, "314": {"path": "/RWKV-v4neo/cuda/wkv_cuda.cu:129-133", "hash": "a26aedfdd1ee0612e30241165bb1ac15", "title": "CUDA Kernel Configuration: Optimal Threads, Blocks Distribution"}, "315": {"path": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu", "hash": "4383d52a71c7aae0c51df4be6b1ada17", "title": "Cuda BFloat16 Neural Network Forward-Backward Pass"}, "316": {"path": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:1-26", "hash": "95ef7c16a7350700c5ca63eb6d9d109f", "title": "BF16 Forward Pass on GPU"}, "317": {"path": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:28-53", "hash": "af5704890180156e44e609a4260506a4", "title": "Gradient Computation for Deep Learning Models"}, "318": {"path": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:54-85", "hash": "60e34d051c052efbb8da708d778726ca", "title": "BF16 Softmax Multiplication CUDA Code"}, "319": {"path": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:86-120", "hash": "87b7983ecba7a1851b738680520e5031", "title": 
"CUDA Neural Network Forward Pass"}, "320": {"path": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:121-132", "hash": "c2765ca0cb838c02083469f95ccdda78", "title": "Optimized CUDA Matrix Operation Functions"}, "321": {"path": "/RWKV-v4neo/cuda/wkv_op.cpp", "hash": "d92ed30717a6e950617058d6852c6aaf", "title": "CUDA-Powered Torch Extensions for RWKV-v4neo"}, "322": {"path": "/RWKV-v4neo/cuda/wkv_op.cpp:1-10", "hash": "1e1f8643520f78712a8ddb9a026094ce", "title": "CUDA Torch Extension for RWKV-v4neo Forward and Backward Passes"}, "323": {"path": "/RWKV-v4neo/cuda/wkv_op.cpp:11-21", "hash": "ac8628cf5e9c123cea71a2db6d05d760", "title": "PyBind11 Torch Module Definition"}, "324": {"path": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp", "hash": "8bf5ce15c242df4d4343e869ebbfc970", "title": "CUDA Functions for BF16 Torch Tensors"}, "325": {"path": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp:1-14", "hash": "b50b60dd02a0358a26acf617b8009ed6", "title": "CUDA C++: BFloat16 Forward-Backward Functions"}, "326": {"path": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp:15-25", "hash": "3fdd96efbfa6b648b8e9fcfa14014458", "title": "WKV Bindings for Forward and Backward Functions"}, "327": {"path": "/RWKV-v4neo/img_demoAE.py", "hash": "7dce2ec54e046feed881c4b79448b870", "title": "Binary Image Conversion with Neural Network"}, "328": {"path": "/RWKV-v4neo/img_demoAE.py:1-25", "hash": "a16142bde061f238c1fa9204eaa4d0dd", "title": "Binary Conversion with RWKV"}, "329": {"path": "/RWKV-v4neo/img_demoAE.py:27-52", "hash": "abcd009dff7e1303635a8a11cf511796", "title": "R_ENCODER: Image Processing Class"}, "330": {"path": "/RWKV-v4neo/img_demoAE.py:54-82", "hash": "d3ef332df0ef3ed7a5aea7401693c8ef", "title": "Neural Network Forward Pass Function"}, "331": {"path": "/RWKV-v4neo/img_demoAE.py:84-106", "hash": "fe8036cd2202ed1ceb14d5b8e013a31d", "title": "RDECODER: Image Decoding ConvNet"}, "332": {"path": "/RWKV-v4neo/img_demoAE.py:107-139", "hash": "eb6df8bd2b1de22e01e13f2890ed1de0", "title": "Image Generation Neural Network Model with Mish 
Activation"}, "333": {"path": "/RWKV-v4neo/img_demoAE.py:140-165", "hash": "16aca70605a74ac7f75e8acdc6b564b3", "title": "AE Image Generation and Processing"}, "334": {"path": "/RWKV-v4neo/math_demo/run.py", "hash": "d0daa09f989d79d4c74d9190bbc2a136", "title": "RWKV: Deep Learning NLP Model Development"}, "335": {"path": "/RWKV-v4neo/math_demo/run.py:1-25", "hash": "63a5941ca85f00cecec1d91beaee8a6b", "title": "RWKV Language Model Tokenization"}, "336": {"path": "/RWKV-v4neo/math_demo/run.py:26-53", "hash": "339cfa2f646afe261d7edf0f50468996", "title": "RWKV Tokenization and RNN Classes"}, "337": {"path": "/RWKV-v4neo/math_demo/run.py:54-77", "hash": "0e72585e61d1c394ea8b36946c5ac0e7", "title": "Channel-wise Mixing and Normalization Function"}, "338": {"path": "/RWKV-v4neo/math_demo/run.py:78-109", "hash": "758e6d257f4352b2b725c1c0d44adbe9", "title": "Time-Dependent RNN Mixing and Forward Pass"}, "339": {"path": "/RWKV-v4neo/math_demo/run.py:110-127", "hash": "e9f7b99a1fd5709d972d5e67816ce306", "title": "Time-Mixing and Channel-Mixing Operations"}, "340": {"path": "/RWKV-v4neo/math_demo/run.py:128-150", "hash": "163b957c72bd569991d7265a0b3227b2", "title": "RWKV-v4neo: Tokenizing and Predicting Characters"}, "341": {"path": "/RWKV-v4neo/run.py", "hash": "a47f164f1cba51d3ddb744bce179f5df", "title": "Initialize and Run RWKV Language Model"}, "342": {"path": "/RWKV-v4neo/run.py:1-23", "hash": "93036696f8dfa0ea04685b062db63467", "title": "RWKV Language Model Initialization"}, "343": {"path": "/RWKV-v4neo/run.py:24-56", "hash": "92dbc599c6ba027f7b285806305f7ea8", "title": "RWKV Model Configurator"}, "344": {"path": "/RWKV-v4neo/run.py:57-89", "hash": "62744560b7770f960e9f871d9f033c87", "title": "RWKV-v4neo: Model Parameter Setup"}, "345": {"path": "/RWKV-v4neo/run.py:90-117", "hash": "fbdba58f62c3d35b831472f94e109205", "title": "AI Politics and History Chatbot"}, "346": {"path": "/RWKV-v4neo/run.py:119-156", "hash": "6881317b7e7b027560a87d3253d963b9", "title": "Initializing 
RWKV-v4neo Model for Text Generation"}, "347": {"path": "/RWKV-v4neo/run.py:157-194", "hash": "49d5703d06a05f22fdd49c8458ea5b31", "title": "Neural Language Model Code Analysis"}, "348": {"path": "/RWKV-v4neo/run.py:196-227", "hash": "ce76d5a18ea4992e402248e4561e0c8b", "title": "RWKV Token Prediction Model"}, "349": {"path": "/RWKV-v4neo/run.py:228-237", "hash": "7c7f429b2d52aa47add10b2f2c4593e7", "title": "Flushing Buffer Instantly"}, "350": {"path": "/RWKV-v4neo/src/binidx.py", "hash": "aa27609553eee4d8e8ce234017a61724", "title": "Memory-Mapped Indexed Dataset Handler"}, "351": {"path": "/RWKV-v4neo/src/binidx.py:1-48", "hash": "5a5b4cce819f4f0f9106360767f34d2b", "title": "Indexed Dataset Handling in Binidx.py"}, "352": {"path": "/RWKV-v4neo/src/binidx.py:49-76", "hash": "243f44d6ee1f53f1dda256b1acb65284", "title": "Binary File Header Writer"}, "353": {"path": "/RWKV-v4neo/src/binidx.py:78-104", "hash": "c2e5816ac5a39ded0253d953e817c026", "title": "Binary File Writer Class"}, "354": {"path": "/RWKV-v4neo/src/binidx.py:105-128", "hash": "5d8e59e4e0ddfb4a4029c0b3eafa7d57", "title": "Index File Analyzer"}, "355": {"path": "/RWKV-v4neo/src/binidx.py:129-158", "hash": "e91da1182adcb4d160409cc69710b7ba", "title": "Binary Data Memory Management"}, "356": {"path": "/RWKV-v4neo/src/binidx.py:159-196", "hash": "215ea9e3816bfd226f25179d9115fc15", "title": "Binary Index Class Definition"}, "357": {"path": "/RWKV-v4neo/src/binidx.py:197-226", "hash": "ee59c3819ffa30bf2db0eabd45defe6a", "title": "Cache-Backed Numpy Memory View"}, "358": {"path": "/RWKV-v4neo/src/binidx.py:227-261", "hash": "b521e0d276125bfd064f5b4b8a6126cc", "title": "Binary Index Class and Methods"}, "359": {"path": "/RWKV-v4neo/src/binidx.py:262-269", "hash": "25c35a0c95e8dc311baa5b5d1732dbda", "title": "Prefetch Unsupported: Checks Index and Data File Existence"}, "360": {"path": "/RWKV-v4neo/src/dataset.py", "hash": "cbc28d5530cdada96e6587e7bcf56118", "title": "Custom PyTorch Dataset Management"}, "361": 
{"path": "/RWKV-v4neo/src/dataset.py:1-25", "hash": "73a15e800dbaf13c6bb0420b8bd359d6", "title": "Initializing MyDataset with \"binidx\""}, "362": {"path": "/RWKV-v4neo/src/dataset.py:26-42", "hash": "7718c6902ad7f7b9f6a88d99220d507c", "title": "Checks Version and Prepares Dataset"}, "363": {"path": "/RWKV-v4neo/src/dataset.py:43-60", "hash": "6ea362b49776042aff8e1e78600e8990", "title": "Tokenization Verification"}, "364": {"path": "/RWKV-v4neo/src/dataset.py:61-82", "hash": "229a7858371adacd71d0b96d5cb78f18", "title": "Dataset Loading and Resizing"}, "365": {"path": "/RWKV-v4neo/src/dataset.py:83-107", "hash": "6844abee124a98bf9c722533991c1a84", "title": "Dataset Construction in RWKV-v4neo"}, "366": {"path": "/RWKV-v4neo/src/dataset.py:108-128", "hash": "8bd949020966770a0fc431081fbe0165", "title": "Shuffling and Initializing Dataset"}, "367": {"path": "/RWKV-v4neo/src/dataset.py:129-150", "hash": "b7b935464bf6523caf74e0845539f960", "title": "Robust WebDataset Initialization"}, "368": {"path": "/RWKV-v4neo/src/dataset.py:151-175", "hash": "bd6477a33883c74cc49525cb4e5698b7", "title": "Random Data Selection and Tensor Creation"}, "369": {"path": "/RWKV-v4neo/src/dataset.py:176-192", "hash": "8fb19a35f02e1040c8faba1621a53d19", "title": "Random Dataset Chunk Selection"}, "370": {"path": "/RWKV-v4neo/src/dataset.py:193-213", "hash": "b5211634ddae209b9cd6dde9d8bfd81f", "title": "Random Index Retrieval for Data Types"}, "371": {"path": "/RWKV-v4neo/src/dataset.py:214-234", "hash": "23e6d9c1525d907f85275763ea715b32", "title": "Random Index Generation and Pattern Detection"}, "372": {"path": "/RWKV-v4neo/src/dataset.py:236-245", "hash": "71f3f120e7e39793d26f33cb6096a83d", "title": "Filtering and Returning Data Snippets"}, "373": {"path": "/RWKV-v4neo/src/model.py", "hash": "4df1a8098db0df31b9cad8e9f6ca6232", "title": "RWKV-v4neo Model: GPU Efficient Neural Network"}, "374": {"path": "/RWKV-v4neo/src/model.py:1-31", "hash": "f9bebb9e9748334e92be10cd0f50a1a8", "title": 
"Setting Up Torch Environment"}, "375": {"path": "/RWKV-v4neo/src/model.py:32-52", "hash": "d18343e1e16464c425e13e6fbdc3ba17", "title": "CUDA-Powered WKV Class"}, "376": {"path": "/RWKV-v4neo/src/model.py:53-73", "hash": "6852e1bcb7942f4eb63e2650d4768636", "title": "Neural Model Computation and Backpropagation"}, "377": {"path": "/RWKV-v4neo/src/model.py:74-93", "hash": "67e2ea1538798b877ab21f11c37aa198", "title": "Neural Model CUDA Tensor Calculation"}, "378": {"path": "/RWKV-v4neo/src/model.py:94-117", "hash": "45ffd8e8fe873138072f9c17640fd466", "title": "Precise Model Data Type Conversion"}, "379": {"path": "/RWKV-v4neo/src/model.py:118-135", "hash": "6128bb703fd1b52a3adcfefb47e95e56", "title": "CUDA-Powered BERT Transformer Gradients"}, "380": {"path": "/RWKV-v4neo/src/model.py:137-162", "hash": "02d5b197c3a78377fda92369832f89c5", "title": "RWKV TimeMix Module Class"}, "381": {"path": "/RWKV-v4neo/src/model.py:163-182", "hash": "713d7f5c96d063792ceff5ba3eaf63db", "title": "Time-Related Model Component Parameters"}, "382": {"path": "/RWKV-v4neo/src/model.py:184-204", "hash": "0cac28fe2cd99dd81cf028dbd368d49b", "title": "Time-Shifted Model with Group Normalization"}, "383": {"path": "/RWKV-v4neo/src/model.py:204-228", "hash": "87ab449e2e82b40a7ba64e2afdfc533d", "title": "Attention Mechanism in Transformer Model Implemented"}, "384": {"path": "/RWKV-v4neo/src/model.py:229-251", "hash": "771d9f545a388beabe5f4802065799ce", "title": "JIT Functions for Time Shifting and Tensor Operations"}, "385": {"path": "/RWKV-v4neo/src/model.py:252-281", "hash": "1d91029b618e605cad8de59c9395249a", "title": "Layer Normalization in RWKV-v4neo"}, "386": {"path": "/RWKV-v4neo/src/model.py:283-311", "hash": "6c5ccfcc1460a8dd6c1cda3fe03ff8a4", "title": "Tensor Manipulation and Reshaping"}, "387": {"path": "/RWKV-v4neo/src/model.py:312-334", "hash": "392ce52d38c0d71fb5df09c7467a8f57", "title": "CUDA-RWKV5 Kernel Wrapper Function"}, "388": {"path": "/RWKV-v4neo/src/model.py:335-356", 
"hash": "caf13172029e0382d6c77b27d6481b62", "title": "Exponentiated Tensor Initialization and Backpropagation"}, "389": {"path": "/RWKV-v4neo/src/model.py:357-367", "hash": "36c92acdb569ebc80ec2d109cd19fe1b", "title": "Initializing Model Tensors and CUDA Calculations"}, "390": {"path": "/RWKV-v4neo/src/model.py:369-392", "hash": "50fe8a7f53bd382e28b103e9a2a4423b", "title": "RWKV TimeMix Initialization"}, "391": {"path": "/RWKV-v4neo/src/model.py:393-411", "hash": "cd2a62d392bc8a42bd0623921989fe7b", "title": "Time-Learnable Parameters Initialization"}, "392": {"path": "/RWKV-v4neo/src/model.py:412-441", "hash": "9194e87647f9cf97ba09c7d31ff7da52", "title": "Time-Shifted Mixing Linear Model"}, "393": {"path": "/RWKV-v4neo/src/model.py:442-472", "hash": "52e1e1dad784ab550b4af1d8e31636a1", "title": "Introducing RWKV_TimeMix Class"}, "394": {"path": "/RWKV-v4neo/src/model.py:473-492", "hash": "c83eda725d4fecddab750aa60041f837", "title": "Time-Based Transformer Layer Initialization"}, "395": {"path": "/RWKV-v4neo/src/model.py:493-510", "hash": "66a99adf4f5920979f8e8301c442e7e9", "title": "Transformer Model Initialization Code"}, "396": {"path": "/RWKV-v4neo/src/model.py:511-533", "hash": "77a78fea6a4f166f9af7905b18b2582e", "title": "Jit-Mix Timesteps for RWKV"}, "397": {"path": "/RWKV-v4neo/src/model.py:534-558", "hash": "4eafdc2ad82d8a22bc3db046b0b68a3b", "title": "Time-Shifting QKV Decomposition"}, "398": {"path": "/RWKV-v4neo/src/model.py:559-581", "hash": "3e0e25be374f55236a3b285ba2a0d18d", "title": "RWKV Channel Mix Class Definition"}, "399": {"path": "/RWKV-v4neo/src/model.py:583-610", "hash": "897844655a1f8c8c602df558e8bbe897", "title": "MishGLU Class for Time-Shifting"}, "400": {"path": "/RWKV-v4neo/src/model.py:611-639", "hash": "52f81a09b8d38d94853acc89d0171f6c", "title": "Block Model with Linear Layers, Layer Normalization, and Mish Activation"}, "401": {"path": "/RWKV-v4neo/src/model.py:640-661", "hash": "7f76c91c25ce91c46a2f0fb2af6eabef", "title": "Model 
Initialization and Layer Creation"}, "402": {"path": "/RWKV-v4neo/src/model.py:662-688", "hash": "410436bbc8127c7b0f34ff48d41d2d09", "title": "Forward Model with Dropout, LayerNorm, Attention"}, "403": {"path": "/RWKV-v4neo/src/model.py:690-721", "hash": "023a02b76d3147670564824a9e8111f4", "title": "Scaled Dot Product Attention in RWKV Model"}, "404": {"path": "/RWKV-v4neo/src/model.py:722-745", "hash": "76f7d3b12b20f196f682ee1506c3a6e6", "title": "Transformer Model Initialization"}, "405": {"path": "/RWKV-v4neo/src/model.py:747-775", "hash": "18331355d9b9bcd526a146716abb227a", "title": "Optimizer Parameters Categorization"}, "406": {"path": "/RWKV-v4neo/src/model.py:777-797", "hash": "93265e38713235c5aa670f41211d301c", "title": "Layerwise Learning Rate Optimizer Groups"}, "407": {"path": "/RWKV-v4neo/src/model.py:798-810", "hash": "5e3069167ca8a0af7be5f82056a6abe2", "title": "Optimizer Group Initialization for Model Parameters"}, "408": {"path": "/RWKV-v4neo/src/model.py:811-834", "hash": "d13dabbb6b960725eb1c22b5b2c07a6f", "title": "Offloading Optimizer Model with FusedAdam"}, "409": {"path": "/RWKV-v4neo/src/model.py:835-860", "hash": "8c5ecb2b7c8be20bada98e8203bcb1dc", "title": "Checkpointing and Layer Execution Handler"}, "410": {"path": "/RWKV-v4neo/src/model.py:861-889", "hash": "83b0ac78f70a8e45c27a4cec2be11548", "title": "Forward-Training Pass Model"}, "411": {"path": "/RWKV-v4neo/src/model.py:890-916", "hash": "c7b36c6317ba41c5415310742e2ed75a", "title": "Training Neural Network: Loss Calculation and All-Gather"}, "412": {"path": "/RWKV-v4neo/src/model.py:917-943", "hash": "345b7d9e3d8cae6d49d53fbefad9892f", "title": "Efficient Model Weight Initialization"}, "413": {"path": "/RWKV-v4neo/src/model.py:944-966", "hash": "9002fbdc2e319d3e9138b9a48afa59e9", "title": "Scale Factor Initialization"}, "414": {"path": "/RWKV-v4neo/src/model.py:967-981", "hash": "417837e36892142ec04d36d7cd1ff4fd", "title": "Orthogonal Initialization of Model Weights"}, "415": 
{"path": "/RWKV-v4neo/src/model_img.py", "hash": "d4b2c61819d31b8390f70f5ae51d2b5b", "title": "Multi-Device RWKV-v4neo Model with VGG16"}, "416": {"path": "/RWKV-v4neo/src/model_img.py:1-30", "hash": "2d639d02c808453d3dce81be576ea346", "title": "Efficient Deep Learning Model with L2pooling"}, "417": {"path": "/RWKV-v4neo/src/model_img.py:31-63", "hash": "109f9e1b9cd8f47644ecc3d26d1d4519", "title": "VGG16-Based Neural Network with Custom Convolutional Layer"}, "418": {"path": "/RWKV-v4neo/src/model_img.py:64-86", "hash": "5784f5720039b09c48a9f5f338d0c016", "title": "VGG-19 Feature Pyramid Model"}, "419": {"path": "/RWKV-v4neo/src/model_img.py:87-116", "hash": "a7213d05dfc8d310718609b7cc4de830", "title": "Normalizing Parameters and Loading Weights"}, "420": {"path": "/RWKV-v4neo/src/model_img.py:117-141", "hash": "9398236a4cb97f3becc0549d2c469db3", "title": "Distance-Based Embedding Weights"}, "421": {"path": "/RWKV-v4neo/src/model_img.py:143-173", "hash": "8125fab588536c2172947d8662b499de", "title": "Batch-Averaged Distance Scoring Model"}, "422": {"path": "/RWKV-v4neo/src/model_img.py:174-193", "hash": "6190ebd7fc25084d20014a27793bb9f2", "title": "Convolutional Layers and Batch Normalization"}, "423": {"path": "/RWKV-v4neo/src/model_img.py:194-221", "hash": "1200afa76c06fde45b2b34ac3df3394a", "title": "Convolutional Neural Network with Mish Activation"}, "424": {"path": "/RWKV-v4neo/src/model_img.py:222-245", "hash": "cb167d8bf1344ca4dec040d13cf979ac", "title": "Decoding RWKV Model: Conv Layers and BatchNorm2d"}, "425": {"path": "/RWKV-v4neo/src/model_img.py:247-270", "hash": "83de5640064abdf20b0999f1d19960ed", "title": "RWKV-v4neo Model Architecture"}, "426": {"path": "/RWKV-v4neo/src/model_img.py:271-306", "hash": "79d759eb39cfd25b8862b2bf66956a00", "title": "RWKV-v4neo: LightningModule and Image Model"}, "427": {"path": "/RWKV-v4neo/src/model_img.py:307-332", "hash": "b878c35f653cbe5b58c6ba25c40f89c5", "title": "CLIP Model Initialization and Configuration"}, 
"428": {"path": "/RWKV-v4neo/src/model_img.py:333-360", "hash": "f09fa4daeff09c52d2badd2fff566ed1", "title": "Deepspeed-Optimizer: DeepSpeedCPUAdam or FusedAdam"}, "429": {"path": "/RWKV-v4neo/src/model_img.py:361-385", "hash": "d654f426bca57aa4279709494f21010a", "title": "Image Encoder-Decoder Model with Training Step"}, "430": {"path": "/RWKV-v4neo/src/model_img.py:386-413", "hash": "a385f008a3ce873a6a6fe031fdc4dd44", "title": "Loss Calculation in Deep Learning Model"}, "431": {"path": "/RWKV-v4neo/src/model_img.py:414-446", "hash": "da52f477ab1a705551ac72ca2e320ba8", "title": "Model Parameter Dictionary Creation"}, "432": {"path": "/RWKV-v4neo/src/model_run.py", "hash": "90661e91c42c452840ece3cacf77ead5", "title": "RWKV Model Optimization and Processing"}, "433": {"path": "/RWKV-v4neo/src/model_run.py:1-33", "hash": "d5fd1fdf08f36404c5b770df9df12268", "title": "Initializing RWKV Language Model in PyTorch"}, "434": {"path": "/RWKV-v4neo/src/model_run.py:35-61", "hash": "e08788390cb93f16f4a9bf8f6f2a5881", "title": "RWKV_RNN Model Initialization"}, "435": {"path": "/RWKV-v4neo/src/model_run.py:62-85", "hash": "aa88ff78758fced6f857c735a3a1a6b0", "title": "Model Parameter Manager"}, "436": {"path": "/RWKV-v4neo/src/model_run.py:86-114", "hash": "7e2f4f60d5cd469866d000705f5e4d70", "title": "Organizing Weights in RWKV-v4neo Model"}, "437": {"path": "/RWKV-v4neo/src/model_run.py:116-140", "hash": "96cdfb692ab3915c734488f7d11b2375", "title": "Feed-Forward Network with Learned Weights"}, "438": {"path": "/RWKV-v4neo/src/model_run.py:142-160", "hash": "32f949861a909262a0230753bbe2d028", "title": "Weighted Average Calculation for RWKV-v4neo"}, "439": {"path": "/RWKV-v4neo/src/model_run.py:161-200", "hash": "6b6e71af46473318a01e3f770a871889", "title": "Matrix Multiplication with Maximum Operations"}, "440": {"path": "/RWKV-v4neo/src/model_run.py:201-226", "hash": "955eadcc63211597efef4954b6ae6ca1", "title": "Input Data Processing in Deep Learning Model"}, "441": {"path": 
"/RWKV-v4neo/src/model_run.py:228-237", "hash": "d302c4473543c114cb78790f6ad49714", "title": "Rescaling Layer Normalization"}, "442": {"path": "/RWKV-v4neo/src/trainer.py", "hash": "3bcc333f47a180d5b3953171773ad5cd", "title": "PyTorch Lightning S3 Savings"}, "443": {"path": "/RWKV-v4neo/src/trainer.py:1-30", "hash": "5ffdd9eb5c9ca4af512d5a6504a29e80", "title": "Distributed S3 Training with Deepspeed"}, "444": {"path": "/RWKV-v4neo/src/trainer.py:31-51", "hash": "fee94374b37b3ab89d4c3d9e979885b5", "title": "LR Scheduler and GPU Cache Cleaner"}, "445": {"path": "/RWKV-v4neo/src/trainer.py:53-72", "hash": "27bca63c5b49e5e2df06a3b979856882", "title": "Dynamic Learning Rate Adjustment based on Exit Tokens"}, "446": {"path": "/RWKV-v4neo/src/trainer.py:73-97", "hash": "ef71e83ba765959b15de7ce4d64038b1", "title": "Dynamic Learning Rate Update"}, "447": {"path": "/RWKV-v4neo/src/trainer.py:98-120", "hash": "874f9d06f96699dd1b083db215b450fd", "title": "Exception-Handling Flush: Train Tracer Init"}, "448": {"path": "/RWKV-v4neo/src/trainer.py:121-142", "hash": "50414fd1632698e650a736edd1bc58a6", "title": "Trainer Performance Metrics Calculation"}, "449": {"path": "/RWKV-v4neo/src/trainer.py:143-164", "hash": "cb18efb0997f5f5470235d3f53717c36", "title": "Trainer: Model Checkpoint and Metrics Logging"}, "450": {"path": "/RWKV-v4neo/src/trainer.py:165-182", "hash": "717f9ad6cbc5565b92895a5365266582", "title": "Epoch End Handler for MyDataset"}, "451": {"path": "/RWKV-v4neo/src/trainer.py:183-208", "hash": "90cf59e7570c55231fa8d7f66f1fbb5e", "title": "Model Saving and Epoch Tracking"}, "452": {"path": "/RWKV-v4neo/src/trainer.py:209-233", "hash": "9e618bcd97db7bd703ce9253ceaca1b4", "title": "Load, Check, Reshape, and Interpolate Dictionary Data"}, "453": {"path": "/RWKV-v4neo/src/trainer.py:234-243", "hash": "54de8f98031f10b212f2219df64a0569", "title": "Model Training Checkpoint Saving and Progression"}, "454": {"path": "/RWKV-v4neo/src/utils.py", "hash": 
"49af45b70a091e83199737806f5c3210", "title": "Prime Number Verification Utility"}, "455": {"path": "/RWKV-v4neo/src/utils.py:1-30", "hash": "4c7427005753052f4f5db61b42fad4dd", "title": "Tokenizing Class with Time Recording"}, "456": {"path": "/RWKV-v4neo/src/utils.py:32-60", "hash": "9e3883e70ad7087b9b4f42f125b81595", "title": "Text Generation Model Utilities"}, "457": {"path": "/RWKV-v4neo/src/utils.py:61-85", "hash": "32e48008a17114c0e028e2af811b9ac8", "title": "Prime Number Calculation: Fermat & Miller-Rabin"}, "458": {"path": "/RWKV-v4neo/src/utils.py:86-122", "hash": "6eef316b4e440912e19fbdfedb4b9cd7", "title": "Prime Testing Algorithms: Fermat and Miller-Rabin"}, "459": {"path": "/RWKV-v4neo/src/utils.py:124-130", "hash": "bbddf158d93490f51dc30ab733ed617a", "title": "Prime Verification Code"}, "460": {"path": "/RWKV-v4neo/train.py", "hash": "c00b502ac9660a3d3c31c7fb32a7e386", "title": "Train RWKV Model with PyTorch Lightning"}, "461": {"path": "/RWKV-v4neo/train.py:1-23", "hash": "3ce04bc198807fcc546a8869f9ff595e", "title": "Train RWKV Language Model with PyTorch Lightning"}, "462": {"path": "/RWKV-v4neo/train.py:24-39", "hash": "9f9aa563df7134976166aa8bca0687bd", "title": "Training RWKV on Enwik8 and Fine-tuning V4Neo"}, "463": {"path": "/RWKV-v4neo/train.py:40-54", "hash": "963b8f5a8aed701b23e9144ca6746e35", "title": "RWKV-v4neo Fine-tuning with 8 GPUs and BF16"}, "464": {"path": "/RWKV-v4neo/train.py:56-69", "hash": "a96529ec2412e1f34cb351603d7f9bd6", "title": "Command Line Arguments Control RWKV Training"}, "465": {"path": "/RWKV-v4neo/train.py:69-83", "hash": "105ad43b3d942e8fbf245c0aeea01185", "title": "RWKV-v4neo Training Parameters"}, "466": {"path": "/RWKV-v4neo/train.py:84-96", "hash": "53002acb5a8608f684db4d36ac26bf81", "title": "Optimizing Hyperparameters with Argparse in RWKV-v4neo/train.py"}, "467": {"path": "/RWKV-v4neo/train.py:97-111", "hash": "6017c225b781401df459a08d30c39a90", "title": "Command Line Arguments in RWKV-v4neo Training"}, 
"468": {"path": "/RWKV-v4neo/train.py:112-129", "hash": "1b792eb6eb0c88853255124e66165ca4", "title": "Defining CLI Arguments for Model Training"}, "469": {"path": "/RWKV-v4neo/train.py:130-153", "hash": "eca0e645be28c2d46ffdfa5aba69abbd", "title": "Setting Up Training Environment"}, "470": {"path": "/RWKV-v4neo/train.py:154-178", "hash": "c84383d9931f56f85863054047e34d52", "title": "Customizing RWKV-v4neo Model Training Arguments"}, "471": {"path": "/RWKV-v4neo/train.py:179-208", "hash": "69eb99ffebd40fcf8281aa3906673fd1", "title": "Directory Creation and Value Adjustments"}, "472": {"path": "/RWKV-v4neo/train.py:209-233", "hash": "9d8e1ad28a07330d8098515262010013", "title": "Epoch Count and Model Selection"}, "473": {"path": "/RWKV-v4neo/train.py:234-258", "hash": "263dbb01b51deeec9d663d36a423aa70", "title": "RWKV Model Loading and Configuration"}, "474": {"path": "/RWKV-v4neo/train.py:259-280", "hash": "9a1656eb531b47b9220546c157d192eb", "title": "Variables and Learning Rate Details"}, "475": {"path": "/RWKV-v4neo/train.py:282-308", "hash": "8b7509821168857aa9d44b55f86425e1", "title": "Precision and CUDNN Adjustments"}, "476": {"path": "/RWKV-v4neo/train.py:310-335", "hash": "994deb14c439e1e7800fe4e370963efe", "title": "Initializing and Loading RWKV Model"}, "477": {"path": "/RWKV-v4neo/train.py:336-358", "hash": "d43bad67ddedf1cc178283d3fec8743b", "title": "Load and Verify Model Checkpoint"}, "478": {"path": "/RWKV-v4neo/train.py:359-378", "hash": "43d2a109ecf2e856391f9c1e60c8d1fd", "title": "Trainer Object with DeepSpeed Strategy"}, "479": {"path": "/RWKV-v4neo/train.py:378-383", "hash": "dacc7201aa284dcfdf1dd81fde22ec83", "title": "Configure Data Loader and Fit Model"}, "480": {"path": "/RWKV-v4neo/verify.py", "hash": "9acfaead65bfb3e3dae29493ef6c5c6a", "title": "RWKV Model Training and Output Printing"}, "481": {"path": "/RWKV-v4neo/verify.py:1-27", "hash": "eda48b7bab37856cb39bfd7f6db03d90", "title": "RWKV Language Model Verification and Setup"}, "482": 
{"path": "/RWKV-v4neo/verify.py:28-66", "hash": "07657a8f47a31292b37926b6ae62bbcf", "title": "Initialize and Load RWKV Model"}, "483": {"path": "/RWKV-v4neo/verify.py:67-91", "hash": "6394a2f26d1c9305b4d06c5b7ae5bfaf", "title": "Verify and Set Float Mode"}, "484": {"path": "/RWKV-v4neo/verify.py:92-104", "hash": "b0a8d00cd9de329a3461f20da54b2172", "title": "RWKV-RNN Output Printing in verify.py"}, "485": {"path": "/RWKV-v5/cuda/wkv5_cuda.cu", "hash": "d4b9e074d60eb7f065e9a7ac93064ad8", "title": "Efficient CUDA ConvNet Forward Pass"}, "486": {"path": "/RWKV-v5/cuda/wkv5_op.cpp", "hash": "4a2a9facaa426e74897949437487da38", "title": "RWKV-v5 Neural Network Operations with CUDA & BFloat16"}, "487": {"path": "/RWKV-v5/demo.jsonl", "hash": "56385324eec393ca1dc9b4d9f55c5e05", "title": "AI Applications and Legal Battles"}, "488": {"path": "/RWKV-v5/demo.jsonl:1-1", "hash": "c3e681d93f579fea2f9060db74d518b5", "title": "Sydney Stabbing Spree: One Dead, Two Charged"}, "489": {"path": "/RWKV-v5/demo.jsonl:2-2", "hash": "c00d7c1b210cc5f5f6d392edffa380d0", "title": "Stratification's Impact on Nutrient Mixing"}, "490": {"path": "/RWKV-v5/demo.jsonl:2-3", "hash": "4c79f962542eb0f89a55b16d0b408fec", "title": "Lakes and Nutrient Levels: High vs. 
Stratified"}, "491": {"path": "/RWKV-v5/demo.jsonl:3-3", "hash": "f7486b5aabf88f223eafa250fcc8ad3a", "title": "Master Xbox Screenshots with Game DVR"}, "492": {"path": "/RWKV-v5/demo.jsonl:4-5", "hash": "71a1febba34333928ff639fae6055329", "title": "Capitalization Correction AI Demo"}, "493": {"path": "/RWKV-v5/demo.jsonl:5-5", "hash": "f7598eafa289ac0e607ac69c391ba749", "title": "Text Parsing Code: Breaks Down Company Details with English Formatting"}, "494": {"path": "/RWKV-v5/demo.jsonl:5-7", "hash": "464d569391d04c3b8185a6c9ab642a9b", "title": "Sentiment Analysis for Text Clarity"}, "495": {"path": "/RWKV-v5/demo.jsonl:7-7", "hash": "4e1d1bb9d50c74b9f24ff40d110ae1d6", "title": "The Tenors' Journey: A Snapshot"}, "496": {"path": "/RWKV-v5/demo.jsonl:7-8", "hash": "32566b91e4af5985267bcf46344db83b", "title": "Artist Group Evolution"}, "497": {"path": "/RWKV-v5/demo.jsonl:8-8", "hash": "cec10ac54b94132618baff327dbbab41", "title": "Content ID: Detecting Copyright Infringement in Videos"}, "498": {"path": "/RWKV-v5/demo.jsonl:9-9", "hash": "aeccf73f3deab0009cd24c0c41a6ad60", "title": "AI Analyzes Negative Book Review"}, "499": {"path": "/RWKV-v5/demo.jsonl:10-10", "hash": "4688d667abd672becd686cb7a85d882c", "title": "Iran's Oil Outlook Under Nuclear Deal"}, "500": {"path": "/RWKV-v5/demo.jsonl:10-11", "hash": "1e1bfde53bbf20ec36cc216b8cec61f7", "title": "Saudi Doubts on Tehran Deal Challenges Iran's OPEC Output"}, "501": {"path": "/RWKV-v5/demo.jsonl:11-11", "hash": "532f81487e645a9a7f4e55dafa086e54", "title": "Aidan Somers: Irish Hurler and Accomplishments"}, "502": {"path": "/RWKV-v5/demo.jsonl:11-12", "hash": "5252eda95a8ecbc55ea582cb2eba5cc9", "title": "Manure Classification Regulations"}, "503": {"path": "/RWKV-v5/demo.jsonl:12-12", "hash": "b5b74b7a5589de6cf8219507698c8348", "title": "Approved Dried Manure and Poultry Droppings"}, "504": {"path": "/RWKV-v5/demo.jsonl:12-13", "hash": "3944af125aa01796eb1e532a823a0475", "title": "Translator: RWKV-v5"}, "505": 
{"path": "/RWKV-v5/demo.jsonl:13-14", "hash": "a86b704e89b094d7c04d23140fd95393", "title": "Multilingual Text Translation and Odd Statement Generation"}, "506": {"path": "/RWKV-v5/demo.jsonl:15-16", "hash": "dc5d51f60ba7694f51c19d57457103c8", "title": "AI Explained like to a Five-Year-Old"}, "507": {"path": "/RWKV-v5/demo.jsonl:16-17", "hash": "73b4f7ccc30886641bea9c4463b8fbb2", "title": "AI on Kazuma's Quest: The Band and Dixie's Tale"}, "508": {"path": "/RWKV-v5/demo.jsonl:17-17", "hash": "394ddb156dbc6cb157488bca2fa94cce", "title": "Manga to Anime Journey: History and Legacy"}, "509": {"path": "/RWKV-v5/demo.jsonl:17-18", "hash": "b630664c34fab39c34db9b332bff58db", "title": "Rythm's 'Utatane' and 'Mugen Factory': A Journey in Japanese Music"}, "510": {"path": "/RWKV-v5/demo.jsonl:18-18", "hash": "f452cd7512c7f77b12640ad35f257cad", "title": "Treaty of Paris and Gerard Brantsen"}, "511": {"path": "/RWKV-v5/demo.jsonl:18-19", "hash": "714c7488a9eeb4d7921dd6f0f0759a15", "title": "Treaty of Paris (1783)"}, "512": {"path": "/RWKV-v5/demo.jsonl:20-21", "hash": "3a7634cb536d6815bdc589ce92e8dac8", "title": "Interactive User-Assistant Q&A"}, "513": {"path": "/RWKV-v5/demo.jsonl:21-22", "hash": "57c675662f5c8d97c6b8308eb76552d6", "title": "Product Review Sentiment Analysis"}, "514": {"path": "/RWKV-v5/demo.jsonl:22-24", "hash": "32be0eaca42e37c4103fcbcbd6ff6329", "title": "AI Assistant Review Analyzer"}, "515": {"path": "/RWKV-v5/demo.jsonl:24-24", "hash": "3085a68cd772e2a708a4d0f2a21f496b", "title": "Lonely Girl in the Park"}, "516": {"path": "/RWKV-v5/demo.jsonl:24-25", "hash": "af8db0e3cbb3d3394fb0d14f7888cf9f", "title": "Reasons for Inaction"}, "517": {"path": "/RWKV-v5/demo.jsonl:25-26", "hash": "5c6a3ab49dd39de3cfe37c7d89bd6f53", "title": "Buffalo Springfield Box Set Details"}, "518": {"path": "/RWKV-v5/demo.jsonl:26-26", "hash": "c6691748b8945a88ad9ba2824a8a29a4", "title": "George's Time-Hopping Adventure"}, "519": {"path": "/RWKV-v5/demo.jsonl:26-27", "hash": 
"50e379b43dac561c37c91866f7f8018e", "title": "George's Lava Escape"}, "520": {"path": "/RWKV-v5/demo.jsonl:27-27", "hash": "0f52bd76a0e2dd36d6548c4f8dbd11d9", "title": "Loss and Blame: Victoria's Grief"}, "521": {"path": "/RWKV-v5/demo.jsonl:27-29", "hash": "cc9b30dd5c531ffe3b5054c1a7e0a8dd", "title": "Two Questions, Two Answers"}, "522": {"path": "/RWKV-v5/demo.jsonl:29-30", "hash": "b610e3d1f216778378ee0fcffee97667", "title": "HDTV's European Debut: Vienna Concert 2004"}, "523": {"path": "/RWKV-v5/demo.jsonl:30-30", "hash": "743a4833fe54bf753b516e7a91a378da", "title": "Harold's Horse Racing Victory"}, "524": {"path": "/RWKV-v5/demo.jsonl:30-31", "hash": "ae938496291c72ab6c1a5b6ff3ea4f9e", "title": "Heist and Hierarchy"}, "525": {"path": "/RWKV-v5/demo.jsonl:31-31", "hash": "1e62a904f5efd9c5fad95f84b72ccaf8", "title": "Metropolitan Archbishop Roles and Responsibilities"}, "526": {"path": "/RWKV-v5/demo.jsonl:31-32", "hash": "71ec5ab0b31360681b1da5445b2b2c40", "title": "AI Learns Ecclesiastical Jurisdiction and 'MGS V' Open-World"}, "527": {"path": "/RWKV-v5/demo.jsonl:32-33", "hash": "fd143f621891c609f9ddaac220f5cc04", "title": "Eddy Merckx: Cycling's Unmatched Legend"}, "528": {"path": "/RWKV-v5/demo.jsonl:33-33", "hash": "ccbe2a99b13ddbdd7319d03fff3725c6", "title": "Career Progression of Eddy Merckx"}, "529": {"path": "/RWKV-v5/demo.jsonl:33-34", "hash": "bf6ad4a8ff5d8a702d476b49db5d7793", "title": "Eddy Merckx: Cycling Champion and Rainbow Jersey Holder"}, "530": {"path": "/RWKV-v5/demo.jsonl:34-35", "hash": "58d1bd959bced2664d27bf5c92f9778d", "title": "Frequency of \"a\" in Sentences"}, "531": {"path": "/RWKV-v5/demo.jsonl:35-35", "hash": "4b121731f7fac320b7bffb57cce1f3fe", "title": "Starvation in a Town: Not Cardiff"}, "532": {"path": "/RWKV-v5/demo.jsonl:36-37", "hash": "99355cc4b0fb8a6edfb3b12ed7a05465", "title": "Negative Review Verdict"}, "533": {"path": "/RWKV-v5/demo.jsonl:37-38", "hash": "2752aba5e5af743476b6b635d3e1074f", "title": "Sentiment Analysis 
for Movie Reviews"}, "534": {"path": "/RWKV-v5/demo.jsonl:38-38", "hash": "ed772126970a6fa3bd7032548ba4d051", "title": "Begin Again: A Mixed Review"}, "535": {"path": "/RWKV-v5/demo.jsonl:38-39", "hash": "c2741f74bb8f214a595fa465642f693e", "title": "Healing Melodies, Predictable Beats"}, "536": {"path": "/RWKV-v5/demo.jsonl:39-39", "hash": "5e7d3d4b41002f64c3080af13f01983e", "title": "Mouse Family's American Journey"}, "537": {"path": "/RWKV-v5/demo.jsonl:39-40", "hash": "6af8f44071d9fbc2c5499221d9c06dcb", "title": "Scotland's Revolutionary Moment"}, "538": {"path": "/RWKV-v5/demo.jsonl:40-40", "hash": "e8aff4075eff73a21204f3c922636a6f", "title": "Scotland's Stunning Views and Funny Fanatics"}, "539": {"path": "/RWKV-v5/demo.jsonl:40-42", "hash": "558e1f8b31799eda6e2b7e30671e7108", "title": "Amphibian Quiz Hints: Chemicals"}, "540": {"path": "/RWKV-v5/demo.jsonl:42-42", "hash": "dbb6970b96a7342fcaa4362e837a9d59", "title": "Mystery of the Zeko Dagger"}, "541": {"path": "/RWKV-v5/demo.jsonl:42-44", "hash": "4dec36c57ddace041cb1c8fe1a70b43b", "title": "Incomplete JSON Scenarios"}, "542": {"path": "/RWKV-v5/demo.jsonl:44-45", "hash": "6fc69ec676af99485c554ea4f2bcae9c", "title": "NLP-Based Response Generator"}, "543": {"path": "/RWKV-v5/demo.jsonl:45-45", "hash": "6dabb3e8067ca69aec3794119964da3d", "title": "The Breeders: A Turbulent History and Comeback"}, "544": {"path": "/RWKV-v5/demo.jsonl:46-46", "hash": "a13f4a4647912373679a9182df17413f", "title": "Removing Beetle Headlights"}, "545": {"path": "/RWKV-v5/demo.jsonl:46-47", "hash": "4e92120bfab3e8e226363c28e4c6f924", "title": "Removing Beetle Headlights: Step-by-Step Guide"}, "546": {"path": "/RWKV-v5/demo.jsonl:47-47", "hash": "987726e6f8d3ec5723ba4f87e8588a18", "title": "Determining Tweet Sentiment"}, "547": {"path": "/RWKV-v5/demo.jsonl:47-48", "hash": "42d272d794237d9512c686881d8c08e0", "title": "Restaurant JSON Data Snippet"}, "548": {"path": "/RWKV-v5/demo.jsonl:48-49", "hash": 
"e90d0f8e8e7043ad106133e3349e6d78", "title": "Establishment Info Parser"}, "549": {"path": "/RWKV-v5/demo.jsonl:49-50", "hash": "631e0757a7e1de2eef9ef80acdd859cb", "title": "Cloud Computing Project Topics: Research Abroad vs Domestic"}, "550": {"path": "/RWKV-v5/demo.jsonl:50-51", "hash": "e6fbcc279e71c19ce28e06229f13f1c1", "title": "Saghar's 2,000+ Song Compositions"}, "551": {"path": "/RWKV-v5/demo.jsonl:51-51", "hash": "13586d53638c21bbc05c5e87baf351b4", "title": "Paragraph Categorization: Artist Detection"}, "552": {"path": "/RWKV-v5/demo.jsonl:51-52", "hash": "8379d759552d9fe83c4d5ec90d1df212", "title": "Johnny's Sunday Lunch Activity"}, "553": {"path": "/RWKV-v5/demo.jsonl:52-52", "hash": "46e7ec820b3bd96761027029bc3e0163", "title": "Johnny's Busy Weekend Activities"}, "554": {"path": "/RWKV-v5/demo.jsonl:52-53", "hash": "228d96282ab395e1ed90f46ec361ae43", "title": "Title: \"Code as a Reflection\""}, "555": {"path": "/RWKV-v5/demo.jsonl:53-53", "hash": "8c68cf764c77378d6070788c0c0029b4", "title": "Christie's Rising Role in Trump's Organization"}, "556": {"path": "/RWKV-v5/demo.jsonl:53-54", "hash": "3214e94e50491e99b02583c878d0fdda", "title": "Trump's Campaign Prep and Melody Barnes' Past Roles"}, "557": {"path": "/RWKV-v5/demo.jsonl:54-55", "hash": "c91a376c10fc055af6f4ab1d7e719ad9", "title": "AI Handles Office Holder Questions"}, "558": {"path": "/RWKV-v5/demo.jsonl:55-55", "hash": "b9181e096b6ef2e9db4e95592058cd9e", "title": "Billion-Dollar Bracket Odds: One in 9.2 Quintillion"}, "559": {"path": "/RWKV-v5/demo.jsonl:55-56", "hash": "6865530d8d2be23a9f254be7215a2a0f", "title": "Perfect Brackets: Billion-Dollar Challenge"}, "560": {"path": "/RWKV-v5/demo.jsonl:56-56", "hash": "12242f74ec4df99308400075ce6bfdf8", "title": "Important Dates and Policies Shared"}, "561": {"path": "/RWKV-v5/demo.jsonl:56-57", "hash": "3fce8ae44cdf11f0d47a4634a0397d95", "title": "School Uniform Debate Analysis"}, "562": {"path": "/RWKV-v5/demo.jsonl:57-57", "hash": 
"552e5a3746cf813f1f0dfe4a910b3b5b", "title": "Positive Review: G-Shock's Endurance"}, "563": {"path": "/RWKV-v5/demo.jsonl:58-58", "hash": "87b2429e0a2bfac1b940db73c914fc37", "title": "Private Security Patrol: Deterrent Against Johannesburg Crime"}, "564": {"path": "/RWKV-v5/demo.jsonl:58-59", "hash": "816f544eb905b72e47cbbedebd94f707", "title": "Security Firms Ownership in SA: A Borat-Inspired Proposal"}, "565": {"path": "/RWKV-v5/demo.jsonl:59-61", "hash": "57bae1252572c7fd7ab16ede29fc7a70", "title": "Language Translation and \"Borat\" Fun"}, "566": {"path": "/RWKV-v5/demo.jsonl:61-62", "hash": "057cb4f2ae67b37a769d378119a6a7b4", "title": "Task: Generate Entailing Sentences in Fiction Genre"}, "567": {"path": "/RWKV-v5/demo.jsonl:62-63", "hash": "8c66c2bc67137d41e0573e80bf9de795", "title": "Understanding FMLA Eligibility"}, "568": {"path": "/RWKV-v5/demo.jsonl:63-64", "hash": "083b25a57cbf3c3246678d0e28a80cd4", "title": "FMLA Eligibility Criteria System"}, "569": {"path": "/RWKV-v5/demo.jsonl:64-65", "hash": "3ab86000679bf36ed39d481126d7f381", "title": "Joining a Sorority: Understanding Greek System and NPC"}, "570": {"path": "/RWKV-v5/demo.jsonl:65-67", "hash": "a5efb1feaae02dd2104af67a215c04cc", "title": "Sorority 101: Joining Process and Movie Facts"}, "571": {"path": "/RWKV-v5/demo.jsonl:67-69", "hash": "63a3fb18a62593d4de06ff02c84d5f88", "title": "Geo-AI Question Bot"}, "572": {"path": "/RWKV-v5/demo.jsonl:69-69", "hash": "46d170002921ea96b8fffb88f409242b", "title": "Mastering Ollie for 180 Skateboard Trick"}, "573": {"path": "/RWKV-v5/demo.jsonl:69-70", "hash": "c8b7df19b99d7cf28e4347788ccda3dd", "title": "Mastering the 180 Skateboard Trick"}, "574": {"path": "/RWKV-v5/demo.jsonl:70-71", "hash": "144427018f3d709d6769342749056bc6", "title": "Story Completion AI"}, "575": {"path": "/RWKV-v5/demo.jsonl:72-72", "hash": "ba12b631b73a6206ab1efb3a7c2d9def", "title": "AI Analyzes Giraffe Claim in St. 
Estevam Text"}, "576": {"path": "/RWKV-v5/demo.jsonl:72-73", "hash": "0f64382d461b01489eac3e76e84e761b", "title": "Restaurant Analysis Engine"}, "577": {"path": "/RWKV-v5/demo.jsonl:73-75", "hash": "2a2953b3e3d513e52f5451aa8279472a", "title": "Various Tasks and Thoughts"}, "578": {"path": "/RWKV-v5/demo.jsonl:75-75", "hash": "65a0f6dbb9e45288b3e061e93a4675bb", "title": "Sentence Analysis and Key Statements Alignment"}, "579": {"path": "/RWKV-v5/demo.jsonl:75-76", "hash": "181a413b1cc19cc8bbc64c030291c7d6", "title": "Text-Based Verification Algorithm"}, "580": {"path": "/RWKV-v5/demo.jsonl:76-77", "hash": "139b1d519e9c77bebe55fe18ce9d9b4a", "title": "Generate Sentence with Given Words"}, "581": {"path": "/RWKV-v5/demo.jsonl:77-77", "hash": "fb6c4b9c50157b221d41dfc2da7cb1e8", "title": "Oscars and Tom Cruise's Nominations"}, "582": {"path": "/RWKV-v5/demo.jsonl:77-79", "hash": "bb525ad3e585ab502beaec5671b5656d", "title": "Entity Relationship Quiz: Mark Johnson"}, "583": {"path": "/RWKV-v5/demo.jsonl:79-79", "hash": "73bc07c152c1007187c34c5fae056735", "title": "Customize Your Curry Soup"}, "584": {"path": "/RWKV-v5/demo.jsonl:79-81", "hash": "01f665712d79b4fce458ad1664b61955", "title": "AI-Assisted Curry Guide"}, "585": {"path": "/RWKV-v5/demo.jsonl:81-82", "hash": "a555bc9b111c27d7135be6149a49bfa1", "title": "AnswerBot"}, "586": {"path": "/RWKV-v5/demo.jsonl:82-83", "hash": "a83dbbad8f77ef792eca367be49ee31a", "title": "Determine Entailment of Facts"}, "587": {"path": "/RWKV-v5/demo.jsonl:83-84", "hash": "1027ae955bac86cc839681195143adf9", "title": "Create a Restaurant Description"}, "588": {"path": "/RWKV-v5/demo.jsonl:84-85", "hash": "73f2da78952cb1814bf063836e02d0ba", "title": "Merger of Betrusted and TruSecure Forms CyberTrust"}, "589": {"path": "/RWKV-v5/demo.jsonl:85-85", "hash": "fe75b455e662116679b6e990c74299d3", "title": "Merger Forms CyberTrust: Betrusted & TruSecure Unite"}, "590": {"path": "/RWKV-v5/demo.jsonl:85-86", "hash": 
"c5fa3eb29cace76a46721c94b6cd7949", "title": "Betrusted and TruSecure Merge to Form CyberTrust"}, "591": {"path": "/RWKV-v5/demo.jsonl:86-86", "hash": "55029054c146a4871669523acfcc6582", "title": "Key Elements of Life Liquid Show Interview"}, "592": {"path": "/RWKV-v5/demo.jsonl:86-88", "hash": "9bc2c64b25cb0f2f3a40313fa9cf2609", "title": "AI Chatbot for Resistors and Character Identification"}, "593": {"path": "/RWKV-v5/demo.jsonl:88-89", "hash": "3e96ff35abebeb2d2fde0b51eeff4093", "title": "Cleverness Predicts Success"}, "594": {"path": "/RWKV-v5/demo.jsonl:89-90", "hash": "ada08162134d5825f8c272957db564c0", "title": "Pop Artist Tom UNK: Reclining Nudes and Tragic Loss"}, "595": {"path": "/RWKV-v5/demo.jsonl:90-90", "hash": "0d9895711787b1664631591aa9a99703", "title": "Haltering a Horse: Step-by-Step Guide"}, "596": {"path": "/RWKV-v5/demo.jsonl:90-91", "hash": "c5caefe48d013cedb4db7ae8faa8e88b", "title": "Approaching Horses and End of First Crusade"}, "597": {"path": "/RWKV-v5/demo.jsonl:91-92", "hash": "40d065d30ff6ccc346705461a5633ef8", "title": "Crusade's Impact and Linear Equation"}, "598": {"path": "/RWKV-v5/demo.jsonl:92-93", "hash": "40ab292a16ecc5b8ed76b57df1047f23", "title": "Solving Linear Equations: -6c = 24"}, "599": {"path": "/RWKV-v5/demo.jsonl:93-94", "hash": "bbecd8943f5e724447d0ce631b5696da", "title": "Algorithmic Premise Analysis"}, "600": {"path": "/RWKV-v5/demo.jsonl:94-94", "hash": "b67fe3419abb0ea137e2b5584144240b", "title": "Boko Haram's Deadly Rampage in Baga"}, "601": {"path": "/RWKV-v5/demo.jsonl:94-95", "hash": "75b868fba2a033108ff8687e99a7102c", "title": "AI Trains on Boko Haram's Baga Attacks"}, "602": {"path": "/RWKV-v5/demo.jsonl:95-97", "hash": "1299ed86ec83b56286a29c1236318afd", "title": "Odd Multiplier"}, "603": {"path": "/RWKV-v5/demo.jsonl:97-98", "hash": "d03b230b2a39f5d52d9295479e67292d", "title": "The Pickwick Papers"}, "604": {"path": "/RWKV-v5/demo.jsonl:98-98", "hash": "13721c030d5227d91386857ef24cf786", "title": 
"Multi-meaning of \"Feel\""}, "605": {"path": "/RWKV-v5/demo.jsonl:99-100", "hash": "761beaa494e383c6bcd66f08ef88b3db", "title": "Judson's Invention and Task"}, "606": {"path": "/RWKV-v5/demo.jsonl:100-100", "hash": "2e23e4b424b4d8d7864e367c66b0eb86", "title": "Tobacco and Fast Food Near Schools: A Concerning Trend"}, "607": {"path": "/RWKV-v5/demo.jsonl:100-101", "hash": "5fd7b0a13d929e92d1036bdae6732400", "title": "Divided JSON Entry Examples"}, "608": {"path": "/RWKV-v5/demo.jsonl:101-101", "hash": "7fb4bea919fb3b3af2e407d6890efab2", "title": "Language Identification: Swedish Sentence"}, "609": {"path": "/RWKV-v5/demo.jsonl:101-103", "hash": "fd01a4ceb940df885d1b1fbd19ce6ae8", "title": "Multilingual Sentiment Analysis Assistant"}, "610": {"path": "/RWKV-v5/demo.jsonl:103-103", "hash": "9bdfab6d7dd91eb30a962e45a7564b21", "title": "Marked for Vengeance"}, "611": {"path": "/RWKV-v5/demo.jsonl:103-104", "hash": "e4308f7105afc4fa3cd49f88fc5544ee", "title": "Cost-Benefit Analysis for Global Issues"}, "612": {"path": "/RWKV-v5/demo.jsonl:104-104", "hash": "9c9493d8f39253d2f1907d61aa2d1b49", "title": "Incorrect Copenhagen Consensus Cadence"}, "613": {"path": "/RWKV-v5/demo.jsonl:105-106", "hash": "f8a01e7fb49e0b399d5f2ba3cbfd53f4", "title": "AI Analyzing User's Drawing Hypothesis"}, "614": {"path": "/RWKV-v5/demo.jsonl:106-106", "hash": "bffb7a7f3cd4d22ea3196b98ac148b37", "title": "Contradicting Hypothesis"}, "615": {"path": "/RWKV-v5/demo.jsonl:106-107", "hash": "bb518691ac34fe659ee90add8c11c999", "title": "Abuse, Batteries, and Deception"}, "616": {"path": "/RWKV-v5/demo.jsonl:107-108", "hash": "d1b97ee5b8815d7cc6741b9ed1ceeb3d", "title": "Single-Cell Battery Concept Explained"}, "617": {"path": "/RWKV-v5/demo.jsonl:108-110", "hash": "b23462ced4fe86b18a721dd50df06239", "title": "AI-Generated Article Titles and Restaurant Descriptions"}, "618": {"path": "/RWKV-v5/demo.jsonl:110-111", "hash": "141da37b7678b521e414bd0fa62062df", "title": "Common Sense Completion Test"}, 
"619": {"path": "/RWKV-v5/demo.jsonl:111-111", "hash": "7d04f183e9df696ecc0d3e3d24a31397", "title": "Walk and Errands: Positive Mood"}, "620": {"path": "/RWKV-v5/demo.jsonl:111-112", "hash": "b8f559dd32b34f22b92533ec7cc650d9", "title": "Happiness Detection Code"}, "621": {"path": "/RWKV-v5/demo.jsonl:112-113", "hash": "7a7deea04a7a3d1ccfa31ccd46f98654", "title": "Galician to Portuguese Translation Task"}, "622": {"path": "/RWKV-v5/demo.jsonl:113-113", "hash": "0fc59a61e724ff1246640e69806185f4", "title": "Effortless Oven Door Cleaning"}, "623": {"path": "/RWKV-v5/demo.jsonl:113-114", "hash": "42210f4b3ec169c8792113c7be29f4fc", "title": "Double-Pane Window Cleaning Efficiency"}, "624": {"path": "/RWKV-v5/demo.jsonl:114-114", "hash": "c599f60df8177c5b8428c0306dc653a2", "title": "Escape and Triumph"}, "625": {"path": "/RWKV-v5/demo.jsonl:114-115", "hash": "8ba13d2a34f8ee1ad484caa1e1b73dfd", "title": "Sentiment Analysis of Customer Reviews"}, "626": {"path": "/RWKV-v5/demo.jsonl:115-116", "hash": "842a6937cfdd5db72ba3f79dfc15abd1", "title": "Summarizing Negative Reviews"}, "627": {"path": "/RWKV-v5/demo.jsonl:116-116", "hash": "c5ed4af2eb44c5238fe552af737f7509", "title": "Restructuring HR at Enron: Reporting Directly to Chairman"}, "628": {"path": "/RWKV-v5/demo.jsonl:116-117", "hash": "2643421b012386aea54458d547948ec7", "title": "HR Team Roles Update"}, "629": {"path": "/RWKV-v5/demo.jsonl:117-118", "hash": "221a2bf765fa49fd997ed18ebf9b9ed8", "title": "Capitalization Algorithm: Sentence Identification and Correction"}, "630": {"path": "/RWKV-v5/demo.jsonl:118-118", "hash": "2a45c83aa51d66999a6c70cfbdbf784c", "title": "Peeing Differences: Girls vs Women"}, "631": {"path": "/RWKV-v5/demo.jsonl:118-119", "hash": "934424c372fd31fe3faa10d8acd8b53e", "title": "Female Urination Devices: Overcoming Challenges"}, "632": {"path": "/RWKV-v5/demo.jsonl:119-120", "hash": "f9578139ba8cb82d8b19ddbb5035d338", "title": "Self-Defense Response to Jesse's Attack"}, "633": {"path": 
"/RWKV-v5/demo.jsonl:120-121", "hash": "54d29144af1ec5a0abeb14b71f83ddea", "title": "Subdue Jesse for Self-Defense"}, "634": {"path": "/RWKV-v5/demo.jsonl:121-121", "hash": "441c6010f83e9e31e28b5d136108e264", "title": "Hector's Arrival on Saturn 3"}, "635": {"path": "/RWKV-v5/demo.jsonl:121-122", "hash": "e9f1678fa766a41b42791946dc1af9ea", "title": "Saturn 3: Hector's Rampage"}, "636": {"path": "/RWKV-v5/demo.jsonl:122-123", "hash": "c59927d7cdbf025e990b502d0c789c03", "title": "Faster Water, More Energy"}, "637": {"path": "/RWKV-v5/demo.jsonl:123-124", "hash": "36e7d0d9537cdd0d999a61ffe7cebb59", "title": "Clothespins or Paperclips for Chip Bags"}, "638": {"path": "/RWKV-v5/demo.jsonl:124-124", "hash": "7edb1b27da7e39c7db053ba6a100f092", "title": "Sealing Chip Bags: Accessible vs Traditional Methods"}, "639": {"path": "/RWKV-v5/demo.jsonl:124-125", "hash": "6f0cb03f30518cbb8aeda0f295e15f15", "title": "Chips Closure: Needle vs. Clothespins"}, "640": {"path": "/RWKV-v5/demo.jsonl:125-127", "hash": "def04a7526260ddceb41c5b02bc7dd4e", "title": "Turkish Alphabet and Cross-Border Police Cooperation"}, "641": {"path": "/RWKV-v5/demo.jsonl:127-127", "hash": "4b188375ead716ad1425fa14dc56cc67", "title": "Scrutinizing Hodgson's Choice: Sterling's Fatigue and Performance"}, "642": {"path": "/RWKV-v5/demo.jsonl:127-128", "hash": "d3895238473adb46650ee5707a532835", "title": "Sterling's Schedule: A Key Factor in Performance"}, "643": {"path": "/RWKV-v5/demo.jsonl:128-128", "hash": "9f227c0e96cc66e52d2fc1178f76f3bf", "title": "Deceitful Filmmaker Fakes Signature, Divorces Wife"}, "644": {"path": "/RWKV-v5/demo.jsonl:128-129", "hash": "fa6d152e6f0ad16ab42d724feb81e107", "title": "JSON Data Storage or Processing"}, "645": {"path": "/RWKV-v5/demo.jsonl:129-130", "hash": "fb84612bd97496d6ee1b35f6d241abe4", "title": "Sentiment Analysis: Positive Review Detected"}, "646": {"path": "/RWKV-v5/demo.jsonl:130-132", "hash": "35b4df39576b3794e93fb79a26a89aea", "title": "Multi-lingual Text File 
with Toddler Movement Quiz"}, "647": {"path": "/RWKV-v5/demo.jsonl:132-132", "hash": "a8c23c81bd0a29651af5f7a8445cbbc7", "title": "Accurate Sentiment Analysis of Tweets"}, "648": {"path": "/RWKV-v5/demo.jsonl:132-133", "hash": "ee8df7432881267fe79d5538084bb8f1", "title": "Hypothesis Verification through Text Analysis"}, "649": {"path": "/RWKV-v5/demo.jsonl:133-133", "hash": "2afc8ff288d625dec8e6ab2b51be2550", "title": "Vitali-Carath\u00e9odory Approximation Theorem"}, "650": {"path": "/RWKV-v5/demo.jsonl:133-134", "hash": "f322e3cd039be4043bd2fba1026dec53", "title": "Vitali-Carath\u00e9odory Theorem: Integrable Function Approximation"}, "651": {"path": "/RWKV-v5/demo.jsonl:134-135", "hash": "0a9b71506a4b621e22e9040c647674b1", "title": "RWKV-v5 JSON Review Task"}, "652": {"path": "/RWKV-v5/demo.jsonl:135-135", "hash": "28bc972b47d5cb739603f8c18c405055", "title": "Navigating Betrayal's Aftermath: Mary's Journey"}, "653": {"path": "/RWKV-v5/demo.jsonl:135-136", "hash": "2ddf5545fc7b2a890704ff7876d27f59", "title": "Film and Playground Connection"}, "654": {"path": "/RWKV-v5/demo.jsonl:136-136", "hash": "9d21a43ad0a77fc8f093b1230fb9c3c1", "title": "Movie Database Code Analysis"}, "655": {"path": "/RWKV-v5/demo.jsonl:136-137", "hash": "4d830cda02774373c32ea5f11700ed12", "title": "Electric Dreams: San Francisco Architect"}, "656": {"path": "/RWKV-v5/demo.jsonl:137-137", "hash": "1ad202674fb44d3cdfe74d898bbbc61b", "title": "Google-Fitbit Acquisition Talks Surface"}, "657": {"path": "/RWKV-v5/demo.jsonl:137-138", "hash": "500f00f56ea62c75de081fa176a35a17", "title": "Google Acquisition JSON Snippet"}, "658": {"path": "/RWKV-v5/demo.jsonl:138-138", "hash": "13cc73dbc9f68ecab6b34771de4f380d", "title": "AI Combats EU Human Trafficking and Prostitution"}, "659": {"path": "/RWKV-v5/demo.jsonl:138-139", "hash": "77495ce4f1e6ae53781fe6017910f022", "title": "Cavemen's Disappointing Debut"}, "660": {"path": "/RWKV-v5/demo.jsonl:139-141", "hash": "0f4b868856a595d498682499c425c807", 
"title": "Negative Review and Translation Task"}, "661": {"path": "/RWKV-v5/demo.jsonl:141-141", "hash": "bf4901e9195f861c184161e987d4b63a", "title": "Context-Based AI Question Answering"}, "662": {"path": "/RWKV-v5/demo.jsonl:141-142", "hash": "614e0933c9cef0dea2a192e1822e4a77", "title": "Analyzing User Queries for Statute Formation"}, "663": {"path": "/RWKV-v5/demo.jsonl:142-143", "hash": "5adce9550099c77f1ae2f0b243d4a28d", "title": "How are Laws Made?"}, "664": {"path": "/RWKV-v5/demo.jsonl:143-144", "hash": "1defb63391acfddd3fcf75ff146f4ac6", "title": "Multiple-Choice Painful Decision"}, "665": {"path": "/RWKV-v5/demo.jsonl:144-144", "hash": "971851db953beada86b70a2f58f06f56", "title": "Negative Dining Experience"}, "666": {"path": "/RWKV-v5/demo.jsonl:144-145", "hash": "a732de8417d2d59a7ff9ff81e0eb1ea2", "title": "Expensive Disappointment"}, "667": {"path": "/RWKV-v5/demo.jsonl:145-147", "hash": "336778b30fcf019d60b2e56d830a8030", "title": "Turkish Translation Engine with Abbreviation Detection"}, "668": {"path": "/RWKV-v5/demo.jsonl:147-147", "hash": "b5d3cd21f123a24eae01970974c5727f", "title": "Step-by-Step Linear Equation Solution"}, "669": {"path": "/RWKV-v5/demo.jsonl:148-148", "hash": "d36daa5d8461204f6f7bf810e1f3ad15", "title": "Snoring and Tim's Struggle"}, "670": {"path": "/RWKV-v5/demo.jsonl:148-150", "hash": "10bfbd254a67c197922504665fc58d66", "title": "JSON Objects and Semantic Analysis"}, "671": {"path": "/RWKV-v5/demo.jsonl:150-150", "hash": "241b3dcce8d62c532d35f698c91cb008", "title": "Tumors Replaced with Cavities and Scar Tissue"}, "672": {"path": "/RWKV-v5/demo.jsonl:150-151", "hash": "42f0b022847ba6d85fbb63b43e1b6069", "title": "Empty Cavities Instead of Tumors"}, "673": {"path": "/RWKV-v5/demo.jsonl:151-152", "hash": "529156bb4e26f403cc9527d3eacbdd2a", "title": "Non-Consecutive President"}, "674": {"path": "/RWKV-v5/demo.jsonl:152-153", "hash": "d70d36e225291efea41a848846e1c360", "title": "AI Interaction Scenario: Eddie & Milo"}, "675": 
{"path": "/RWKV-v5/demo.jsonl:153-154", "hash": "ebb8565027706b497258068ecd78da7c", "title": "Geotourism Protection and Jobless Woes"}, "676": {"path": "/RWKV-v5/demo.jsonl:154-154", "hash": "296c74f02cb320dca16af5b00f39261d", "title": "UK Prime Minister Faces A4e Fraud Probe Pressure"}, "677": {"path": "/RWKV-v5/demo.jsonl:154-155", "hash": "1307d6003c770ea6671028e0ca6c44ad", "title": "Upscale Japanese Caf\u00e9, Rated Highly but Not Family-Friendly"}, "678": {"path": "/RWKV-v5/demo.jsonl:156-156", "hash": "0017e24c6b4ec97a04f3cfd2cc603de8", "title": "Madoff Auction Raises $500K for Victims"}, "679": {"path": "/RWKV-v5/demo.jsonl:156-157", "hash": "4d8e6312834a0e86e20783a29b44aff8", "title": "Madoff's Seized Possessions: A Ponzi Scheme's Downfall"}, "680": {"path": "/RWKV-v5/demo.jsonl:157-158", "hash": "9b09c9c675b285b0e20e9d34ae59256c", "title": "Word Boundary Detection"}, "681": {"path": "/RWKV-v5/demo.jsonl:158-159", "hash": "cd928dd02e51369fba0ef509f060878a", "title": "Word Boundary Correction"}, "682": {"path": "/RWKV-v5/demo.jsonl:159-159", "hash": "f37797e28dc920ec0068404d06c1c57a", "title": "Sorting Dictionaries by First, Then Second"}, "683": {"path": "/RWKV-v5/demo.jsonl:159-160", "hash": "17326e31c0d1adff521eaaf45da00a60", "title": "Family-Friendly Xname Coffee Shop Near City Centre"}, "684": {"path": "/RWKV-v5/demo.jsonl:160-161", "hash": "0ff81c281003202c146a65243f2fc950", "title": "Extract and Categorize Information"}, "685": {"path": "/RWKV-v5/demo.jsonl:161-161", "hash": "e481899cebd3bbfb4f679ae183616ed9", "title": "St. 
Patrick's Rugby Fest in Dublin"}, "686": {"path": "/RWKV-v5/demo.jsonl:162-162", "hash": "c3704f54884c7046ecc08efe440f0099", "title": "Choose the Right Lift"}, "687": {"path": "/RWKV-v5/demo.jsonl:162-163", "hash": "dce0833ed81ec0bd8da91d8efe75cd32", "title": "Mixed Reviews for Nail Spa"}, "688": {"path": "/RWKV-v5/demo.jsonl:163-163", "hash": "a0f921a23600a3708661110b912dc5cf", "title": "Unsatisfactory Beauty Service"}, "689": {"path": "/RWKV-v5/demo.jsonl:163-165", "hash": "f4d65c75e6295fd23590390daddd5016", "title": "Multilingual Sentiment and Translation Tasks"}, "690": {"path": "/RWKV-v5/demo.jsonl:165-166", "hash": "1739caa29f834f69f34f5dead9f8048a", "title": "Summary and Choose-Your-Own Ending"}, "691": {"path": "/RWKV-v5/demo.jsonl:166-167", "hash": "b42f69fdd6889db8f9caab5c30779174", "title": "Clown Circus Scene Analysis"}, "692": {"path": "/RWKV-v5/demo.jsonl:167-168", "hash": "bcca81238730ebf5763daa07877ca6ff", "title": "From Juvenile to Delightful"}, "693": {"path": "/RWKV-v5/demo.jsonl:168-169", "hash": "3ae75d2d191383ce8fe22b7dd2b53f39", "title": "Exploring Hall's Impact on McCarthy"}, "694": {"path": "/RWKV-v5/demo.jsonl:169-170", "hash": "4e7c9314f6d1ae515f0ce748c71709b4", "title": "Joint Efforts in Air Defense Suppression"}, "695": {"path": "/RWKV-v5/demo.jsonl:170-170", "hash": "c31c02ff43030cdf6a5ef4d3978491bb", "title": "Japanese-Indonesian Translation Checker"}, "696": {"path": "/RWKV-v5/demo.jsonl:170-171", "hash": "4df1b00601e1606b322bfa17a6bdd7e9", "title": "Multilingual Sentence Comparison and Translation"}, "697": {"path": "/RWKV-v5/demo.jsonl:171-172", "hash": "5d0f952c11cc75ecfee639c4c69dcfd2", "title": "Sentence Translation Algorithm"}, "698": {"path": "/RWKV-v5/demo.jsonl:172-172", "hash": "ffcce1ff82186d00f99b655d557b9b6c", "title": "Symposium as Equal Justice Debate Platform"}, "699": {"path": "/RWKV-v5/demo.jsonl:172-173", "hash": "476366fbdda0ec91c3addec757bbc19d", "title": "Translation Verification with RWKV-v5"}, "700": {"path": 
"/RWKV-v5/demo.jsonl:173-174", "hash": "a49d0ca81720a77b3c7e9e3913446d8d", "title": "Incentive Program Ineffectiveness"}, "701": {"path": "/RWKV-v5/demo.jsonl:174-174", "hash": "34e5fc68ae4192383a22920a2bd41706", "title": "Early HIV Treatment: Progress and Challenges"}, "702": {"path": "/RWKV-v5/demo.jsonl:174-175", "hash": "e946298c868e45f464122bc8832cf461", "title": "HIV Treatment Life Expectancy Study"}, "703": {"path": "/RWKV-v5/demo.jsonl:175-177", "hash": "ed08bafd6fd282e35d1cb5f3d8bba99a", "title": "AI Evaluates Product Reviews' Flattery Levels"}, "704": {"path": "/RWKV-v5/demo.jsonl:177-177", "hash": "4321d9255d290785fb1a3db12c5b131c", "title": "Saints Stun Falcons with 2004 Blocked Punt"}, "705": {"path": "/RWKV-v5/demo.jsonl:177-178", "hash": "bad9d1a53463c08c488f2fdc4813b2cd", "title": "Saints Triumph over Falcons in Football Game"}, "706": {"path": "/RWKV-v5/demo.jsonl:178-178", "hash": "6a40aa9f1bbc99078bb0c0d3613ccc4c", "title": "Undercover Battles in New World"}, "707": {"path": "/RWKV-v5/demo.jsonl:178-180", "hash": "2f5dbc9fa7ce4a90e34af7e9e39e7c08", "title": "Language Processing AI: Bridging Communication Gaps"}, "708": {"path": "/RWKV-v5/demo.jsonl:180-180", "hash": "fd2a0c81ca92fb67c4df46fd4f173c65", "title": "Labeling Conversation Formality"}, "709": {"path": "/RWKV-v5/demo.jsonl:180-181", "hash": "2d5583b4b0cfb7c0e36e9403eeca6432", "title": "Formal Conversation Labeling"}, "710": {"path": "/RWKV-v5/demo.jsonl:181-181", "hash": "bfaf33829dfcca5b6cac9abbf3cc5ac0", "title": "Balancing Drug Abuse Penalties and Patient Access"}, "711": {"path": "/RWKV-v5/demo.jsonl:181-182", "hash": "bbe5319c8e486ac0e3d0a787b8298107", "title": "JSON Pair: Legalizing Marijuana, Conference Talk Scheduling"}, "712": {"path": "/RWKV-v5/make_data.py", "hash": "c8a82fcb5af3ef76fc9c764bb92d73bc", "title": "Binary File Processing and Indexing with RWKV v5"}, "713": {"path": "/RWKV-v5/make_data.py:1-34", "hash": "14a5a578cbb54c888c73706848082557", "title": "RWKV 
Tokenization Tool"}, "714": {"path": "/RWKV-v5/make_data.py:35-63", "hash": "124790b605a70bf6b975f1358dba5a91", "title": "RWKV-v5 Data Builder and Indexer"}, "715": {"path": "/RWKV-v5/make_data.py:64-99", "hash": "85864e65bb7fdec03901d0bd891605f9", "title": "Prime Checker and Data Generator"}, "716": {"path": "/RWKV-v5/make_data.py:100-133", "hash": "c88809f3bd47a1ce13e9665e88718738", "title": "Shuffling Lines and Indexing Dataset"}, "717": {"path": "/RWKV-v5/make_data.py:134-158", "hash": "bf11254ac2e2832746cbb320144a6452", "title": "Exception-Handling DIX Decoder"}, "718": {"path": "/RWKV-v5/src/binidx.py", "hash": "aa27609553eee4d8e8ce234017a61724", "title": "Memory-Mapped Indexed Datasets: RWKV-v5 Binidx.py"}, "719": {"path": "/RWKV-v5/src/dataset.py", "hash": "221525a2be876124170bf9cacd6e0e1a", "title": "RWKV Dataset: Model Training and Prediction"}, "720": {"path": "/RWKV-v5/src/dataset.py:1-25", "hash": "7f9882d3172e595a4a1463851cc615b4", "title": "RWKV Dataset Class"}, "721": {"path": "/RWKV-v5/src/dataset.py:26-42", "hash": "4fc764b4a559b52b954813b42631603e", "title": "Data Processing with Version-Specific Chunks"}, "722": {"path": "/RWKV-v5/src/dataset.py:43-60", "hash": "7a1d810bcc0e3f59e5f30daf54769bcc", "title": "Defining Dataset Class with Flexible Data Sizes"}, "723": {"path": "/RWKV-v5/src/dataset.py:61-80", "hash": "266e15ccc23c7450d8f1207b2b53acf5", "title": "Building Token Arrays from Files"}, "724": {"path": "/RWKV-v5/src/dataset.py:81-107", "hash": "59f9dde2e780adf5751ad2414690aaf6", "title": "Vocabulary Initialization and JSON Conversion"}, "725": {"path": "/RWKV-v5/src/dataset.py:109-134", "hash": "4f2b291d86c0cfccfb5ba1f97bf0c271", "title": "Random Data Sampling in Dataset"}, "726": {"path": "/RWKV-v5/src/dataset.py:135-154", "hash": "c501e8a816d262ffce5d7ddd8c3c32d2", "title": "Random Data Selection Algorithm"}, "727": {"path": "/RWKV-v5/src/dataset.py:155-177", "hash": "1550180cfe7d4840097e6c8f0ad927f5", "title": "Slicing Data from 
Lists"}, "728": {"path": "/RWKV-v5/src/dataset.py:178-200", "hash": "0541e9148c5437d030a512694820b772", "title": "Random Context Initialization and Masking"}, "729": {"path": "/RWKV-v5/src/model.py", "hash": "07fb60580c88faff730e42b00f7338ae", "title": "DeepSpeed Transformer Optimization"}, "730": {"path": "/RWKV-v5/src/model.py:1-31", "hash": "e440ddcd6b4d247e3acc751f33af78f2", "title": "PyTorch JIT Setup for RWKV-v5 Model"}, "731": {"path": "/RWKV-v5/src/model.py:32-55", "hash": "a2319e8702a5aa877e4ec821143b1d6e", "title": "WKV-5 CUDA Kernel Integration"}, "732": {"path": "/RWKV-v5/src/model.py:56-82", "hash": "278226d31a27cef5e1206f6caf227938", "title": "Forward-Backward Pass Model Implementation"}, "733": {"path": "/RWKV-v5/src/model.py:83-95", "hash": "499e23754746e4008b09f0a53a2d4c22", "title": "CUDA-Powered WKV-5 Model Initialization"}, "734": {"path": "/RWKV-v5/src/model.py:97-119", "hash": "b267dda0070073896ab89f95ef1dfa05", "title": "RWKV_TimeMix_RWKV5 Class Definition"}, "735": {"path": "/RWKV-v5/src/model.py:120-138", "hash": "6f21e3d8f0ad04a0caa2f309c6892554", "title": "Time-Based Attention in RWKV-v5 Transformer Layer"}, "736": {"path": "/RWKV-v5/src/model.py:140-169", "hash": "7d422bd1173f99cf9c21aa27c792a4d1", "title": "Linear Layers and GroupNorm in RWKV-v5"}, "737": {"path": "/RWKV-v5/src/model.py:171-198", "hash": "f100cdf607957848ceffa2c31b145ec4", "title": "Forward Pass Definition"}, "738": {"path": "/RWKV-v5/src/model.py:199-227", "hash": "2743d3f4818f0e3860f3869125bc04eb", "title": "MishGLU Layer Initialization"}, "739": {"path": "/RWKV-v5/src/model.py:228-256", "hash": "e74ed628e4702e014db898e382c5500d", "title": "RWKV Model Architecture\nwith Time-Shifting, Mixing, & Mish"}, "740": {"path": "/RWKV-v5/src/model.py:257-276", "hash": "19c78e4ca13b1a3cccb910f2b4ac3312", "title": "Dynamic Model Component Initialization"}, "741": {"path": "/RWKV-v5/src/model.py:278-305", "hash": "8bf7a5f2ef10deb6b6a45afe39ed688f", "title": "Implementing RWKV-v5 
with Dropout and Attention"}, "742": {"path": "/RWKV-v5/src/model.py:306-338", "hash": "c17d567cf6335c0948ac5f8977d5cf65", "title": "RWKV Model: L2Wrap Loss Computation"}, "743": {"path": "/RWKV-v5/src/model.py:339-366", "hash": "deb87b4e0d843458e68859c4249f7ec4", "title": "Dynamic Model Configurations in RWKV-v5"}, "744": {"path": "/RWKV-v5/src/model.py:367-394", "hash": "a1582144b015e2929fc209a1d24dfbc8", "title": "Dynamic Learning Rates Assignment"}, "745": {"path": "/RWKV-v5/src/model.py:395-412", "hash": "401fcece750a5dd80deb5e5f41950a44", "title": "Dynamic Optimizer Groups"}, "746": {"path": "/RWKV-v5/src/model.py:413-425", "hash": "a12e784a44862f6c83baa5d719d3593e", "title": "Optimizer Setup for Model"}, "747": {"path": "/RWKV-v5/src/model.py:425-449", "hash": "8a7e4f1af7ebc684d08df77268ae1c67", "title": "Initialize Adam Optimizer and Forward Function"}, "748": {"path": "/RWKV-v5/src/model.py:450-478", "hash": "a1fe4c84f6883fec125ba67b5eb61366", "title": "Multi-Head Attention in Transformer Models"}, "749": {"path": "/RWKV-v5/src/model.py:480-503", "hash": "eb421d7f8d661c53c79f4e2da693be21", "title": "Training Step: Model Loss and Logits Calculation"}, "750": {"path": "/RWKV-v5/src/model.py:504-530", "hash": "b88ec9dd4f480da79942ee3c12d7ea4c", "title": "Batch Loss Calculation with GPU Tracking"}, "751": {"path": "/RWKV-v5/src/model.py:531-559", "hash": "77b100dd253fdd87fa9eacc14a9bfd7b", "title": "Model Weight Initialization"}, "752": {"path": "/RWKV-v5/src/model.py:560-586", "hash": "57bcd74d08f3d7597df0d7dd50fd1a3c", "title": "Scaled Weight Initialization in RWKV Model"}, "753": {"path": "/RWKV-v5/src/model.py:587-594", "hash": "947a80c77919161a8ed419486e6d82e2", "title": "bfloat16 Conversion for Memory Efficiency"}, "754": {"path": "/RWKV-v5/src/trainer.py", "hash": "3bcc333f47a180d5b3953171773ad5cd", "title": "S3 Saving, Learning Rate Scheduling & Model Loading"}, "755": {"path": "/RWKV-v5/src/utils.py", "hash": "49af45b70a091e83199737806f5c3210", 
"title": "Prime Number Verification Tool"}, "756": {"path": "/RWKV-v5/tokenizer/rwkv_tokenizer.py", "hash": "af8ae32b95f4852b497eb5449307d268", "title": "RWKV Trie Tokenizer"}, "757": {"path": "/RWKV-v5/tokenizer/rwkv_tokenizer.py:1-32", "hash": "0afd1fc232c8d956f418795ff341fbc8", "title": "RWKV Trie Tokenizer"}, "758": {"path": "/RWKV-v5/tokenizer/rwkv_tokenizer.py:33-66", "hash": "80f35837ec77e2431cea50cb0ece0132", "title": "RWKV TRIE Tokenizer: Efficient Indexing"}, "759": {"path": "/RWKV-v5/tokenizer/rwkv_tokenizer.py:68-103", "hash": "c874e741b84f9fff7737dcf96ed4517d", "title": "Trie-Based Tokenizer"}, "760": {"path": "/RWKV-v5/train.py", "hash": "d32bbb657be22d93941ebfbdcc3a489a", "title": "Training RWKV Language Model with Pytorch Lightning"}, "761": {"path": "/RWKV-v5/train.py:1-23", "hash": "05eb6bb4b7360800cc0b02314822c98b", "title": "RWKV Language Model Training Setup"}, "762": {"path": "/RWKV-v5/train.py:24-36", "hash": "d59b5fe7a1210bb03a7697f3a329c929", "title": "RWKV-v5 Argument Parsing"}, "763": {"path": "/RWKV-v5/train.py:37-49", "hash": "649ba1b2100c76e9eeab21b1c08bc7a0", "title": "Training Parameters: RWKV-LM's \"train.py\""}, "764": {"path": "/RWKV-v5/train.py:50-63", "hash": "1dc524ebbd1b93dece43d45140099dd0", "title": "Optimize ML Model Arguments with Argparse"}, "765": {"path": "/RWKV-v5/train.py:64-80", "hash": "226ffc536065e9b0d692f573ef297db9", "title": "Command Line Args for RWKV-v5 Training"}, "766": {"path": "/RWKV-v5/train.py:81-103", "hash": "83240a30618d6f6728d14ccce00433a0", "title": "Training RWKV-v5 with Arguments"}, "767": {"path": "/RWKV-v5/train.py:104-126", "hash": "81a2b17717a376c04f4d0f07926f3063", "title": "Training Script Configuration"}, "768": {"path": "/RWKV-v5/train.py:127-154", "hash": "4e03d4c374a4549260b25cd85a22cdcb", "title": "Epoch Count Adjustment"}, "769": {"path": "/RWKV-v5/train.py:155-182", "hash": "031884d92d01ce6811af3f832a1316dc", "title": "Max-Value Model Loading"}, "770": {"path": 
"/RWKV-v5/train.py:183-200", "hash": "d1e38f8ef7fddcd60e1321b491f2d475", "title": "RWKV-5 Training Details"}, "771": {"path": "/RWKV-v5/train.py:201-226", "hash": "97ba0311bf50be8afe2a36aba6c13204", "title": "RWKV-v5 Training Setup"}, "772": {"path": "/RWKV-v5/train.py:227-256", "hash": "f3fa15d3a11d48fd5dfe00785165794b", "title": "Model Training Environment Setup"}, "773": {"path": "/RWKV-v5/train.py:257-280", "hash": "9d260a1fe37614028ee645c8371666ba", "title": "Checkpoint Model Loader"}, "774": {"path": "/RWKV-v5/train.py:281-299", "hash": "72c085a1f12dee27e3884d7b23c4c9ca", "title": "Trainer Object Initialization for PyTorch Lightning"}, "775": {"path": "/RWKV-v5/train.py:300-309", "hash": "13c731191a7a4356c120a8daaa8505ba", "title": "Train RWKV Model with Deepspeed"}}} \ No newline at end of file diff --git a/docs/codeview.html b/docs/codeview.html new file mode 100755 index 00000000..761065f4 --- /dev/null +++ b/docs/codeview.html @@ -0,0 +1,669 @@ + + + + +
+ + + +Code Preview
+0:\n # self.data_pile = MMapIndexedDataset('/fsx/pile/pile_20B_tokenizer_text_document')\n self.data_pile = MMapIndexedDataset('/fsx/pile_deduped/pile_0.87_deduped_text_document')\n self.data_pile_size = len(self.data_pile._bin_buffer) // self.data._index._dtype_size", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:26-42" + }, + "1641": { + "file_id": 56, + "content": "This code reads a dataset file and processes it based on the specified version. If the my_pile_version is 2, it splits the data into chunks and creates MMapIndexedDataset objects for each chunk with corresponding sizes. It also checks and asserts the size differences. If my_qa_mask is greater than 0, it sets a specific data pile for use.", + "type": "comment" + }, + "1642": { + "file_id": 56, + "content": " else:\n self.data_pile = None\n self.data_pile_size = 0\n if args.my_pile_stage > 0:\n # assert self.data_size == 332115325534 and self.vocab_size == 50277\n self.samples_per_epoch = args.epoch_steps * args.real_bsz\n assert self.samples_per_epoch == 40320\n rank_zero_info(f\"########## Pile 20b-tokenized stage {args.my_pile_stage} ##########\")\n dataset_slot = self.data_size // args.ctx_len\n if args.my_pile_stage != 4:\n assert MaybeIsPrime(args.magic_prime)\n assert args.magic_prime % 3 == 2\n assert args.magic_prime / dataset_slot > 0.99 and args.magic_prime / dataset_slot <= 1\n elif args.data_type == \"numpy\":\n self.data = np.load(args.data_file).astype(\"int\")\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:43-60" + }, + "1643": { + "file_id": 56, + "content": "The code is defining a dataset class. If data_size and vocab_size are not provided, the data pile is set to None. 
If my_pile_stage is greater than 0, it asserts that the data size is correct, calculates samples per epoch based on epoch_steps and real_bsz, checks if magic_prime is prime and within certain range, or loads data from file if data type is numpy. The code also provides information about current vocab size for the user.", + "type": "comment" + }, + "1644": { + "file_id": 56, + "content": " self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens.\")\n elif args.data_type == \"uint16\":\n self.data = np.fromfile(args.data_file, dtype=np.uint16).astype(\"int32\").reshape(-1, args.my_sample_len)\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")\n self.data_size = self.data.shape[0]\n rank_zero_info(f\"Data has {self.data_size} samples.\")\n else:\n if args.data_type == \"dummy\":\n rank_zero_info(\"Building dummy data...\")\n self.data = \"\"\n for i in range(100000):\n aa = (i) % 10000\n bb = (i * i) % 10000\n cc = aa + bb\n self.data += f\".{aa}+{bb}={cc}.\"\n else:\n self.data = open(args.data_file, \"r\", encoding=args.data_type).read()\n rank_zero_info(\"Building token list...\")", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:61-80" + }, + "1645": { + "file_id": 56, + "content": "The code reads data from a file based on the provided data type and builds an array of tokens. 
It checks if the data_type is \"RWKV-LM/RWKV-v5/src/dataset.py\":60-79\n\"RWKV-LM/RWKV-v5/src/dataset.py\":60-79", + "type": "comment" + }, + "1646": { + "file_id": 56, + "content": " unique = sorted(list(set(self.data)))\n self.vocab_size = len(unique)\n # rank_zero_info()\n # for u in unique:\n # print(u, end=' ')\n # rank_zero_info('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open(f\"{args.proj_dir}/vocab.json\", \"w\", encoding=\"utf-8\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens, {self.vocab_size} vocab size.\")\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n def __len__(self):\n return self.args.epoch_steps * self.args.micro_bsz\n def __getitem__(self, idx):\n args = self.args\n rank = self.global_rank\n epoch = self.real_epoch\n world_size = self.world_size\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size}\")", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:81-107" + }, + "1647": { + "file_id": 56, + "content": "The code snippet initializes a vocabulary and associated dictionaries, writes the vocabulary to a JSON file, sets the data size and vocab size, and defines two helper methods for converting between indices and tokens. 
It also includes a print statement that displays information about the current epoch, index, and rank in case of distributed training.", + "type": "comment" + }, + "1648": { + "file_id": 56, + "content": " if args.data_type == \"uint16\":\n i = np.random.randint(0, self.data_size-1)\n dix = self.data[i]\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n else:\n ctx_len = args.ctx_len\n req_len = ctx_len + 1\n magic_prime = args.magic_prime\n data = self.data\n if args.my_pile_stage > 0:\n ii = 1 + epoch * self.samples_per_epoch + (idx * world_size) + rank\n if args.my_qa_mask > 0:\n ii_orig = ii\n if ii % 2 == 0:\n ii = -1\n data = self.data_pile\n else:\n ii = ii // 2\n if data == self.data_pile:\n i = np.random.randint(0, self.data_pile_size - req_len)\n else:\n if args.my_pile_stage == 4 or ii < args.my_random_steps:\n # cheat: pick a random spot in dataset", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:109-134" + }, + "1649": { + "file_id": 56, + "content": "This code retrieves data from a dataset based on the specified data type. If the data type is uint16, it selects a random index and uses the sliced data for training. 
Otherwise, it determines the context length and required data length based on arguments, then picks a random spot in the dataset or cheats by picking a random location when specified.", + "type": "comment" + }, + "1650": { + "file_id": 56, + "content": " if args.my_pile_version == 1:\n i = np.random.randint(0, self.data_size - req_len)\n else:\n i = np.random.randint(0, self.data_size)\n else:\n ii = ii - args.my_random_steps\n factor = (math.sqrt(5) - 1) / 2\n factor = int(magic_prime * factor)\n i = ((factor * ii * ii * ii) % magic_prime) * ctx_len\n i = i + args.my_pile_shift\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}\")\n else:\n # cheat: pick a random spot in dataset\n i = np.random.randint(0, self.data_size - req_len)\n if args.data_type == \"binidx\":\n if args.my_pile_version == 1:\n dix = data.get(idx=0, offset=i, length=req_len).astype(int)\n else:\n # self.data : cutoff, chunk_count, data", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:135-154" + }, + "1651": { + "file_id": 56, + "content": "This code randomly selects a chunk of data from a dataset. If the `args.my_pile_version` is 1, it picks a random position within the remaining data after subtracting `req_len`. For other cases, it calculates the index using a complex formula involving `ii`, `magic_prime`, and `ctx_len`. If `args.data_type` is \"binidx\" and `args.my_pile_version` is 1, it retrieves data from the `data` object at the specified offset and length. 
Otherwise, it simply picks a random position within the remaining data after subtracting `req_len`.", + "type": "comment" + }, + "1652": { + "file_id": 56, + "content": " for j in range(len(data)):\n if i < data[j][0]:\n ii = i\n i = (i - (data[j-1][0] if j > 0 else 0)) % data[j][1]\n dix = data[j][2].get(idx=0, offset=i, length=req_len).astype(int)\n # print(ii, j, i)\n break\n elif args.data_type == \"numpy\":\n dix = data[i : i + req_len]\n else:\n dix = [self.stoi[s] for s in data[i : i + req_len]]\n if args.my_qa_mask == 1:\n if data == self.data_pile:\n z = [1] * ctx_len\n else:\n z = [0] * ctx_len\n z_sum = 0\n isGood = False\n for i in range(3, ctx_len):\n if dix[i] == 27 and dix[i-1] == 34 and dix[i-2] == 187 and dix[i-3] == 187:\n isGood = True\n if dix[i] == 0:", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:155-177" + }, + "1653": { + "file_id": 56, + "content": "Code is slicing data from a list based on the provided index 'i' and length 'req_len'. If the data type is \"numpy\", it directly assigns the slice to dix. Otherwise, it converts the slice into indices using self.stoi. 
If args.my_qa_mask == 1, it checks for a specific sequence of tokens in the sliced data and assigns either [1] * ctx_len or [0] * ctx_len to 'z'.", + "type": "comment" + }, + "1654": { + "file_id": 56, + "content": " isGood = False\n if isGood:\n z[i] = 1\n z_sum += 1\n if z_sum == 0:\n z = [1] * ctx_len\n i = np.random.randint(0, self.data_pile_size - req_len)\n dix = self.data_pile.get(idx=0, offset=i, length=req_len).astype(int)\n z = torch.tensor(z, dtype=torch.bfloat16)\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n # if ii_orig < 50:\n # # if rank == 1:\n # print('rank', rank, 'i', ii_orig, ii, i, 'x', x[:5], '...', x[-5:])\n # else:\n # exit(0)\n if args.my_qa_mask == 1:\n return x, y, z\n return x, y", + "type": "code", + "location": "/RWKV-v5/src/dataset.py:178-200" + }, + "1655": { + "file_id": 56, + "content": "This code is initializing a context (z) for a dataset. It checks if the context sum is zero and if so, sets all elements of z to 1 randomly selects an index from the data pile, converts indices to tensors, and returns the input (x), output (y), and context (z) unless my_qa_mask is set to 1. In that case, it only returns x, y.", + "type": "comment" + }, + "1656": { + "file_id": 57, + "content": "/RWKV-v5/src/model.py", + "type": "filepath" + }, + "1657": { + "file_id": 57, + "content": "This code utilizes DeepSpeed and implements a transformer layer with CUDA operations. 
It creates RWKV model architecture, initializes an optimizer, and optimizes memory usage for transformer models.", + "type": "summary" + }, + "1658": { + "file_id": 57, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os, math, gc, importlib\nimport torch\n# torch._C._jit_set_profiling_executor(True)\n# torch._C._jit_set_profiling_mode(True)\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\nfrom pytorch_lightning.strategies import DeepSpeedStrategy\nif importlib.util.find_spec('deepspeed'):\n import deepspeed\n from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n# from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam\ntry:\n print('RWKV_MY_TESTING', os.environ[\"RWKV_MY_TESTING\"])\nexcept:\n os.environ[\"RWKV_MY_TESTING\"] = ''\ndef __nop(ob):\n return ob\nMyModule = nn.Module\nMyFunction = __nop\nif os.environ[\"RWKV_JIT_ON\"] == \"1\":", + "type": "code", + "location": "/RWKV-v5/src/model.py:1-31" + }, + "1659": { + "file_id": 57, + "content": "Importing necessary libraries, setting profiling mode for torch.jit, defining custom modules and functions, integrating DeepSpeed library if available, and configuring environment variables for testing and JIT usage.", + "type": "comment" + }, + "1660": { + "file_id": 57, + "content": " MyModule = torch.jit.ScriptModule\n MyFunction = torch.jit.script_method\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nfrom torch.utils.cpp_extension import load\nHEAD_SIZE = 
int(os.environ[\"RWKV_HEAD_SIZE_A\"])\nwkv5_cuda = load(name=\"wkv5\", sources=[\"cuda/wkv5_op.cpp\", f\"cuda/wkv5_cuda.cu\"],\n verbose=True, extra_cuda_cflags=[\"-res-usage\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-D_N_={HEAD_SIZE}\"])\nclass WKV_5(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, H, r, k, v, w, u):\n with torch.no_grad():\n assert r.dtype == torch.bfloat16\n assert k.dtype == torch.bfloat16\n assert v.dtype == torch.bfloat16\n assert w.dtype == torch.bfloat16\n assert u.dtype == torch.bfloat16\n assert HEAD_SIZE == C // H", + "type": "code", + "location": "/RWKV-v5/src/model.py:32-55" + }, + "1661": { + "file_id": 57, + "content": "This code imports a CUDA kernel and uses it in the WKV_5 class, which performs operations on tensors of specific data types and dimensions. It asserts the input tensor sizes to ensure they match the expected format and checks the value of HEAD_SIZE to be equal to C divided by H. 
The code then instantiates the wkv5 CUDA kernel from the \"wkv5\" module with specific sources, flags, and verbose settings.", + "type": "comment" + }, + "1662": { + "file_id": 57, + "content": " ctx.B = B\n ctx.T = T\n ctx.C = C\n ctx.H = H\n assert r.is_contiguous()\n assert k.is_contiguous()\n assert v.is_contiguous()\n assert w.is_contiguous()\n assert u.is_contiguous()\n ew = (-torch.exp(w.float())).contiguous()\n eew = (torch.exp(ew)).contiguous()\n ctx.save_for_backward(r, k, v, eew, ew, u)\n y = torch.empty((B, T, C), device=r.device, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.forward(B, T, C, H, r, k, v, eew, u, y)\n return y\n @staticmethod\n def backward(ctx, gy):\n with torch.no_grad():\n assert gy.dtype == torch.bfloat16\n B = ctx.B\n T = ctx.T\n C = ctx.C\n H = ctx.H\n assert gy.is_contiguous()\n r, k, v, eew, ew, u = ctx.saved_tensors\n gr = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)", + "type": "code", + "location": "/RWKV-v5/src/model.py:56-82" + }, + "1663": { + "file_id": 57, + "content": "This code appears to be part of a model's forward and backward pass implementation. In the forward pass, it initializes variables for batch size (B), sequence length (T), feature dimensions (C), hidden state dimension (H), and tensors r, k, v, eew, ew, u, and y. It then calls a forward method to compute y using these variables and returns y. 
In the backward pass, it checks the gradient tensor gy's type and shape, retrieves saved tensors from the context, and initializes an empty tensor gr of the same shape as gy for gradients computation.", + "type": "comment" + }, + "1664": { + "file_id": 57, + "content": " gk = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gv = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gw = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gu = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.backward(B, T, C, H, r, k, v, eew, ew, u, gy, gr, gk, gv, gw, gu)\n gw = torch.sum(gw, 0).view(H, C//H)\n gu = torch.sum(gu, 0).view(H, C//H)\n return (None, None, None, None, gr, gk, gv, gw, gu)\ndef RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w, u):\n return WKV_5.apply(B, T, C, H, r, k, v, w, u)\n########################################################################################################", + "type": "code", + "location": "/RWKV-v5/src/model.py:83-95" + }, + "1665": { + "file_id": 57, + "content": "This code is initializing various tensors for model parameters, and then calling a function to apply CUDA operations on these tensors. The function RUN_CUDA_RWKV5 calls the apply method of class WKV_5 with given input dimensions and parameters. 
It returns the gradients of the input parameters after applying the CUDA operations.", + "type": "comment" + }, + "1666": { + "file_id": 57, + "content": "class RWKV_TimeMix_RWKV5(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.head_size = args.head_size_a\n assert HEAD_SIZE == self.head_size # change HEAD_SIZE to match args.head_size_a\n self.n_head = args.dim_att // self.head_size\n assert args.dim_att % self.n_head == 0\n self.head_size_divisor = args.head_size_divisor\n with torch.no_grad():\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))", + "type": "code", + "location": "/RWKV-v5/src/model.py:97-119" + }, + "1667": { + "file_id": 57, + "content": "This code defines a class for RWKV_TimeMix_RWKV5, a type of MyModule. It initializes instance variables based on input arguments and asserts the head size consistency. 
The time_mix parameters are calculated using layer ID, number of layers, and embedding dimension.", + "type": "comment" + }, + "1668": { + "file_id": 57, + "content": " self.time_mix_g = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n # fancy time_decay\n decay_speed = torch.ones(args.dim_att)\n for n in range(args.dim_att):\n decay_speed[n] = -6 + 5 * (n / (args.dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed.reshape(self.n_head, self.head_size))\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n tmp = torch.zeros(args.dim_att)\n for n in range(args.dim_att):\n zigzag = ((n + 1) % 3 - 1) * 0.1\n tmp[n] = ratio_0_to_1 * (1 - (n / (args.dim_att - 1))) + zigzag\n self.time_faaaa = nn.Parameter(tmp.reshape(self.n_head, self.head_size))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)", + "type": "code", + "location": "/RWKV-v5/src/model.py:120-138" + }, + "1669": { + "file_id": 57, + "content": "This code is initializing parameters for a transformer layer in RWKV-v5. It defines time_mix_g, time_decay, and time_faaaa as learnable parameters. 
The time_decay and time_faaaa parameters control the time-based attention mechanism, while time_shift is a zero padding operator and receptance and key are linear layers for processing the input embeddings.", + "type": "comment" + }, + "1670": { + "file_id": 57, + "content": " self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n self.gate = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.ln_x = nn.GroupNorm(self.n_head, args.dim_att)\n @MyFunction\n def jit_func(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xg = x * self.time_mix_g + xx * (1 - self.time_mix_g)\n r = self.receptance(xr)\n k = self.key(xk)\n v = self.value(xv)\n g = F.silu(self.gate(xg))\n return r, k, v, g\n @MyFunction\n def jit_func_2(self, x, g):\n B, T, C = x.size()\n x = x.view(B * T, C)\n x = self.ln_x(x / self.head_size_divisor).view(B, T, C)\n x = self.output(x * g)\n return x", + "type": "code", + "location": "/RWKV-v5/src/model.py:140-169" + }, + "1671": { + "file_id": 57, + "content": "This code defines a class with three linear layers for key, value, and gate calculations. It also includes a GroupNorm layer and two JIT functions for forward pass and normalization. 
The functions apply time shifting, calculate key, value, and receptance, and perform normalization using GroupNorm.", + "type": "comment" + }, + "1672": { + "file_id": 57, + "content": " def forward(self, x):\n B, T, C = x.size()\n H = self.n_head\n r, k, v, g = self.jit_func(x)\n x = RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w=self.time_decay, u=self.time_faaaa)\n return self.jit_func_2(x, g)\n########################################################################################################\nclass RWKV_ChannelMix(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.key = nn.Linear(args.n_embd, args.dim_ffn, bias=False)", + "type": "code", + "location": "/RWKV-v5/src/model.py:171-198" + }, + "1673": { + "file_id": 57, + "content": "The code defines a forward pass for a model, with input size BTC and H being the number of heads. It uses jit_func to calculate r, k, v, and g. RUN_CUDA_RWKV5 is applied on these calculated values. RWKV_ChannelMix is a subclass of MyModule with an args parameter for arguments and layer_id for layer identification. It initializes time_shift with a zero pad 2D, performs fancy initialization of time_mix by setting ratio_1_to_almost0 and calculating ddd. 
It also initializes key as a linear layer without bias.", + "type": "comment" + }, + "1674": { + "file_id": 57, + "content": " self.receptance = nn.Linear(args.n_embd, args.n_embd, bias=False)\n self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)\n @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.relu(k) ** 2\n kv = self.value(k)\n return torch.sigmoid(self.receptance(xr)) * kv\nclass MishGLU(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad():\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer)\n x = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n x[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))", + "type": "code", + "location": "/RWKV-v5/src/model.py:199-227" + }, + "1675": { + "file_id": 57, + "content": "This code initializes layers for a MishGLU module, including time shifting and mixing parameters. The time_shift layer pads the input tensor with a one-pixel shift, while time_mix_k and time_mix_r are learned parameters for mixing inputs. 
These operations are used to calculate key and receptance values in the forward pass before returning the final result through a sigmoid function multiplied by the value tensor.", + "type": "comment" + }, + "1676": { + "file_id": 57, + "content": " self.aa = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.bb = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)\n @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xa = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xb = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n a = self.aa(xa)\n b = self.bb(xb)\n return self.value(a * F.mish(b))\n########################################################################################################\n# The RWKV Model with our blocks\n########################################################################################################\nclass Block(nn.Module):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(args.n_embd)\n self.ln2 = nn.LayerNorm(args.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(args.n_embd)", + "type": "code", + "location": "/RWKV-v5/src/model.py:228-256" + }, + "1677": { + "file_id": 57, + "content": "This code defines the architecture of an RWKV model with layers for time-shifting, mixing, and applying a Mish activation function. The `Block` class represents each layer in the model, which includes layer normalization, feed forward network (FFN), and optional additional layer normalization for the first layer. 
It also defines functions to perform time shifting and linear transformations using fully connected layers.", + "type": "comment" + }, + "1678": { + "file_id": 57, + "content": " if args.my_pos_emb > 0:\n self.pos_emb_x = nn.Parameter(torch.zeros((1,args.my_pos_emb,args.n_embd)))\n self.pos_emb_y = nn.Parameter(torch.zeros((args.my_pos_emb,1,args.n_embd)))\n if self.layer_id == 0 and self.args.pre_ffn > 0:\n self.ffnPre = RWKV_ChannelMix(args, 0)\n else:\n self.att = RWKV_TimeMix_RWKV5(args, layer_id)\n if 'g' in os.environ[\"RWKV_MY_TESTING\"]:\n self.ffn = MishGLU(args, layer_id)\n else:\n self.ffn = RWKV_ChannelMix(args, layer_id)\n if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n self.tiny_ln = nn.LayerNorm(args.n_embd)\n self.tiny_q = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_k = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_v = nn.Linear(args.n_embd, args.n_embd, bias=False)\n self.register_buffer(\"tiny_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))", + "type": "code", + "location": "/RWKV-v5/src/model.py:257-276" + }, + "1679": { + "file_id": 57, + "content": "The code initializes model components based on provided arguments and layer ID. If `my_pos_emb > 0`, it creates positional embedding parameters. If `pre_ffn > 0`, it creates an FFN layer (RWKV_ChannelMix) for the first layer. Depending on environment, it also initializes ffn layers (MishGLU or RWKV_ChannelMix). 
For tiny_att_dim and specific layer ID, it registers linear layers and buffer for tiny attention implementation.", + "type": "comment" + }, + "1680": { + "file_id": 57, + "content": " if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)\n self.drop1 = nn.Dropout(p = args.dropout)\n def forward(self, x, x_emb=None):\n args = self.args\n B, T, C = x.size()\n if self.layer_id == 0:\n x = self.ln0(x)\n if args.my_pos_emb > 0:\n pos_emb = (self.pos_emb_x + self.pos_emb_y).reshape(T+1, -1)[:-1,:]\n x = x + pos_emb\n if self.args.dropout == 0:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n else:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = self.drop0(x + self.ffnPre(self.ln1(x)))\n else:\n x = self.drop0(x + self.att(self.ln1(x)))\n x = self.drop1(x + self.ffn(self.ln2(x)))\n if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n xx = self.tiny_ln(x)", + "type": "code", + "location": "/RWKV-v5/src/model.py:278-305" + }, + "1681": { + "file_id": 57, + "content": "The code implements a layer of the RWKV-v5 model, including dropout regularization, layers normalization, feed-forward network (FFN), and attention mechanism. The dropout rate is determined by the \"args\" parameter and can be zero. If the pre_ffn argument is greater than 0, it executes FFN before other operations. 
The code also includes a tiny attention layer at a specific layer specified by the \"tiny_att_dim\" argument.", + "type": "comment" + }, + "1682": { + "file_id": 57, + "content": " q = self.tiny_q(xx)[:, :T, :]\n k = self.tiny_k(xx)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (args.tiny_att_dim ** (-0.5))\n c = c.masked_fill(self.tiny_mask[:T, :T] == 0, 0)\n x = x + c @ self.tiny_v(x_emb)\n return x\nclass L2Wrap(torch.autograd.Function):\n @staticmethod\n def forward(ctx, loss, y):\n ctx.save_for_backward(y)\n return loss\n @staticmethod\n def backward(ctx, grad_output):\n y = ctx.saved_tensors[0]\n # to encourage the logits to be close to 0\n factor = 1e-4 / (y.shape[0] * y.shape[1])\n maxx, ids = torch.max(y, -1, keepdim=True)\n gy = torch.zeros_like(y)\n gy.scatter_(-1, ids, maxx * factor)\n return (grad_output, gy)\nclass RWKV(pl.LightningModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n if not hasattr(args, 'dim_att'):\n args.dim_att = args.n_embd\n if not hasattr(args, 'dim_ffn'):\n args.dim_ffn = args.n_embd * 4", + "type": "code", + "location": "/RWKV-v5/src/model.py:306-338" + }, + "1683": { + "file_id": 57, + "content": "This code defines a class RWKV, which inherits from LightningModule. It includes an initialization method that takes arguments and creates instance variables. The class also has a forward method for the L2Wrap function, which computes a loss and applies a specific gradient calculation. 
This appears to be part of a deep learning model implementation.", + "type": "comment" + }, + "1684": { + "file_id": 57, + "content": " if not hasattr(args, 'tiny_att_layer'):\n args.tiny_att_layer = -1\n if not hasattr(args, 'tiny_att_dim'):\n args.tiny_att_dim = -1\n assert args.n_embd % 32 == 0\n assert args.dim_att % 32 == 0\n assert args.dim_ffn % 32 == 0\n self.emb = nn.Embedding(args.vocab_size, args.n_embd)\n self.blocks = nn.ModuleList([Block(args, i) for i in range(args.n_layer)])\n self.ln_out = nn.LayerNorm(args.n_embd)\n self.head = nn.Linear(args.n_embd, args.vocab_size, bias=False)\n if args.head_qk > 0:\n self.head_q = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.head_k = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.register_buffer(\"copy_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)\n def configure_optimizers(self):\n args = self.args\n lr_decay = set()\n lr_1x = set()\n lr_2x = set()", + "type": "code", + "location": "/RWKV-v5/src/model.py:339-366" + }, + "1685": { + "file_id": 57, + "content": "The code defines a model with multiple layers, embeddings, and optional components depending on the provided arguments. 
It also sets up optimizer configurations based on the specified arguments.", + "type": "comment" + }, + "1686": { + "file_id": 57, + "content": " lr_3x = set()\n for n, p in self.named_parameters():\n if (\"time_mix\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_decay\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_3x.add(n)\n else:\n lr_2x.add(n)\n elif (\"time_faaaa\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_first\" in n) and (args.layerwise_lr > 0):\n lr_3x.add(n)\n elif (len(p.squeeze().shape) >= 2) and (args.weight_decay > 0):\n lr_decay.add(n)\n else:\n lr_1x.add(n)\n lr_decay = sorted(list(lr_decay))\n lr_1x = sorted(list(lr_1x))\n lr_2x = sorted(list(lr_2x))\n lr_3x = sorted(list(lr_3x))", + "type": "code", + "location": "/RWKV-v5/src/model.py:367-394" + }, + "1687": { + "file_id": 57, + "content": "This code is assigning different learning rates based on parameter names and certain conditions. It first defines four sets (lr_1x, lr_2x, lr_3x, lr_decay) and then populates them according to specific criteria: if a parameter's name matches certain strings or has a layerwise learning rate greater than 0, it will be added to the corresponding set. 
Finally, the sets are sorted and stored.", + "type": "comment" + }, + "1688": { + "file_id": 57, + "content": " # print('decay', lr_decay)\n # print('1x', lr_1x)\n # print('2x', lr_2x)\n # print('3x', lr_3x)\n param_dict = {n: p for n, p in self.named_parameters()}\n if args.layerwise_lr > 0:\n if args.my_pile_stage == 2:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 2e-3 / args.lr_init},\n {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 3e-3 / args.lr_init},\n ]\n else:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 2.0},\n {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 3.0},", + "type": "code", + "location": "/RWKV-v5/src/model.py:395-412" + }, + "1689": { + "file_id": 57, + "content": "The code defines different optimizer groups for layers based on the desired layer-wise learning rate. 
It separates parameters into three groups with scaling factors of 1x, 2x, and 3x (relative to initial learning rate), and sets their respective weights decay to zero.", + "type": "comment" + }, + "1690": { + "file_id": 57, + "content": " ]\n else:\n optim_groups = [{\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0}]\n if args.weight_decay > 0:\n optim_groups += [{\"params\": [param_dict[n] for n in lr_decay], \"weight_decay\": args.weight_decay, \"my_lr_scale\": 1.0}]\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=True, amsgrad=False)\n return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=True, amsgrad=False)\n else:\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=False, weight_decay=0, amsgrad=False)\n return FusedAdam(optim_groups, lr=self.args.lr_init, betas=", + "type": "code", + "location": "/RWKV-v5/src/model.py:413-425" + }, + "1691": { + "file_id": 57, + "content": "This code sets up an optimizer for the model. It creates a list of optimization groups, adding either a group with no weight decay or a group with weight decay based on arguments provided. 
Depending on the presence of a weight decay argument and whether DeepSpeed is being used, it returns a corresponding optimizer - either DeepSpeedCPUAdam or FusedAdam.", + "type": "comment" + }, + "1692": { + "file_id": 57, + "content": "self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)\n # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)\n @property\n def deepspeed_offload(self) -> bool:\n strategy = self.trainer.strategy\n if isinstance(strategy, DeepSpeedStrategy):\n cfg = strategy.config[\"zero_optimization\"]\n return cfg.get(\"offload_optimizer\") or cfg.get(\"offload_param\")\n return False\n def forward(self, idx):\n args = self.args\n B, T = idx.size()\n assert T <= args.ctx_len, \"Cannot forward, model ctx_len is exhausted.\"\n x = self.emb(idx)\n x_emb = x\n if args.dropout > 0:\n x = self.drop0(x)\n if args.tiny_att_dim > 0:\n for block in self.blocks:\n if args.grad_cp == 1:\n x = deepspeed.checkpointing.checkpoint(block, x, x_emb)", + "type": "code", + "location": "/RWKV-v5/src/model.py:425-449" + }, + "1693": { + "file_id": 57, + "content": "Lines 424-448: Initialize Adam optimizer with specified arguments, including learning rate, betas, epsilon, bias correction, weight decay, and amsgrad flag.\nLine 451-456: Check if the current strategy is a DeepSpeedStrategy and return whether to offload the optimizer or parameters from this model.\nLine 458-470: Define the forward function for the model, applying embedding, dropout (if enabled), and potentially using DeepSpeed checkpointing with gradient clip (if enabled).", + "type": "comment" + }, + "1694": { + "file_id": 57, + "content": " else:\n x = block(x, x_emb)\n else:\n for block in self.blocks:\n if args.grad_cp == 1:\n x = deepspeed.checkpointing.checkpoint(block, x)\n else:\n x = block(x)\n x = self.ln_out(x)\n if 
args.head_qk > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / args.head_qk)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n c = c @ F.one_hot(idx, num_classes=args.vocab_size)\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).bfloat16()\n x = self.head(x) + c\n else:\n x = self.head(x)\n return x", + "type": "code", + "location": "/RWKV-v5/src/model.py:450-478" + }, + "1695": { + "file_id": 57, + "content": "This code is defining a multi-head attention mechanism in a transformer model, including options for head masking and different float modes (FP16 or BF16). It applies checkpointing for gradient accumulation and performs matrix operations to generate context vectors.", + "type": "comment" + }, + "1696": { + "file_id": 57, + "content": " def training_step(self, batch, batch_idx):\n args = self.args\n if args.my_qa_mask != 1:\n idx, targets = batch\n logits = self(idx)\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # if '0' in os.environ[\"RWKV_MY_TESTING\"]:\n # print('logits', logits)\n # torch.set_printoptions(threshold=10000)\n # print('idx', idx)\n # exit(0)\n else:\n idx, targets, mask = batch\n mask = mask.view(-1)\n sum_mask = torch.sum(mask).item()\n # if sum_mask == 0:\n # return torch.tensor([0.0], requires_grad=True)\n logits = self(idx)\n if sum_mask == mask.shape[0]:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # print('rank', self.global_rank, 'loss', loss.item())\n else:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none')", + "type": "code", + "location": "/RWKV-v5/src/model.py:480-503" + }, + "1697": { + "file_id": 57, + "content": "The code defines a training 
step function for the model. It checks an environment variable and performs different operations based on its value. If the variable is not equal to 1, it computes logits from input idx, calculates cross-entropy loss, and prints logits and idx if certain conditions are met. If the variable is equal to 1, it handles batches with masks, checks for mask sum, and adjusts the loss calculation based on whether all values in the mask are 0 or not. It also prints the loss for specific ranks.", + "type": "comment" + }, + "1698": { + "file_id": 57, + "content": " # loss_raw = loss\n loss = torch.sum(loss * mask) / sum_mask\n # torch.set_printoptions(threshold=10000)\n # if True: #self.global_rank == 1:\n # tmp = ''\n # sss = 0\n # ccc = 0\n # for i in range(mask.shape[0]):\n # if mask[i] > 0:\n # tmp += str(idx.view(-1)[i].item()) + ','\n # sss += loss_raw.view(-1)[i].float().item()\n # ccc += 1\n # print('rank', self.global_rank, 'loss', loss.item(), 'lavg', sss / ccc)#, 'tmp', tmp, 'input', idx)\n return L2Wrap.apply(loss, logits)\n def training_step_end(self, batch_parts):\n if pl.__version__[0]!='2':\n all = self.all_gather(batch_parts)\n if self.trainer.is_global_zero:\n self.trainer.my_loss_all = all\n def generate_init_weight(self):\n print(\n f\"\"\"\n############################################################################", + "type": "code", + "location": "/RWKV-v5/src/model.py:504-530" + }, + "1699": { + "file_id": 57, + "content": "In this code snippet, the model calculates the loss for a batch of data. The loss is calculated by summing up the element-wise multiplication of the loss and a mask, then dividing by the sum of the mask. 
The code also includes additional logging functionality to gather statistics on the training process across multiple GPUs using PyTorch's all_gather function.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/17.json b/docs/data/17.json new file mode 100644 index 00000000..9b6976e3 --- /dev/null +++ b/docs/data/17.json @@ -0,0 +1,449 @@ +{ + "1700": { + "file_id": 57, + "content": "#\n# Init model weight (slow for large models)...\n#\n############################################################################\n\"\"\"\n )\n m = {}\n for n in self.state_dict():\n p = self.state_dict()[n]\n shape = p.shape\n gain = 1.0\n scale = 1.0\n if \"ln_\" in n or \".ln\" in n or \"time_\" in n or \"_mask\" in n or \"pos_emb\" in n or '.mask.' in n:\n if 'ln_x.weight' in n:\n layer_scale = (1+int(n.split('.')[1])) / self.args.n_layer\n m[n] = (p * 0.0) + (layer_scale ** 0.7)\n else:\n m[n] = p\n else:\n if n == \"emb.weight\":\n scale = -1 * self.args.lr_init\n else:\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n zero = [\".att.output.\", \".ffn.value.\", \".ffn.receptance.\", \".ffnPre.value.\", \".ffnPre.receptance.\", \"head_q.\", '.oo.', '.rr.']\n for kk in zero:", + "type": "code", + "location": "/RWKV-v5/src/model.py:531-559" + }, + "1701": { + "file_id": 57, + "content": "This code initializes the model's weights, handling specific layers and adjusting gains and scales accordingly. 
It also handles layer normalization (ln_x.weight), embedding scale initialization (-1 * lr_init), and certain zero-initialized layers.", + "type": "comment" + }, + "1702": { + "file_id": 57, + "content": " if kk in n:\n scale = 0\n if n == \"head.weight\":\n scale = 0.5\n if \"head_k.\" in n:\n scale = 0.1\n if \"head_q.\" in n:\n scale = 0\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {str(scale).ljust(4)} {n}\")\n if self.args.accelerator.upper() == \"GPU\":\n m[n] = torch.empty((shape[0], shape[1]), device=\"cuda\")\n else:\n m[n] = torch.empty((shape[0], shape[1]))\n if scale == 0:\n nn.init.zeros_(m[n])\n elif scale < 0:\n nn.init.uniform_(m[n], a=scale, b=-scale)\n else:\n nn.init.orthogonal_(m[n], gain=gain * scale)\n m[n] = m[n].cpu()\n if os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n m[n] = m[n].half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":", + "type": "code", + "location": "/RWKV-v5/src/model.py:560-586" + }, + "1703": { + "file_id": 57, + "content": "This code initializes the model's weights with different scales based on their names. If the name contains \"head_k.\" or \"head_q.\", the scale is set to 0. If the name is \"head.weight\", the scale is set to 0.5. Otherwise, the scale is set to 0. The weights are initialized using PyTorch's initialization functions depending on their scales. Finally, the model parameters are moved to CPU and potentially converted to BF16 or FP16 if environment variables RWKV_FLOAT_MODE is set to \"bf16\" or \"fp16\".", + "type": "comment" + }, + "1704": { + "file_id": 57, + "content": " m[n] = m[n].bfloat16()\n # if n == \"emb.weight\":\n # print(m[n])\n gc.collect()\n torch.cuda.empty_cache()\n return m", + "type": "code", + "location": "/RWKV-v5/src/model.py:587-594" + }, + "1705": { + "file_id": 57, + "content": "In this code snippet, the model's parameters are converted to bfloat16 and then garbage collected, followed by clearing CUDA cache. 
This improves memory usage and efficiency.", + "type": "comment" + }, + "1706": { + "file_id": 58, + "content": "/RWKV-v5/src/trainer.py", + "type": "filepath" + }, + "1707": { + "file_id": 58, + "content": "The code defines a `my_save()` function for saving PyTorch Lightning model data using AWS S3, handles learning rate scheduling and logs progress. It also loads, reshapes and converts a model dictionary, performs interpolation, saves epoch information, and generates initial weights for model training.", + "type": "summary" + }, + "1708": { + "file_id": 58, + "content": "import os, math, time, datetime, subprocess\nimport torch\nfrom torch.utils.data import DataLoader\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\ndef my_save(args, trainer, dd, ff):\n if '14b-run1' in ff:\n fn = ff.split('/')[-1]\n fff = '/dev/shm/' + fn\n torch.save(dd, fff)\n subprocess.Popen(f\" aws s3 mv {fff} s3://rwkv-14b-4k/{fn} --quiet\", shell=True)\n elif ('world/14b' in ff) or ('world/7b' in ff):\n aa = ff.split('/')[1]\n fn = ff.split('/')[-1]\n fff = f'/dev/shm/{aa}-{fn}'\n torch.save(dd, fff)\n subprocess.Popen(f\" aws s3 mv {fff} s3://rwkv-world/{aa}-{fn} --quiet\", shell=True)\n else:\n if 'deepspeed_stage_3' in args.strategy:\n trainer.save_checkpoint(ff, weights_only=True)\n else:\n torch.save(dd, ff)\nclass train_callback(pl.Callback):\n def __init__(self, args):\n super().__init__()\n self.args = args\n def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:1-30" + }, + "1709": { + "file_id": 58, + "content": "This code defines a function `my_save()` that saves the model's data depending on the file path (`ff`) and calls another function `train_callback()`, which is a PyTorch Lightning callback class. 
The code also uses subprocess to move saved files to AWS S3 storage and incorporates the usage of Deepspeed for distributed training.", + "type": "comment" + }, + "1710": { + "file_id": 58, + "content": " args = self.args\n # if args.cuda_cleanup > 0:\n # torch.cuda.empty_cache()\n real_step = trainer.global_step + args.epoch_begin * args.epoch_steps\n # LR schedule\n w_step = args.warmup_steps\n if args.lr_final == args.lr_init or args.epoch_count == 0:\n lr = args.lr_init\n else:\n decay_step = real_step - args.my_pile_edecay * args.epoch_steps\n decay_total = (args.epoch_count - args.my_pile_edecay) * args.epoch_steps\n progress = (decay_step - w_step + 1) / (decay_total - w_step)\n progress = min(1, max(0, progress))\n if args.lr_final == 0 or args.lr_init == 0: # linear decay\n lr = args.lr_init + (args.lr_final - args.lr_init) * progress\n else: # exp decay\n lr = args.lr_init * math.exp(math.log(args.lr_final / args.lr_init) * pow(progress, 1))\n # if trainer.is_global_zero:\n # print(trainer.global_step, decay_step, decay_total, w_step, progress, lr)", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:31-51" + }, + "1711": { + "file_id": 58, + "content": "Code snippet handles learning rate (LR) scheduling and potentially clears GPU cache based on provided arguments. It calculates the real training step, determines if LR should be adjusted based on epoch count and warmup steps, and applies linear or exponential decay to adjust the learning rate. 
It also prints some info if it's the global zero trainer.", + "type": "comment" + }, + "1712": { + "file_id": 58, + "content": " if args.my_exit_tokens != 0: # cosine decay\n real_tokens = real_step * args.ctx_len * args.real_bsz\n warmup_tokens = w_step * args.ctx_len * args.real_bsz\n progress = (real_tokens - warmup_tokens) / (abs(args.my_exit_tokens) - warmup_tokens)\n progress = max(0, min(1, progress))\n lr_final_factor = args.lr_final / args.lr_init \n lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor / 2) * math.cos(math.pi * progress)\n if args.my_exit_tokens > 0:\n lr = args.lr_init * lr_mult\n else:\n lr = (lr + args.lr_init * lr_mult) / 2\n if progress >= 1:\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy):\n my_save(\n args, trainer,\n pl_module.state_dict(),\n f\"{args.proj_dir}/rwkv-final.pth\",\n )\n exit(0)\n if trainer.global_step < w_step:", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:53-72" + }, + "1713": { + "file_id": 58, + "content": "This code is setting the learning rate (lr) based on the exit tokens. If my_exit_tokens > 0, lr = lr_init * lr_mult. If my_exit_tokens < 0, lr = (lr + lr_init * lr_mult) / 2. The code also saves and exits if progress >= 1 or if global step is less than w_step. 
This appears to be part of a training process where the learning rate dynamically adjusts during training based on exit tokens.", + "type": "comment" + }, + "1714": { + "file_id": 58, + "content": " lr = lr * (0.2 + 0.8 * trainer.global_step / w_step)\n if args.weight_decay_final > 0:\n wd_now = args.weight_decay * math.exp(math.log(args.weight_decay_final / args.weight_decay) * progress)\n else:\n wd_now = args.weight_decay\n for param_group in trainer.optimizers[0].param_groups:\n if param_group[\"weight_decay\"] > 0:\n param_group[\"weight_decay\"] = wd_now\n if args.layerwise_lr > 0:\n param_group[\"lr\"] = lr * param_group[\"my_lr_scale\"]\n # print(param_group[\"lr\"], param_group[\"my_lr_scale\"])\n else:\n param_group[\"lr\"] = lr\n trainer.my_lr = lr\n trainer.my_wd = wd_now\n # rank_zero_info(f\"{real_step} {lr}\")\n if trainer.global_step == 0:\n if trainer.is_global_zero: # logging\n trainer.my_loss_sum = 0\n trainer.my_loss_count = 0\n trainer.my_log = open(args.proj_dir + \"/train_log.txt\", \"a\")", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:73-97" + }, + "1715": { + "file_id": 58, + "content": "The code sets the learning rate (lr) based on a decay formula and updates the weight decay (wd_now). It iterates through each param group, setting the lr and wd accordingly. If layerwise learning rate is enabled, it adjusts the lr further based on my_lr_scale. 
The trainer's current lr and wd are stored for future reference, and logging is initialized if this is the first global step.", + "type": "comment" + }, + "1716": { + "file_id": 58, + "content": " trainer.my_log.write(f\"NEW RUN {args.my_timestamp}\\n{vars(self.args)}\\n\")\n try:\n print(f\"\\n{trainer.strategy.config}\\n\")\n trainer.my_log.write(f\"{trainer.strategy.config}\\n\")\n except:\n pass\n trainer.my_log.flush()\n if len(args.wandb) > 0:\n print(\"Login to wandb...\")\n import wandb\n wandb.init(\n project=args.wandb,\n name=args.run_name + \" \" + args.my_timestamp,\n config=args,\n save_code=False,\n )\n trainer.my_wandb = wandb\n def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):\n args = self.args\n token_per_step = args.ctx_len * args.real_bsz\n real_step = trainer.global_step + args.epoch_begin * args.epoch_steps\n if trainer.is_global_zero: # logging", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:98-120" + }, + "1717": { + "file_id": 58, + "content": "Writes log information to file, tries printing strategy configuration but handles exceptions, flushes the log, initializes W&B if enabled. 
In on_train_batch_end, calculates token per step, determines real step, logs only on global_step 0 (zero-based indexing).", + "type": "comment" + }, + "1718": { + "file_id": 58, + "content": " t_now = time.time_ns()\n kt_s = 0\n try:\n t_cost = (t_now - trainer.my_time_ns) / 1e9\n kt_s = token_per_step / t_cost / 1000\n self.log(\"REAL it/s\", 1.0 / t_cost, prog_bar=True, on_step=True)\n self.log(\"Kt/s\", kt_s, prog_bar=True, on_step=True)\n except:\n pass\n trainer.my_time_ns = t_now\n if pl.__version__[0]=='2':\n trainer.my_loss = outputs[\"loss\"]\n else:\n trainer.my_loss = trainer.my_loss_all.float().mean().item()\n trainer.my_loss_sum += trainer.my_loss\n trainer.my_loss_count += 1\n trainer.my_epoch_loss = trainer.my_loss_sum / trainer.my_loss_count\n self.log(\"lr\", trainer.my_lr, prog_bar=True, on_step=True)\n self.log(\"loss\", trainer.my_epoch_loss, prog_bar=True, on_step=True)\n # self.log(\"s\", real_step, prog_bar=True, on_step=True)\n if len(args.wandb) > 0:", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:121-142" + }, + "1719": { + "file_id": 58, + "content": "Code block calculates the time taken for training step, real iterations per second (it/s), and kilo-tokens per second (Kt/s). It also logs the learning rate (lr) and current loss for progress tracking. If using PyTorch 2 version, it retrieves loss value differently. 
It also logs the epoch loss and optionally sends information to W&B if specified in arguments.", + "type": "comment" + }, + "1720": { + "file_id": 58, + "content": " lll = {\"loss\": trainer.my_loss, \"lr\": trainer.my_lr, \"wd\": trainer.my_wd, \"Gtokens\": real_step * token_per_step / 1e9}\n if kt_s > 0:\n lll[\"kt/s\"] = kt_s\n trainer.my_wandb.log(lll, step=int(real_step))\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy): # save pth\n if args.magic_prime > 0:\n expand_factor = 2 if args.my_qa_mask > 0 else 1\n if int(real_step) == int(args.magic_prime * expand_factor // args.real_bsz) - 1 + int(args.my_random_steps):\n to_save_dict = pl_module.state_dict()\n my_save(\n args, trainer,\n to_save_dict,\n f\"{args.proj_dir}/rwkv-final.pth\",\n )\n def on_train_epoch_start(self, trainer, pl_module):\n args = self.args\n if pl.__version__[0]=='2':\n dataset = trainer.train_dataloader.dataset\n else:\n dataset = trainer.train_dataloader.dataset.datasets", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:143-164" + }, + "1721": { + "file_id": 58, + "content": "This code is responsible for logging training metrics and saving the model checkpoint. It checks if it's the global zero or using DeepSpeed, and saves the model state dictionary as \"rwkv-final.pth\" when the current step meets certain conditions related to batch size and random steps. 
The logging includes loss, learning rate, weight decay, and token count per second (if applicable).", + "type": "comment" + }, + "1722": { + "file_id": 58, + "content": " assert \"MyDataset\" in str(dataset)\n dataset.global_rank = trainer.global_rank\n dataset.real_epoch = int(args.epoch_begin + trainer.current_epoch)\n dataset.world_size = trainer.world_size\n # print(f'########## world_size {dataset.world_size} global_rank {dataset.global_rank} real_epoch {dataset.real_epoch} ##########')\n def on_train_epoch_end(self, trainer, pl_module):\n args = self.args\n to_save_dict = {}\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy): # save pth\n if (args.epoch_save > 0 and trainer.current_epoch % args.epoch_save == 0) or (trainer.current_epoch == args.epoch_count - 1):\n if args.data_type == 'wds_img':\n raw_dict = pl_module.state_dict()\n for k in raw_dict:\n if k.startswith('encoder.') or k.startswith('decoder.'):\n to_save_dict[k] = raw_dict[k]\n else:\n to_save_dict = pl_module.state_dict()", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:165-182" + }, + "1723": { + "file_id": 58, + "content": "This code snippet is part of the trainer class and defines a method 'on_train_epoch_end'. It asserts that the dataset has the name 'MyDataset' and assigns values to the dataset object properties. 
If global_zero or 'deepspeed_stage_3' strategy is used, it saves the model state at specified epoch intervals.", + "type": "comment" + }, + "1724": { + "file_id": 58, + "content": " try:\n my_save(\n args, trainer,\n to_save_dict,\n f\"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}.pth\",\n )\n except Exception as e:\n print('Error\\n\\n', e, '\\n\\n')\n if trainer.is_global_zero: # logging\n trainer.my_log.write(f\"{args.epoch_begin + trainer.current_epoch} {trainer.my_epoch_loss:.6f} {math.exp(trainer.my_epoch_loss):.4f} {trainer.my_lr:.8f} {datetime.datetime.now()} {trainer.current_epoch}\\n\")\n trainer.my_log.flush()\n trainer.my_loss_sum = 0\n trainer.my_loss_count = 0\n if (args.epoch_begin + trainer.current_epoch) >= args.my_exit:\n exit(0)\n@rank_zero_only\ndef generate_init_weight(model, init_weight_name):\n mm = model.generate_init_weight()\n if model.args.my_pile_stage == 1:\n if len(model.args.load_model) > 0:\n print(f\"Combine weights from {model.args.load_model}...\")", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:183-208" + }, + "1725": { + "file_id": 58, + "content": "Trying to save the model, log epoch information, and optionally exit if the current epoch exceeds a specified limit. 
Additionally, there's a function for generating initial weights, combining with pre-existing ones if available.", + "type": "comment" + }, + "1726": { + "file_id": 58, + "content": " load_dict = torch.load(model.args.load_model, map_location=\"cpu\")\n for k in load_dict:\n try:\n assert k in mm\n except:\n print('missing', k)\n exit(0)\n src = load_dict[k]\n try:\n mm[k] = src.reshape(mm[k].shape)\n except:\n tmp = mm[k].squeeze().clone()\n print(k, src.shape, '-->', mm[k].shape)\n ss = src.shape[0]\n dd = tmp.shape[0]\n for i in range(dd):\n pos = i / dd * ss\n if pos >= ss - 1:\n tmp[i] = src[ss-1]\n else:\n p0 = int(math.floor(pos))\n ii = pos - p0\n tmp[i] = src[p0] * (1-ii) + src[p0+1] * (ii)\n mm[k] = tmp.reshape(mm[k].shape)\n sss = src.squeeze().float().cpu().numpy()", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:209-233" + }, + "1727": { + "file_id": 58, + "content": "The code loads a dictionary from the specified model file, checks if all keys are present in 'mm' dictionary, and reshapes the loaded source to match the shape of existing data in 'mm'. If source shape doesn't match, it performs a linear interpolation to fit the new data. Finally, converts the source to float and cpu numpy array.", + "type": "comment" + }, + "1728": { + "file_id": 58, + "content": " print(sss[:10], '...', sss[-10:])\n mmm = mm[k].squeeze().float().cpu().numpy()\n print(mmm[:10], '...', mmm[-10:])\n print(f\"Save to {init_weight_name}...\")\n torch.save(mm, init_weight_name)\n if model.args.my_pile_stage == 1:\n print(\"Done. Now go for stage 2.\")\n exit(0)", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:234-243" + }, + "1729": { + "file_id": 58, + "content": "This code segment prints parts of 'sss' and 'mmm', saves 'mm' to a file, and if in stage 1, suggests moving on to stage 2. 
It seems to be part of a model training process where it displays data, saves an intermediate model checkpoint, and moves to the next phase.", + "type": "comment" + }, + "1730": { + "file_id": 59, + "content": "/RWKV-v5/src/utils.py", + "type": "filepath" + }, + "1731": { + "file_id": 59, + "content": "The code imports libraries, defines a tokenizing class, utilizes Fermat's Little Theorem and Miller-Rabin primality test for prime number calculation, performs verification process to determine if a given number is prime, and returns True or False accordingly.", + "type": "summary" + }, + "1732": { + "file_id": 59, + "content": "import json, time, random, os\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\ntime_slot = {}\ntime_ref = time.time_ns()\ndef record_time(name):\n if name not in time_slot:\n time_slot[name] = 1e20\n tt = (time.time_ns() - time_ref) / 1e9\n if tt < time_slot[name]:\n time_slot[name] = tt\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n if 'list' in str(type(WORD_NAME)):\n self.charMode = False\n if WORD_NAME[0] == WORD_NAME[1]:\n from transformers import PreTrainedTokenizerFast\n self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])\n else:\n from transformers import GPT2TokenizerFast\n self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])\n self.vocab_size = len(self.tokenizer)\n else:\n self.charMode = True\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:1-30" + }, + "1733": { + "file_id": 59, + "content": "This code imports necessary libraries and defines a class for tokenizing text using either pre-trained tokenizers or custom word tables. 
It also records time usage for optimization purposes.", + "type": "comment" + }, + "1734": { + "file_id": 59, + "content": " self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(out, dim=-1)\n if self.charMode:\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n else:", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:32-60" + }, + "1735": { + "file_id": 59, + "content": "This code snippet is part of a model for text generation. 
The 'utils' class contains methods to refine the context by removing extra whitespace and unwanted characters, set vocabulary size based on the word table, map words to integers and vice versa, and sample logits to generate text using softmax function with option to specify temperature and top probabilities for specific characters.", + "type": "comment" + }, + "1736": { + "file_id": 59, + "content": " top_p = top_p_usual\n if os.environ[\"RWKV_RUN_DEVICE\"] == \"cpu\":\n probs = probs.numpy()\n sorted_probs = np.sort(probs)[::-1]\n cumulative_probs = np.cumsum(sorted_probs)\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n probs = probs / np.sum(probs)\n out = np.random.choice(a=len(probs), p=probs)\n return out\n else:\n sorted_probs = torch.sort(probs, descending=True)[0]\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).cpu().numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n out = torch.multinomial(probs, num_samples=1)[0]\n return out\ndef MaybeIsPrime(number):\n if FermatPrimalityTest(number) and MillerRabinPrimalityTest(number):", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:61-85" + }, + "1737": { + "file_id": 59, + "content": "This function calculates prime numbers using Fermat's Little Theorem and Miller-Rabin primality test. The prime number check is done only if the environment variable RWKV_RUN_DEVICE is set to \"cpu\". 
If not, it uses torch.multinomial() for selection.", + "type": "comment" + }, + "1738": { + "file_id": 59, + "content": " return True\n else:\n return False\ndef FermatPrimalityTest(number):\n if number > 1:\n for time in range(3):\n randomNumber = random.randint(2, number) - 1\n if pow(randomNumber, number - 1, number) != 1:\n return False\n return True\n else:\n return False\ndef MillerRabinPrimalityTest(number):\n if number == 2:\n return True\n elif number == 1 or number % 2 == 0:\n return False\n oddPartOfNumber = number - 1\n timesTwoDividNumber = 0\n while oddPartOfNumber % 2 == 0:\n oddPartOfNumber = oddPartOfNumber // 2\n timesTwoDividNumber = timesTwoDividNumber + 1\n for time in range(3):\n while True:\n randomNumber = random.randint(2, number) - 1\n if randomNumber != 0 and randomNumber != 1:\n break\n randomNumberWithPower = pow(randomNumber, oddPartOfNumber, number)\n if (randomNumberWithPower != 1) and (randomNumberWithPower != number - 1):\n iterationNumber = 1", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:86-122" + }, + "1739": { + "file_id": 59, + "content": "The code contains three functions: 1) FermatPrimalityTest checks if a number is prime using the Fermat Little Theorem; it returns True if number is greater than 1, and False otherwise. 2) MillerRabinPrimalityTest checks if a number is prime using the Miller-Rabin algorithm; it returns True for number=2, False for number=1 or even numbers, and starts by simplifying the odd part of the number. 3) The code uses a while loop to calculate the odd part of the number and determines the maximum iterations needed. 
The function then uses another loop to randomly select randomNumbers and test them for primality using the calculated parameters.", + "type": "comment" + }, + "1740": { + "file_id": 59, + "content": " while (iterationNumber <= timesTwoDividNumber - 1) and (randomNumberWithPower != number - 1):\n randomNumberWithPower = pow(randomNumberWithPower, 2, number)\n iterationNumber = iterationNumber + 1\n if randomNumberWithPower != (number - 1):\n return False\n return True", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:124-130" + }, + "1741": { + "file_id": 59, + "content": "This code performs a verification process where it checks if the given number can be determined as prime. It does this by iterating through a range of values, squaring a random number and calculating its modulo with the input number until either all iterations are completed or the value no longer matches the expected result. If the latter occurs, it returns False indicating that the number is not prime. Otherwise, it returns True.", + "type": "comment" + }, + "1742": { + "file_id": 60, + "content": "/RWKV-v5/tokenizer/rwkv_tokenizer.py", + "type": "filepath" + }, + "1743": { + "file_id": 60, + "content": "This code defines a tokenizer class using a trie data structure, with methods for encoding and decoding bytes. 
The TRIE_TOKENIZER initializes the trie by reading lines from a file and includes a 'find_longest' method for finding the longest token within a key string.", + "type": "summary" + }, + "1744": { + "file_id": 60, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nclass TRIE:\n __slots__ = tuple(\"ch,to,values,front\".split(\",\"))\n to:list\n values:set\n def __init__(self, front=None, ch=None):\n self.ch = ch\n self.to = [None for ch in range(256)]\n self.values = set()\n self.front = front\n def __repr__(self):\n fr = self\n ret = []\n while(fr!=None):\n if(fr.ch!=None):\n ret.append(fr.ch)\n fr = fr.front\n return \"\"%(ret[::-1], self.values)\n def add(self, key:bytes, idx:int=0, val=None):\n if(idx == len(key)):\n if(val is None):\n val = key\n self.values.add(val)\n return self\n ch = key[idx]\n if(self.to[ch] is None):\n self.to[ch] = TRIE(front=self, ch=ch)", + "type": "code", + "location": "/RWKV-v5/tokenizer/rwkv_tokenizer.py:1-32" + }, + "1745": { + "file_id": 60, + "content": "Class \"TRIE\" for RWKV Language Model - represents a Trie data structure used for tokenizing input strings, storing values associated with each character path in the trie.", + "type": "comment" + }, + "1746": { + "file_id": 60, + "content": " return self.to[ch].add(key, idx=idx+1, val=val)\n def find_longest(self, key:bytes, idx:int=0):\n u:TRIE = self\n ch:int = key[idx]\n while(u.to[ch] is not None):\n u = u.to[ch]\n idx += 1\n if(u.values):\n ret = idx, u, u.values\n if(idx==len(key)):\n break\n ch = key[idx]\n return ret\nclass TRIE_TOKENIZER():\n def __init__(self, file_name):\n self.idx2token = {}\n sorted = [] # must be already sorted\n with open(file_name, \"r\", encoding=\"utf-8\") as f:\n lines = f.readlines()\n for l in lines:\n idx = 
int(l[:l.index(' ')])\n x = eval(l[l.index(' '):l.rindex(' ')])\n x = x.encode(\"utf-8\") if isinstance(x, str) else x\n assert isinstance(x, bytes)\n assert len(x) == int(l[l.rindex(' '):])\n sorted += [x]\n self.idx2token[idx] = x\n self.token2idx = {}\n for k,v in self.idx2token.items():\n self.token2idx[v] = int(k)", + "type": "code", + "location": "/RWKV-v5/tokenizer/rwkv_tokenizer.py:33-66" + }, + "1747": { + "file_id": 60, + "content": "Code is defining a TRIE data structure for tokenizing and mapping indices to tokens in a specified file. The `TRIE_TOKENIZER` class initializes the data structure by reading lines from a file, converting values to bytes, storing them in the TRIE, and creating a reverse index. The `find_longest` method is used for finding the longest token within a given key string.", + "type": "comment" + }, + "1748": { + "file_id": 60, + "content": " self.root = TRIE()\n for t, i in self.token2idx.items():\n _ = self.root.add(t, val=(t, i))\n def encodeBytes(self, src:bytes):\n idx:int = 0\n tokens = []\n while (idx < len(src)):\n _idx:int = idx\n idx, _, values = self.root.find_longest(src, idx)\n assert(idx != _idx)\n _, token = next(iter(values)) \n tokens.append(token)\n return tokens\n def decodeBytes(self, tokens):\n return b''.join(map(lambda i: self.idx2token[i], tokens))\n def encode(self, src):\n return self.encodeBytes(src.encode(\"utf-8\"))\n def decode(self, tokens):\n try:\n return self.decodeBytes(tokens).decode('utf-8')\n except:\n return '\\ufffd' # bad utf-8\n def printTokens(self, tokens):\n for i in tokens:\n s = self.idx2token[i]\n try:\n s = s.decode('utf-8')\n except:\n pass\n print(f'{repr(s)}{i}', end=' ')\n print()", + "type": "code", + "location": "/RWKV-v5/tokenizer/rwkv_tokenizer.py:68-103" + }, + "1749": { + "file_id": 60, + "content": "This code defines a tokenizer class that can encode and decode bytes using a trie data structure. 
The encodeBytes method converts input bytes to tokens, while the decodeBytes method reconverts tokens back into bytes. The encode and decode methods handle Unicode strings. The printTokens method prints the tokens along with their indices.", + "type": "comment" + }, + "1750": { + "file_id": 61, + "content": "/RWKV-v5/train.py", + "type": "filepath" + }, + "1751": { + "file_id": 61, + "content": "The code initializes RWKV Language Model, uses Pytorch Lightning for training and handles command line arguments. It optimizes performance by loading checkpoints, handling exceptions, setting trainer parameters and using Deepspeed optimization.", + "type": "summary" + }, + "1752": { + "file_id": 61, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport logging\nlogging.basicConfig(level=logging.INFO)\nif __name__ == \"__main__\":\n from argparse import ArgumentParser\n from pytorch_lightning import Trainer\n from pytorch_lightning.utilities import rank_zero_info, rank_zero_only\n import pytorch_lightning as pl\n rank_zero_info(\"########## work in progress ##########\")\n parser = ArgumentParser()\n parser.add_argument(\"--load_model\", default=\"\", type=str) # full path, with .pth\n parser.add_argument(\"--wandb\", default=\"\", type=str) # wandb project name. 
if \"\" then don't use wandb\n parser.add_argument(\"--proj_dir\", default=\"out\", type=str)\n parser.add_argument(\"--random_seed\", default=\"-1\", type=int)\n parser.add_argument(\"--data_file\", default=\"\", type=str)", + "type": "code", + "location": "/RWKV-v5/train.py:1-23" + }, + "1753": { + "file_id": 61, + "content": "This code initializes the RWKV Language Model, sets up logging, and parses command-line arguments for loading a model, using Wandb, project directory, random seed, and data file. It uses Pytorch Lightning framework for training.", + "type": "comment" + }, + "1754": { + "file_id": 61, + "content": " parser.add_argument(\"--data_type\", default=\"utf-8\", type=str)\n parser.add_argument(\"--vocab_size\", default=0, type=int) # vocab_size = 0 means auto (for char-level LM and .txt data)\n parser.add_argument(\"--ctx_len\", default=1024, type=int)\n parser.add_argument(\"--epoch_steps\", default=1000, type=int) # a mini \"epoch\" has [epoch_steps] steps\n parser.add_argument(\"--epoch_count\", default=500, type=int) # train for this many \"epochs\". will continue afterwards with lr = lr_final\n parser.add_argument(\"--epoch_begin\", default=0, type=int) # if you load a model trained for x \"epochs\", set epoch_begin = x\n parser.add_argument(\"--epoch_save\", default=5, type=int) # save the model every [epoch_save] \"epochs\"\n parser.add_argument(\"--micro_bsz\", default=12, type=int) # micro batch size (batch size per GPU)\n parser.add_argument(\"--n_layer\", default=6, type=int)\n parser.add_argument(\"--n_embd\", default=512, type=int)\n parser.add_argument(\"--dim_att\", default=0, type=int)", + "type": "code", + "location": "/RWKV-v5/train.py:24-36" + }, + "1755": { + "file_id": 61, + "content": "This code snippet is for argument parsing in the RWKV-LM/RWKV-v5/train.py file. 
It sets default values and types for various training parameters such as data type, vocab size, context length, epoch steps, number of epochs, initial epoch, epoch save frequency, micro batch size, number of layers, embedding dimension, and attention dimensionality.", + "type": "comment" + }, + "1756": { + "file_id": 61, + "content": " parser.add_argument(\"--dim_ffn\", default=0, type=int)\n parser.add_argument(\"--pre_ffn\", default=0, type=int) # replace first att layer by ffn (sometimes better)\n parser.add_argument(\"--head_qk\", default=0, type=int) # my headQK trick\n parser.add_argument(\"--tiny_att_dim\", default=0, type=int) # tiny attention dim\n parser.add_argument(\"--tiny_att_layer\", default=-999, type=int) # tiny attention @ which layer\n parser.add_argument(\"--lr_init\", default=6e-4, type=float) # 6e-4 for L12-D768, 4e-4 for L24-D1024, 3e-4 for L24-D2048\n parser.add_argument(\"--lr_final\", default=1e-5, type=float)\n parser.add_argument(\"--warmup_steps\", default=-1, type=int) # try 50 if you load a model\n parser.add_argument(\"--beta1\", default=0.9, type=float)\n parser.add_argument(\"--beta2\", default=0.99, type=float) # use 0.999 when your model is close to convergence\n parser.add_argument(\"--adam_eps\", default=1e-8, type=float)\n parser.add_argument(\"--grad_cp\", default=0, type=int) # gradient checkpt: saves VRAM, but slower", + "type": "code", + "location": "/RWKV-v5/train.py:37-49" + }, + "1757": { + "file_id": 61, + "content": "This code snippet is from the RWKV-LM's \"train.py\" file and it sets various arguments for model training, such as dimensionality of feedforward network layers (dim_ffn), replacing first attention layer by a feedforward network (pre_ffn), and tricks like my headQK trick (head_qk). 
The code also specifies parameters for the tiny attention dimension (tiny_att_dim) and layer (tiny_att_layer), learning rate initialization and final values (lr_init, lr_final), warm-up steps, optimizer parameters (beta1, beta2), and Adam epsilon (adam_eps). There's also an argument for gradient checkpointing to save VRAM at the cost of increased training time (grad_cp).", + "type": "comment" + }, + "1758": { + "file_id": 61, + "content": " parser.add_argument(\"--dropout\", default=0, type=float) # try 0.01 / 0.02 / 0.05 / 0.1\n parser.add_argument(\"--weight_decay\", default=0, type=float) # try 0.1 / 0.01 / 0.001\n parser.add_argument(\"--weight_decay_final\", default=-1, type=float)\n parser.add_argument(\"--my_pile_version\", default=1, type=int) # my special pile version\n parser.add_argument(\"--my_pile_stage\", default=0, type=int) # my special pile mode\n parser.add_argument(\"--my_pile_shift\", default=-1, type=int) # my special pile mode - text shift\n parser.add_argument(\"--my_pile_edecay\", default=0, type=int)\n parser.add_argument(\"--layerwise_lr\", default=1, type=int) # layerwise lr for faster convergence (but slower it/s)\n parser.add_argument(\"--ds_bucket_mb\", default=200, type=int) # deepspeed bucket size in MB. 200 seems enough\n # parser.add_argument(\"--cuda_cleanup\", default=0, type=int) # extra cuda cleanup (sometimes helpful)\n parser.add_argument(\"--my_sample_len\", default=0, type=int)\n parser.add_argument(\"--my_ffn_shift\", default=1, type=int)", + "type": "code", + "location": "/RWKV-v5/train.py:50-63" + }, + "1759": { + "file_id": 61, + "content": "This code snippet is using argparse in Python to define various command-line arguments for a machine learning model. These arguments control features such as dropout rate, weight decay, and additional customizable settings like text shift, sample length, and more. 
Different values can be tried to optimize the performance of the model during training.", + "type": "comment" + }, + "1760": { + "file_id": 61, + "content": " parser.add_argument(\"--my_att_shift\", default=1, type=int)\n parser.add_argument(\"--head_size_a\", default=64, type=int) # can try larger values for larger models\n parser.add_argument(\"--head_size_divisor\", default=8, type=int)\n parser.add_argument(\"--my_pos_emb\", default=0, type=int)\n parser.add_argument(\"--load_partial\", default=0, type=int)\n parser.add_argument(\"--magic_prime\", default=0, type=int)\n parser.add_argument(\"--my_qa_mask\", default=0, type=int)\n parser.add_argument(\"--my_random_steps\", default=0, type=int)\n parser.add_argument(\"--my_testing\", default='', type=str)\n parser.add_argument(\"--my_exit\", default=99999999, type=int)\n parser.add_argument(\"--my_exit_tokens\", default=0, type=int)\n if pl.__version__[0]=='2':\n parser.add_argument(\"--accelerator\", default=\"gpu\", type=str)\n parser.add_argument(\"--strategy\", default=\"auto\", type=str)\n parser.add_argument(\"--devices\", default=1, type=int)\n parser.add_argument(\"--num_nodes\", default=1, type=int)", + "type": "code", + "location": "/RWKV-v5/train.py:64-80" + }, + "1761": { + "file_id": 61, + "content": "This code snippet adds command line arguments to a parser for various settings and configurations in the RWKV-v5 model training. It includes options like my_att_shift, head_size_a, head_size_divisor, my_pos_emb, load_partial, magic_prime, my_qa_mask, my_random_steps, my_testing, my_exit, my_exit_tokens, accelerator, strategy, devices and num_nodes. 
The code also checks if the Python version is 2, in which case it adds arguments for accelerator, strategy, devices and num_nodes.", + "type": "comment" + }, + "1762": { + "file_id": 61, + "content": " parser.add_argument(\"--precision\", default=\"fp16\", type=str)\n parser.add_argument(\"--accumulate_grad_batches\", default=1, type=int)\n else:\n parser = Trainer.add_argparse_args(parser)\n args = parser.parse_args()\n ########################################################################################################\n import os, warnings, math, datetime, sys, time\n import numpy as np\n import torch\n from torch.utils.data import DataLoader\n if \"deepspeed\" in args.strategy:\n import deepspeed\n from pytorch_lightning import seed_everything\n if args.random_seed >= 0:\n print(f\"########## WARNING: GLOBAL SEED {args.random_seed} THIS WILL AFFECT MULTIGPU SAMPLING ##########\\n\" * 3)\n seed_everything(args.random_seed)\n np.set_printoptions(precision=4, suppress=True, linewidth=200)\n warnings.filterwarnings(\"ignore\", \".*Consider increasing the value of the `num_workers` argument*\")\n warnings.filterwarnings(\"ignore\", \".*The progress bar already tracks a metric with the*\")", + "type": "code", + "location": "/RWKV-v5/train.py:81-103" + }, + "1763": { + "file_id": 61, + "content": "This code is adding arguments to the argument parser, handling global seed, setting numpy print options, and filtering warnings. The \"--precision\" argument sets the precision type to \"fp16\", and \"--accumulate_grad_batches\" determines the number of gradient accumulation batches. 
The code also imports necessary libraries and handles warnings related to workers and metric tracking.", + "type": "comment" + }, + "1764": { + "file_id": 61, + "content": " # os.environ[\"WDS_SHOW_SEED\"] = \"1\"\n args.my_timestamp = datetime.datetime.today().strftime(\"%Y-%m-%d-%H-%M-%S\")\n args.enable_checkpointing = False\n args.replace_sampler_ddp = False\n args.logger = False\n args.gradient_clip_val = 1.0\n args.num_sanity_val_steps = 0\n args.check_val_every_n_epoch = int(1e20)\n args.log_every_n_steps = int(1e20)\n args.max_epochs = -1 # continue forever\n args.betas = (args.beta1, args.beta2)\n args.real_bsz = int(args.num_nodes) * int(args.devices) * args.micro_bsz\n os.environ[\"RWKV_MY_TESTING\"] = args.my_testing\n os.environ[\"RWKV_HEAD_SIZE_A\"] = str(args.head_size_a)\n if args.dim_att <= 0:\n args.dim_att = args.n_embd\n if args.dim_ffn <= 0:\n args.dim_ffn = int((args.n_embd * 3.5) // 32 * 32) # default = 3.5x emb size\n if args.data_type == \"wds_img\":\n args.run_name = f\"v{args.my_img_version}-{args.my_img_size}-{args.my_img_bit}bit-{args.my_img_clip}x{args.my_img_clip_scale}\"\n args.proj_dir = f\"{args.proj_dir}-{args.run_name}\"", + "type": "code", + "location": "/RWKV-v5/train.py:104-126" + }, + "1765": { + "file_id": 61, + "content": "This code sets various arguments for a training script. It enables continuous training (`args.max_epochs = -1`), disables checkpointing and logging, and adjusts the batch size based on the number of nodes and devices. 
It also configures the dimensions of certain layers and sets specific environment variables for the training process.", + "type": "comment" + }, + "1766": { + "file_id": 61, + "content": " else:\n args.run_name = f\"{args.vocab_size} ctx{args.ctx_len} L{args.n_layer} D{args.n_embd}\"\n if not os.path.exists(args.proj_dir):\n os.makedirs(args.proj_dir)\n if args.my_pile_stage > 0:\n magic_prime_bak = args.magic_prime\n if args.my_pile_shift < 0:\n args.my_pile_shift = 0\n if magic_prime_bak > 0:\n args.magic_prime = magic_prime_bak\n if args.my_qa_mask == 2:\n args.epoch_count = 2 * args.magic_prime // 40320\n else:\n args.epoch_count = args.magic_prime // 40320\n args.epoch_steps = 40320 // args.real_bsz\n assert args.epoch_steps * args.real_bsz == 40320\n # if args.my_pile_stage == 2:\n # assert args.lr_final == args.lr_init\n if args.my_pile_stage >= 2: # find latest saved model\n list_p = []\n for p in os.listdir(args.proj_dir):\n if p.startswith(\"rwkv\") and p.endswith(\".pth\"):\n p = ((p.split(\"-\"))[1].split(\".\"))[0]\n if p != \"final\":", + "type": "code", + "location": "/RWKV-v5/train.py:127-154" + }, + "1767": { + "file_id": 61, + "content": "This code sets the run name based on certain parameters, creates a project directory if it doesn't exist, and adjusts the magic prime and epoch count for specific stages. 
It also ensures that the number of epoch steps and batch size are correctly set, and finds the latest saved model in the specified project directory.", + "type": "comment" + }, + "1768": { + "file_id": 61, + "content": " if p == \"init\":\n p = -1\n else:\n p = int(p)\n list_p += [p]\n list_p.sort()\n max_p = list_p[-1]\n if len(list_p) > 1:\n args.my_pile_prev_p = list_p[-2] # in case max_p is corrupted\n if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n if args.warmup_steps < 0:\n if args.my_pile_stage == 2:\n args.warmup_steps = 10\n else:\n args.warmup_steps = 30\n args.epoch_begin = max_p + 1\n samples_per_epoch = args.epoch_steps * args.real_bsz\n tokens_per_epoch = samples_per_epoch * args.ctx_len\n try:\n deepspeed_version = deepspeed.__version__\n except:\n deepspeed_version = None\n pass\n rank_zero_info(", + "type": "code", + "location": "/RWKV-v5/train.py:155-182" + }, + "1769": { + "file_id": 61, + "content": "The code retrieves the maximum value from a list of integers and uses it to determine which model checkpoint file (rwkv-{max_p}.pth or rwkv-init.pth) to load, depending on whether the max value is -1 or not. It also calculates the number of samples and tokens for an epoch based on the provided arguments. 
The code attempts to retrieve the DeepSpeed version but handles exceptions if it fails.", + "type": "comment" + }, + "1770": { + "file_id": 61, + "content": " f\"\"\"\n############################################################################\n#\n# RWKV-5 {args.precision.upper()} on {args.num_nodes}x{args.devices} {args.accelerator.upper()}, bsz {args.num_nodes}x{args.devices}x{args.micro_bsz}={args.real_bsz}, {args.strategy} {'with grad_cp' if args.grad_cp > 0 else ''}\n#\n# Data = {args.data_file} ({args.data_type}), ProjDir = {args.proj_dir}\n#\n# Epoch = {args.epoch_begin} to {args.epoch_begin + args.epoch_count - 1} (will continue afterwards), save every {args.epoch_save} epoch\n#\n# Each \"epoch\" = {args.epoch_steps} steps, {samples_per_epoch} samples, {tokens_per_epoch} tokens\n#\n# Model = {args.n_layer} n_layer, {args.n_embd} n_embd, {args.ctx_len} ctx_len\n#\n# Adam = lr {args.lr_init} to {args.lr_final}, warmup {args.warmup_steps} steps, beta {args.betas}, eps {args.adam_eps}\n#\n# Found torch {torch.__version__}, recommend 1.13.1+cu117 or newer\n# Found deepspeed {deepspeed_version}, recommend 0.7.0 (faster than newer versions)\n# Found pytorch_lightning {pl.__version__}, recommend 1.9.5", + "type": "code", + "location": "/RWKV-v5/train.py:183-200" + }, + "1771": { + "file_id": 61, + "content": "This code block is displaying various configuration details of the RWKV-5 model, including the precision, number of nodes and devices used, batch size, data file and project directory. It also mentions the epoch range, saving frequency, steps per epoch, model architecture, learning rate schedule, Adam optimizer settings, and version information for Torch, Deepspeed, and PyTorch Lightning. 
The recommendation section advises using specific versions of these libraries for optimal performance.", + "type": "comment" + }, + "1772": { + "file_id": 61, + "content": "#\n############################################################################\n\"\"\"\n )\n rank_zero_info(str(vars(args)) + \"\\n\")\n assert args.data_type in [\"utf-8\", \"utf-16le\", \"numpy\", \"binidx\", \"dummy\", \"uint16\"]\n if args.lr_final == 0 or args.lr_init == 0:\n rank_zero_info(\"\\n\\nNote: lr_final = 0 or lr_init = 0. Using linear LR schedule instead.\\n\\n\")\n assert args.precision in [\"fp32\", \"tf32\", \"fp16\", \"bf16\"]\n os.environ[\"RWKV_FLOAT_MODE\"] = args.precision\n if args.precision == \"fp32\":\n for i in range(10):\n rank_zero_info(\"\\n\\nNote: you are using fp32 (very slow). Try bf16 / tf32 for faster training.\\n\\n\")\n if args.precision == \"fp16\":\n rank_zero_info(\"\\n\\nNote: you are using fp16 (might overflow). Try bf16 / tf32 for stable training.\\n\\n\")\n os.environ[\"RWKV_JIT_ON\"] = \"1\"\n if \"deepspeed_stage_3\" in args.strategy:\n os.environ[\"RWKV_JIT_ON\"] = \"0\"\n torch.backends.cudnn.benchmark = True\n torch.backends.cudnn.enabled = True\n if args.precision == \"fp32\":", + "type": "code", + "location": "/RWKV-v5/train.py:201-226" + }, + "1773": { + "file_id": 61, + "content": "This code segment sets up the training environment for the RWKV-v5 model. It checks the arguments provided, ensures correct data type and precision, handles special learning rate cases, and sets up appropriate configurations for faster and stable training. It also provides informative notes if using potentially slower or less stable precisions. 
Additionally, it enables CUDA features for improved performance.", + "type": "comment" + }, + "1774": { + "file_id": 61, + "content": " torch.backends.cudnn.allow_tf32 = False\n torch.backends.cuda.matmul.allow_tf32 = False\n else:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\n if \"32\" in args.precision:\n args.precision = 32\n elif args.precision == \"fp16\":\n args.precision = 16\n else:\n args.precision = \"bf16\"\n ########################################################################################################\n from src.trainer import train_callback, generate_init_weight\n from src.dataset import MyDataset\n train_data = MyDataset(args)\n args.vocab_size = train_data.vocab_size\n from src.model import RWKV\n model = RWKV(args)\n if len(args.load_model) == 0 or args.my_pile_stage == 1: # shall we build the initial weights?\n init_weight_name = f\"{args.proj_dir}/rwkv-init.pth\"\n generate_init_weight(model, init_weight_name) # save initial weights\n args.load_model = init_weight_name\n rank_zero_info(f\"########## Loading {args.load_model}... ##########\")", + "type": "code", + "location": "/RWKV-v5/train.py:227-256" + }, + "1775": { + "file_id": 61, + "content": "This code snippet is setting up the model training environment. It sets the CUDA backend allowances for TF32 and checks the precision argument (32, fp16, or bf16). It imports necessary modules like `train_callback`, `MyDataset` and `RWKV`. The code initializes a dataset instance, sets the vocab size based on it. Then it creates an RWKV model instance. If there's no pre-existing load model or if it's at the first stage of MyPile, it generates initial weights using `generate_init_weight`, saves them to a file and uses that file as the load model. 
Finally, it prints a status message about loading the specified model.", + "type": "comment" + }, + "1776": { + "file_id": 61, + "content": " try:\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n load_keys = list(load_dict.keys())\n for k in load_keys:\n if k.startswith('_forward_module.'):\n load_dict[k.replace('_forward_module.','')] = load_dict[k]\n del load_dict[k]\n except:\n rank_zero_info(f\"Bad checkpoint {args.load_model}\")\n if args.my_pile_stage >= 2: # try again using another checkpoint\n max_p = args.my_pile_prev_p\n if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n args.epoch_begin = max_p + 1\n rank_zero_info(f\"Trying {args.load_model}\")\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n if args.load_partial == 1:\n load_keys = load_dict.keys()\n for k in model.state_dict():\n if k not in load_keys:\n load_dict[k] = model.state_dict()[k]", + "type": "code", + "location": "/RWKV-v5/train.py:257-280" + }, + "1777": { + "file_id": 61, + "content": "This code attempts to load a checkpoint model from the specified file. It handles exceptions if the checkpoint is invalid and allows for loading partial models. If an issue occurs, it provides information about the bad checkpoint and tries again with a different one. 
The code also removes unnecessary keys starting with \"_forward_module.\"", + "type": "comment" + }, + "1778": { + "file_id": 61, + "content": " model.load_state_dict(load_dict)\n if pl.__version__[0]=='2':\n trainer = Trainer(accelerator=args.accelerator,strategy=args.strategy,devices=args.devices,num_nodes=args.num_nodes,precision=args.precision,\n logger=args.logger,callbacks=[train_callback(args)],max_epochs=args.max_epochs,check_val_every_n_epoch=args.check_val_every_n_epoch,num_sanity_val_steps=args.num_sanity_val_steps,\n log_every_n_steps=args.log_every_n_steps,enable_checkpointing=args.enable_checkpointing,accumulate_grad_batches=args.accumulate_grad_batches,gradient_clip_val=args.gradient_clip_val)\n else:\n trainer = Trainer.from_argparse_args(\n args,\n callbacks=[train_callback(args)],\n )\n if trainer.global_rank == 0:\n for n in model.state_dict():\n shape = model.state_dict()[n].shape\n shape = [i for i in shape if i != 1]\n if len(shape) > 1:\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {n}\")\n else:", + "type": "code", + "location": "/RWKV-v5/train.py:281-299" + }, + "1779": { + "file_id": 61, + "content": "The code is creating a trainer object for PyTorch Lightning, depending on the version of PyTorch. It loads the state dictionary into the model and initializes the trainer with provided arguments like accelerator, strategy, devices, etc. 
The code also checks the shape of parameters in the model's state_dict and prints them if the shape has more than one element.", + "type": "comment" + }, + "1780": { + "file_id": 61, + "content": " print(f\"{str(shape[0]).ljust(5)} {n}\")\n if \"deepspeed\" in args.strategy:\n trainer.strategy.config[\"zero_optimization\"][\"allgather_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n trainer.strategy.config[\"zero_optimization\"][\"reduce_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n # must set shuffle=False, persistent_workers=False (because worker is in another thread)\n data_loader = DataLoader(train_data, shuffle=False, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, drop_last=True)\n trainer.fit(model, data_loader)", + "type": "code", + "location": "/RWKV-v5/train.py:300-309" + }, + "1781": { + "file_id": 61, + "content": "This code prints the shape and number of training samples, configures Deepspeed optimization settings, sets up a DataLoader with specified parameters, and trains the model using the Deepspeed trainer.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/2.json b/docs/data/2.json new file mode 100644 index 00000000..e0ca0eb4 --- /dev/null +++ b/docs/data/2.json @@ -0,0 +1,544 @@ +{ + "200": { + "file_id": 8, + "content": "/RWKV-v2-RNN/src/model.py", + "type": "filepath" + }, + "201": { + "file_id": 8, + "content": "The code initializes the RWKV v2-RNN language model, optimizes it for performance, and utilizes CUDA for efficient computation. 
It implements time-decay parameters, defines a GPT model with RWKV blocks, and sets up optimization methods.", + "type": "summary" + }, + "202": { + "file_id": 8, + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.cpp_extension import load\nimport math\nimport numpy as np\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nlogger = logging.getLogger(__name__)\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = 1024 # increase this if your ctx_len > 1024\nB_GROUP_FORWARD = 4 # set to 8 for best performance\nB_GROUP_BACKWARD = 2 # set to 2 for best performance\ntimex_cuda = load(name=\"timex\", sources=[\"cuda/timex_op.cpp\", \"cuda/timex_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['--use_fast", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:1-23" + }, + "203": { + "file_id": 8, + "content": "The code imports necessary libraries and defines constants for the RWKV v2-RNN Language Model, which is a neural network language model. It loads a CUDA kernel called \"timex\" for efficient computation on GPUs using Torch's `torch.utils.cpp_extension.load` function. 
The constants T_MAX, B_GROUP_FORWARD, and B_GROUP_BACKWARD are set to optimize performance.", + "type": "comment" + }, + "204": { + "file_id": 8, + "content": "_math', '--extra-device-vectorization', f'-DTmax={T_MAX}', f'-DBF={B_GROUP_FORWARD}', f'-DBB={B_GROUP_BACKWARD}'])\nclass TimeX(torch.autograd.Function):\n @staticmethod\n def forward(ctx, w, k, B, C, T, eps):\n ctx.B = B\n ctx.C = C\n ctx.T = T\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w = w.contiguous()\n k = k.contiguous()\n ctx.save_for_backward(w, k)\n wk = torch.empty((B, C, T), device='cuda',\n memory_format=torch.contiguous_format)\n timex_cuda.forward(w, k, wk, eps, B, C, T)\n return wk\n @staticmethod\n def backward(ctx, gwk):\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w, k = ctx.saved_tensors\n gw = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda',\n memory_format=torch.contiguous_format)\n gk = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda',", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:23-47" + }, + "205": { + "file_id": 8, + "content": "Defines a TimeX class that implements the Time-X function using CUDA for efficient computation. The class takes in weights (w), kernel (k), batch size (B), number of channels (C), sequence length (T), and epsilon (eps) as input, and returns the output tensor (wk). 
It also ensures all parameters meet certain conditions before forwarding and backward propagation.", + "type": "comment" + }, + "206": { + "file_id": 8, + "content": " memory_format=torch.contiguous_format)\n timex_cuda.backward(w, k, gwk.contiguous(), gw,\n gk, ctx.B, ctx.C, ctx.T)\n return (gw.sum(dim=0), gk, None, None, None, None)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\nRWKV_K_CLAMP = 60 # e^60 = 1e26\nRWKV_K_EPS = 1e-16\nRWKV_HEAD_QK_DIM = 256\ndef RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module\n for m in module.modules():\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n with torch.no_grad():\n name = '[unknown weight]'\n for name, parameter in module.named_parameters(): # find the name of the weight\n if id(m.weight) == id(parameter):\n break\n shape = m.weight.data.shape", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:48-73" + }, + "207": { + "file_id": 8, + "content": "This code initializes the RWKV model by setting specific parameters and configurations for each layer. It uses fancy initialization to set the weights of linear and embedding layers in the module. 
The code also defines constants like RWKV_K_CLAMP, RWKV_K_EPS, and RWKV_HEAD_QK_DIM for further calculations.", + "type": "comment" + }, + "208": { + "file_id": 8, + "content": " gain = 1.0\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb?\n scale = 1e-4\n else:\n scale = 0\n if isinstance(m, nn.Linear):\n if m.bias is not None:\n m.bias.data.zero_()\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection?\n scale = 0.5\n if hasattr(m, 'scale_init'):\n scale = m.scale_init\n # print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name)\n gain *= scale\n if scale == -999:\n nn.init.eye_(m.weight)\n elif gain == 0:\n # zero init is great for some RWKV matrices", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:74-101" + }, + "209": { + "file_id": 8, + "content": "The code adjusts the gain and scale of layer weights in a neural network model, depending on the type and shape of the layer. It initializes embeddings with a small scale and linear layers with zero or identity matrices, based on specific conditions. 
The final projection has a different scale, while zero initialization is used if the scale is set to -999 and gain is 0.", + "type": "comment" + }, + "210": { + "file_id": 8, + "content": " nn.init.zeros_(m.weight)\n elif gain > 0:\n nn.init.orthogonal_(m.weight, gain=gain)\n else:\n nn.init.normal_(m.weight, mean=0.0, std=-scale)\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_embd = config.n_embd\n attn_sz = config.n_embd\n ############# fancy init of time_w curves ###################################\n f1_begin = 3.0\n f1_end = 1.2\n f2_begin = 0.65\n f2_end = 0.4\n with torch.no_grad(): # initial time_w curves for better convergence\n decay_speed = torch.ones(attn_sz, 1)\n first_sa_layer_id = 1\n for h in range(attn_sz):\n f1 = f1_begin + (layer_id-first_sa_layer_id) / \\\n (config.n_layer-1-first_sa_layer_id) * (f1_end - f1_begin)\n f2 = f2_begin + (layer_id-first_sa_layer_id) / \\", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:102-129" + }, + "211": { + "file_id": 8, + "content": "This code initializes the time_w curves for a RWKV_TimeMix module in the model. It uses different initialization methods depending on the gain value. If the gain is zero, it initializes the weight as zeros. If the gain is positive, it initializes the weight with orthogonal values. If the gain is negative, it initializes the weight with normal distribution mean 0 and standard deviation of -scale. 
The time_w curves are initialized for better convergence using a decay speed variable and calculating f1 and f2 based on layer ID.", + "type": "comment" + }, + "212": { + "file_id": 8, + "content": " (config.n_layer-1-first_sa_layer_id) * (f2_end - f2_begin)\n if layer_id == first_sa_layer_id:\n f1 += 0.5\n if layer_id == config.n_layer-2:\n f2 = 0.4\n if layer_id == config.n_layer-1:\n f2 = 0.37\n decay_speed[h][0] = math.pow(f2, h / (attn_sz-1) * 7) * f1\n self.time_decay = nn.Parameter(torch.log(decay_speed)) # will use exp(self.time_decay) to ensure time_decay > 0\n self.time_curve = torch.tensor(\n [-(config.ctx_len - 2 - i) for i in range(config.ctx_len-1)]).unsqueeze(0)\n self.time_curve = self.time_curve.to('cuda')\n self.time_first = nn.Parameter(torch.ones(attn_sz, 1) * math.log(0.3))\n #############################################################################\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # init to \"shift half of the channels\"\n ww = torch.ones(1, 1, config.n_embd)", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:130-147" + }, + "213": { + "file_id": 8, + "content": "This code initializes the time-decay and related parameters for a transformer model. It sets `self.time_decay` as a learnable parameter, calculates `self.time_curve`, and initializes `self.time_first` and `self.time_shift`. 
The decay speed is adjusted based on the layer id to control how quickly time decays in the attention mechanism.", + "type": "comment" + }, + "214": { + "file_id": 8, + "content": " for i in range(config.n_embd // 2):\n ww[0, 0, i] = 0\n self.time_mix = nn.Parameter(ww)\n self.key = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.value = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.output = nn.Linear(attn_sz, config.n_embd, bias=False)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0\n def forward(self, x):\n B, T, C = x.size()\n x = x * self.time_mix + self.time_shift(x) * (1 - self.time_mix)\n k = self.key(x).transpose(-1, -2)\n v = self.value(x).transpose(-1, -2)\n r = self.receptance(x)\n # RWKV_K_CLAMP can be removed if the CUDA kernel substracts the correct k_max for each k (I will do this later)\n k = torch.clamp(k, max=RWKV_K_CLAMP)\n k = torch.exp(k)\n kv = k * v\n self.time_w = torch.cat(\n [torch.exp(self.time_decay) * self.time_curve, self.time_first], dim=-1)", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:148-177" + }, + "215": { + "file_id": 8, + "content": "This code initializes a model for the RWKV-v2 architecture. It defines the time_mix parameter, and several linear layers (key, value, receptance, output). 
The forward function applies these layers to input x, scales key and receptance to zero, clamps key within certain bounds, exponentials it, and performs a weighted sum with value before returning the result.", + "type": "comment" + }, + "216": { + "file_id": 8, + "content": " w = torch.exp(self.time_w)\n wkv = TimeX.apply(w, kv, B, C, T, 0)\n # RWKV_K_EPS can be removed if the CUDA kernel sets 0/0 = 0 (I will do this later)\n wk = TimeX.apply(w, k, B, C, T, RWKV_K_EPS)\n rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\n rwkv = self.output(rwkv)\n return rwkv\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # init to \"shift half of the channels\"\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd // 2):\n x[0, 0, i] = 0\n self.time_mix = nn.Parameter(x)\n hidden_sz = 4 * config.n_embd\n self.key = nn.Linear(config.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, config.n_embd, bias=False)\n self.value.scale_init = 0", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:178-207" + }, + "217": { + "file_id": 8, + "content": "This code defines a RWKV Channel Mix module for a transformer model. It initializes channel mixing parameters, applies time shifting and mixing operations, and then performs a key-value attention mechanism to produce the final output. 
The TimeX class is applied to compute weighting factors based on time steps.", + "type": "comment" + }, + "218": { + "file_id": 8, + "content": " self.receptance.scale_init = 0\n def forward(self, x):\n x = x * self.time_mix + self.time_shift(x) * (1 - self.time_mix)\n k = self.key(x)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(x)) * kv\n return rkv\n########################################################################################################\n# The GPT Model with our blocks\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):\n self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:208-242" + }, + "219": { + "file_id": 8, + "content": "The code is defining a GPT model with RWKV blocks. It includes an initialization for the receptance scale, forward function to process input data, and a class for the GPTConfig and Block modules. 
The RWKV-ffnPre model type initializes additional layers in the first layer.", + "type": "comment" + }, + "220": { + "file_id": 8, + "content": " self.ffnPre = RWKV_ChannelMix(config, layer_id+1000)\n else:\n self.att = RWKV_TimeMix(config, layer_id)\n self.ffn = RWKV_ChannelMix(config, layer_id)\n def forward(self, x):\n x = self.ln1(x)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(x) # better in some cases\n else:\n x = x + self.att(x)\n x = self.ln2(x)\n x = x + self.ffn(x)\n return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.step = 0\n self.config = config\n self.emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i)\n for i in range(config.n_layer)])\n self.ln_out = nn.LayerNorm(config.n_embd)\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:243-275" + }, + "221": { + "file_id": 8, + "content": "The code defines a GPT model with layers and a forward function. It initializes an embedding layer, a sequence of blocks, and output and attention layers. The forward function applies layer normalization and feeds the input through the attention and feed-forward networks. 
If the layer_id is 0 and the config's model_type is 'RWKV-ffnPre', it adds the ffnPre to the input for better performance in some cases.", + "type": "comment" + }, + "222": { + "file_id": 8, + "content": " self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n RWKV_Init(self, config)\n logger.info(\"number of parameters: %e\", sum(p.numel()\n for p in self.parameters()))\n def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, (nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=1e-5)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n # separate out all parameters to those that will and won't experience regularizing weight decay\n decay = set()\n no_decay = set()\n for mn, m in self.named_modules(): # here we disable weight_decay", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:276-304" + }, + "223": { + "file_id": 8, + "content": "This code defines a model class with initialization and optimization configuration methods. It initializes the linear layer, copy mask, and registers buffers for the model. It also sets the context length (ctx_len) and calls RWKV_Init function to initialize other parameters. The _init_weights method is used to set weights in the layers. 
Finally, the configure_optimizers method separates model parameters into those with and without weight decay for optimization.", + "type": "comment" + }, + "224": { + "file_id": 8, + "content": " for pn, p in m.named_parameters():\n fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n no_decay.add(fpn)\n param_dict = {pn: p for pn, p in self.named_parameters()}\n inter_params = decay & no_decay\n union_params = decay | no_decay\n assert len(\n inter_params) == 0, \"parameters %s made it into both decay/no_decay sets!\" % (str(inter_params), )\n assert len(param_dict.keys() - union_params) == 0, \"parameters %s were not separated into either decay/no_decay set!\" \\\n % (str(param_dict.keys() - union_params), )\n optim_groups = [\n {\"params\": [param_dict[pn]\n for pn in sorted(list(no_decay))], \"weight_decay\": 0.0},\n ]\n optimizer = torch.optim.Adam(\n optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n self.step += 1\n B, T = idx.size()", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:305-329" + }, + "225": { + "file_id": 8, + "content": "Looping through model parameters, separating them into decay and no_decay groups. 
Creating optimizer groups for no_decay params with weight_decay=0, then initializing an Adam optimizer for training.", + "type": "comment" + }, + "226": { + "file_id": 8, + "content": " assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).float()\n x = self.head(x) + c\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1))\n return x, loss", + "type": "code", + "location": "/RWKV-v2-RNN/src/model.py:330-349" + }, + "227": { + "file_id": 8, + "content": "This code is part of a model's forward pass. It checks if the input length (T) is within the model's context length, embeds the input, passes it through multiple blocks, applies layer normalization, and performs attention calculations for query and key tensors. If targets are provided, it calculates the cross-entropy loss.", + "type": "comment" + }, + "228": { + "file_id": 9, + "content": "/RWKV-v2-RNN/src/model_run.py", + "type": "filepath" + }, + "229": { + "file_id": 9, + "content": "The code initializes a language generation model class using layer normalization and attention, applying iterative layers with time decay to generate future sequence tokens. 
A context vector is calculated via matrix multiplications and added to the input list at corresponding indices.", + "type": "summary" + }, + "230": { + "file_id": 9, + "content": "import types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nRWKV_K_CLAMP = 60\nRWKV_K_EPS = 1e-16\nRWKV_HEAD_QK_DIM = 256\nDEBUG_TIME = False # True False - show trained time-coeffs\nclass RWKV_RNN():\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len):\n self.RUN_DEVICE = RUN_DEVICE\n self.model_type = model_type\n self.n_layer = n_layer\n self.n_embd = n_embd\n self.ctx_len = ctx_len\n self.w = types.SimpleNamespace()\n w = torch.load(MODEL_NAME + '.pth',\n map_location=torch.device(RUN_DEVICE))\n for x in w.keys():\n if '.time_' in x:\n w[x] = w[x].squeeze()\n if '.time_decay' in x:\n w[x] = torch.exp(-torch.exp(w[x]))\n if '.time_first' in x:\n w[x] = torch.exp(w[x])\n if DEBUG_TIME and '.time_' in x:\n print(x, w[x].squeeze().cpu().numpy())\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):", + "type": "code", + "location": "/RWKV-v2-RNN/src/model_run.py:1-37" + }, + "231": { + "file_id": 9, + "content": "The code initializes a RWKV_RNN class. It takes input parameters such as MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, and ctx_len. The class loads the weights from a .pth file located at MODEL_NAME+'.pth' using map_location set to RUN_DEVICE. It performs certain operations on the loaded weights for time-coefficients and assigns them to corresponding attributes within the class. 
If DEBUG_TIME is True, it prints the updated time coefficients.", + "type": "comment" + }, + "232": { + "file_id": 9, + "content": " if xx[i].isdigit():\n ii = int(xx[i])\n if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):\n if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.clear()\n def clear(self):\n self.xx = {}\n self.aa = {}\n self.bb = {}\n self.hk = None\n def save(self, target):\n target.xx = copy.deepcopy(self.xx)\n target.aa = copy.deepcopy(self.aa)\n target.bb = copy.deepcopy(self.bb)\n target.hk = copy.deepcopy(self.hk)\n def load(self, target):\n self.xx = copy.deepcopy(target.xx)\n self.aa = copy.deepcopy(target.aa)", + "type": "code", + "location": "/RWKV-v2-RNN/src/model_run.py:38-69" + }, + "233": { + "file_id": 9, + "content": "This code represents a data structure that allows for nested dictionary-like storage with optional object types. It includes functions to clear, save, and load the data structure. The `clear` function resets all stored variables to empty states, while `save` and `load` allow for copying state between instances of this data structure. 
The code uses a combination of dictionaries and simple namespaces to handle various types of values within the storage.", + "type": "comment" + }, + "234": { + "file_id": 9, + "content": " self.bb = copy.deepcopy(target.bb)\n self.hk = copy.deepcopy(target.hk)\n def LN(self, xx, w):\n return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)\n def FF(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n x = xx * w.time_mix + self.xx[name] * (1 - w.time_mix)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ x)\n k = torch.square(torch.relu(w.key.weight @ x))\n kv = w.value.weight @ k\n return r * kv\n def SA(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n x = xx * w.time_mix + self.xx[name] * (1 - w.time_mix)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ x)\n k = torch.exp(torch.clamp(w.key.weight @ x, max=RWKV_K_CLAMP))", + "type": "code", + "location": "/RWKV-v2-RNN/src/model_run.py:70-98" + }, + "235": { + "file_id": 9, + "content": "This code defines methods for a neural network model. It includes deep copying of variables, layer normalization, feed-forward operation, and scaled attention operation. The LN method performs layer normalization on input tensor xx using the weights w. The FF method applies a feed-forward operation to the input tensor xx using the weights w and stores intermediate results. 
The SA method applies the scaled attention operation to the input tensor xx using the weights w and stores intermediate results.", + "type": "comment" + }, + "236": { + "file_id": 9, + "content": " v = w.value.weight @ x\n kv = k * v\n a = self.aa[name] + w.time_first * kv\n b = self.bb[name] + w.time_first * k\n self.aa[name] = w.time_decay * self.aa[name] + kv\n self.bb[name] = w.time_decay * self.bb[name] + k\n rwkv = r * a / (b + RWKV_K_EPS)\n return w.output.weight @ rwkv\n def run(self, ctx):\n w = self.w\n x = w.emb.weight[ctx[-1]]\n for i in range(self.n_layer):\n x = self.LN(x, w.blocks[i].ln1)\n if i == 0 and self.model_type == 'RWKV-ffnPre':\n x = x + self.FF(x, w.blocks[i].ffnPre, f'ffnPre.{i}')\n else:\n x = x + self.SA(x, w.blocks[i].att, f'att.{i}')\n x = self.LN(x, w.blocks[i].ln2)\n x = x + self.FF(x, w.blocks[i].ffn, f'ffn.{i}')\n x = self.LN(x, w.ln_out)\n if self.hk == None:\n self.hk = (w.head_k.weight @ x).unsqueeze(0)\n else:\n self.hk = torch.cat(\n [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0)", + "type": "code", + "location": "/RWKV-v2-RNN/src/model_run.py:99-130" + }, + "237": { + "file_id": 9, + "content": "This code is implementing the RWKV model for language generation. It applies layer normalization, self-attention, and feed-forward layers iteratively to generate output. The time decay mechanism is used to update the internal states of the model. 
Additionally, the code initializes the head keys for generating future sequence tokens.", + "type": "comment" + }, + "238": { + "file_id": 9, + "content": " if self.hk.shape[0] > self.ctx_len:\n self.hk = self.hk[-self.ctx_len:, :]\n q = w.head_q.weight @ x\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n c = (self.hk @ q) / RWKV_HEAD_QK_DIM\n for i in range(len(c)):\n x[ctx[i]] += c[i]\n return x", + "type": "code", + "location": "/RWKV-v2-RNN/src/model_run.py:131-143" + }, + "239": { + "file_id": 9, + "content": "This code snippet reshapes the `hk` variable if its shape exceeds the context length, then performs matrix multiplications to calculate a context vector, and finally adds this vector to the input list at corresponding indices.", + "type": "comment" + }, + "240": { + "file_id": 10, + "content": "/RWKV-v2-RNN/src/trainer.py", + "type": "filepath" + }, + "241": { + "file_id": 10, + "content": "TrainerConfig class establishes training parameters, GPU support, Wandb setup, initializes device, defines epoch function, creates optimizer, sets DataLoader and trains model. Code handles learning rate multiplier update, progress logging, and moving averages for loss calculation in specified epochs loop.", + "type": "summary" + }, + "242": { + "file_id": 10, + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.data.dataloader import DataLoader\nfrom torch.optim.lr_scheduler import LambdaLR\nfrom torch.nn import functional as F\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch\nfrom tqdm.auto import tqdm\nimport numpy as np\nimport logging\nimport os\nimport datetime\nimport sys\nimport math\n# import wandb # comment this if you don't have wandb\n# print('logging to wandb... 
(comment it if you don\\'t have wandb)')\nlogger = logging.getLogger(__name__)\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nlog_file = open(\"mylog.txt\", \"a\")\nclass TrainerConfig:\n max_epochs = 10\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:1-36" + }, + "243": { + "file_id": 10, + "content": "TrainerConfig class initializes model training parameters, setting maximum epochs to 10, batch size to 64, learning rate to 4e-4, gradient norm clipping threshold to 1.0, and optimizer betas and eps for Adam optimizer. The code also sets up environment for efficient GPU usage with CUDA and TF32 support.", + "type": "comment" + }, + "244": { + "file_id": 10, + "content": " lr_decay = True # linear warmup followed by cosine decay\n warmup_tokens = 0\n final_tokens = 0\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Trainer:\n def __init__(self, model, train_dataset, test_dataset, config):\n self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.steps = 0\n if 'wandb' in sys.modules:\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' +\n datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)\n self.device = 'cpu'\n if torch.cuda.is_available(): # take over whatever gpus are on the system", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:37-67" + }, + "245": { + "file_id": 10, + "content": "The code snippet represents the initialisation of a Trainer class. 
It accepts a model, train and test datasets, and a configuration. If wandb (a popular ML experiment tracking tool) is available, it combines the config with the model's config and initiates an experiment with a specific project name and timestamped run name. The device is set to 'cpu', but if GPUs are available, they will be used.", + "type": "comment" + }, + "246": { + "file_id": 10, + "content": " self.device = torch.cuda.current_device()\n def get_run_name(self):\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \\\n cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd)\n return run_name\n def train(self):\n model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n if config.num_workers > 0:\n loader = DataLoader(data, shuffle=False, pin_memory=True,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n else:", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:68-92" + }, + "247": { + "file_id": 10, + "content": "The code initializes the device, retrieves the run name based on model configuration, and defines a function to run an epoch. 
It also creates an optimizer for the model and sets up a data loader if necessary, all within the context of training the model.", + "type": "comment" + }, + "248": { + "file_id": 10, + "content": " loader = DataLoader(data, shuffle=False,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n pbar = tqdm(enumerate(loader), total=len(\n loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n for it, (x, y) in pbar:\n x = x.to(self.device) # place data on the correct device\n y = y.to(self.device)\n with torch.set_grad_enabled(is_train):\n _, loss = model(x, y) # forward the model\n if is_train: # backprop and update the parameters\n model.zero_grad()\n loss.backward()\n if config.grad_norm_clip > 0:\n torch.nn.utils.clip_grad_norm_(\n model.parameters(), config.grad_norm_clip)\n optimizer.step()\n if config.lr_decay: # decay the learning rate based on our progress", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:93-117" + }, + "249": { + "file_id": 10, + "content": "This code creates a DataLoader for iterating through data with shuffling disabled, and sets the batch size and number of workers according to config settings. It then initializes a progress bar (pbar) to track the progress through the loader. In training mode, it loops through each iteration (it, x, y), moves tensors to the device, forwards model, zeroes gradients, backprops, applies gradient clipping if specified, and steps the optimizer. If learning rate decay is enabled, it also decays the learning rate based on progress.", + "type": "comment" + }, + "250": { + "file_id": 10, + "content": " # number of tokens processed this step (i.e. 
label is not -100)\n self.tokens += (y >= 0).sum()\n lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + \\\n (1 - lr_final_factor) * float(self.tokens) / \\\n float(config.warmup_tokens)\n progress = 0\n else:\n # cosine learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(\n max(1, config.final_tokens - config.warmup_tokens))\n lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor /\n 2) * math.cos(math.pi * progress) # better 1.0 ~ 0.1", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:118-132" + }, + "251": { + "file_id": 10, + "content": "This code calculates the learning rate multiplier based on the number of tokens processed. If fewer than warmup_tokens, uses linear warmup; otherwise, applies cosine learning rate decay. The learning rate is adjusted according to the current token count and configuration parameters.", + "type": "comment" + }, + "252": { + "file_id": 10, + "content": " lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n else:\n lr = config.learning_rate\n now_loss = loss.item() # report progress\n self.lr = lr\n if 'wandb' in sys.modules:\n wandb.log({\"loss\": now_loss},\n step=self.steps * self.config.batch_size)\n self.steps += 1\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * \\\n (1.0 - factor) + now_loss * factor\n pbar.set_description(\n f\"mini-epoch {epoch+1} prog {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}\")", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:133-154" + }, + "253": { + "file_id": 10, + "content": "Updates learning rate based on config. Sets the learning rate for optimizer parameter groups and logs loss, steps, average loss, progress, and learning rate using WandB. 
Calculates and updates average loss using a moving average factor. Updates progress bar description with mini-epoch, progress percentage, perplexity, current loss, and learning rate in exponential format.", + "type": "comment" + }, + "254": { + "file_id": 10, + "content": " self.tokens = 0 # counter used for learning rate decay\n for epoch in range(config.max_epochs):\n run_epoch('train')\n log_file.write(\n f'{epoch+1} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} \\n')\n log_file.flush()\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n # DataParallel wrappers keep raw model object in .module\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n torch.save(raw_model.state_dict(),\n self.config.epoch_save_path + str(epoch+1) + '.pth')", + "type": "code", + "location": "/RWKV-v2-RNN/src/trainer.py:156-170" + }, + "255": { + "file_id": 10, + "content": "The code initializes a tokens counter and loops over the specified number of epochs. For each epoch, it runs the 'train' function, logs information to a file, saves the model's state if necessary (based on config), and flushes the log file.", + "type": "comment" + }, + "256": { + "file_id": 11, + "content": "/RWKV-v2-RNN/src/utils.py", + "type": "filepath" + }, + "257": { + "file_id": 11, + "content": "This code defines a Dataset class for the RWKV v2-RNN Language Model, creating token lists and storing them in \"vocab.json\". 
It provides functions to convert input data into tokens using a pre-defined vocabulary, supports random access to data with fixed epoch length, initializes a word table, sets vocabulary size, establishes dictionaries, refines context, samples logits based on input, sorts probabilities, breaks loop at top_p, calculates cutoff value and modifies probabilities, applies optional temperature parameter, and returns a sample using multinomial sampling.", + "type": "summary" + }, + "258": { + "file_id": 11, + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport json\nimport random\nimport time\nimport math\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset\nclass Dataset(Dataset):\n def __init__(self, data, ctx_len, epoch_length_fixed):\n print('building token list...', end=' ')\n unique = sorted(list(set(data)))\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n data_size, vocab_size = len(data), len(unique)", + "type": "code", + "location": "/RWKV-v2-RNN/src/utils.py:1-33" + }, + "259": { + "file_id": 11, + "content": "This code defines a custom Dataset class for the RWKV v2-RNN Language Model. It builds a token list from input data, stores it in \"vocab.json\", and calculates the dataset size and vocabulary size.", + "type": "comment" + }, + "260": { + "file_id": 11, + "content": " print('data has %d tokens, %d unique.' 
% (data_size, vocab_size))\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n self.ctx_len = ctx_len\n self.epoch_length_fixed = epoch_length_fixed\n self.vocab_size = vocab_size\n self.data = data\n def __len__(self):\n return self.epoch_length_fixed\n def __getitem__(self, idx):\n # cheat: pick a random spot in dataset\n i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))\n chunk = self.data[i:i+self.ctx_len+1]\n dix = [self.stoi[s] for s in chunk]\n x = torch.tensor(dix[:-1], dtype=torch.long,\n device=torch.device('cuda'))\n y = torch.tensor(dix[1:], dtype=torch.long,\n device=torch.device('cuda'))\n return x, y\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:", + "type": "code", + "location": "/RWKV-v2-RNN/src/utils.py:34-59" + }, + "261": { + "file_id": 11, + "content": "This code defines a class for data processing and loading, with functions to convert input data into tokens using a pre-defined vocabulary. It also provides random access to the data in fixed epoch length. 
The TOKENIZER class is initialized with a WORD_NAME file path and an UNKNOWN_CHAR placeholder.", + "type": "comment" + }, + "262": { + "file_id": 11, + "content": " self.word_table = json.load(result_file)\n self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(torch.tensor(out), dim=-1)\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual", + "type": "code", + "location": "/RWKV-v2-RNN/src/utils.py:60-90" + }, + "263": { + "file_id": 11, + "content": "The code initializes a word table, sets vocabulary size, establishes string-to-int and int-to-string dictionaries, defines an UNKNOWN_CHAR, refines context by stripping whitespace and special characters, and samples logits based on input while applying softmax function and considering different top_p values for newline characters.", + "type": "comment" + }, + "264": { + "file_id": 11, + "content": " sorted_probs, s_index = torch.sort(probs, descending=True)\n # for j in range(30):\n # pp = sorted_probs[j].item()\n # if pp < 0.005:\n # break\n # ss = self.itos[int(s_index[j])].replace('\\n','_')\n # print(f'{math.floor(pp*100):>3.0f}{ss}', end='')\n # print('')\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n # 
print(\"[\" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + \"]\", end = \"\")\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n return torch.multinomial(probs, num_samples=1)[0]\ndef to_float(x):\n return x.cpu().detach().numpy().flatten()[0].astype(float)\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)", + "type": "code", + "location": "/RWKV-v2-RNN/src/utils.py:92-122" + }, + "265": { + "file_id": 11, + "content": "Sorts probabilities and breaks loop when cumulative probability exceeds top_p. Calculates a cutoff value based on the sorted probabilities and sets low probabilities to 0. Optionally applies temperature parameter. Returns a single sample from the modified probabilities using multinomial sampling.", + "type": "comment" + }, + "266": { + "file_id": 12, + "content": "/RWKV-v2-RNN/train.py", + "type": "filepath" + }, + "267": { + "file_id": 12, + "content": "This code trains RWKV v2-RNN and GPT models, adjusting parameters such as batch size, learning rate, and optimizing Torch usage. 
It saves models at fixed intervals and provides configurations for both models.", + "type": "summary" + }, + "268": { + "file_id": 12, + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport logging\nimport datetime\nimport json\nfrom src.model import GPT, GPTConfig\nfrom src.trainer import Trainer, TrainerConfig\nfrom src.utils import Dataset\nimport torch\nimport numpy as np\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\n### Step 1: set training data ##########################################################################\ndatafile = \"enwik8\"\ndatafile_encoding = 'utf-8'\n# datafile_encoding = 'utf-16le'\n### Step 2: set model size #############################################################################\nctx_len = 1024 # ===> increase T_MAX in model.py if your ctx_len > 1024\nn_layer = 6\nn_embd = 512\n# 'RWKV' (better for char-level English) or 'RWKV-ffnPre' (better in some cases)", + "type": "code", + "location": "/RWKV-v2-RNN/train.py:1-29" + }, + "269": { + "file_id": 12, + "content": "Code for RWKV v2-RNN language model training. Sets up training data, model size, and provides options for encoding format. Imports necessary modules, optimizes Torch usage, and offers different model configurations.", + "type": "comment" + }, + "270": { + "file_id": 12, + "content": "model_type = 'RWKV'\n### Step 3: set batch size #############################################################################\n# ===> batch_size must be divisible by B_GROUP_FORWARD and B_GROUP_BACKWARD in model.py\n# For example, if your batch_size = 20, you can set B_GROUP_FORWARD = 4, B_GROUP_BACKWARD = 2\n# If you see \"CUDA out of memory\", reduce it. 
Use GPU-Z to find the highest value for your VRAM.\nbatch_size = 12\n### Step 4: set learning rate, training mini-epochs #######################################################\nlr_init = 6e-4\nlr_final = 1e-5\n# the mini-epoch is very short and of fixed length (ctx_len * epoch_length_fixed tokens)\nn_epoch = 500\n# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, etc.\nepoch_save_frequency = 30\nepoch_save_path = 'trained-'\nepoch_length_fixed = 10000\n########################################################################################################\n# import src.utils\n# src.utils.set_seed(42) # remember to change seed if you load a model\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)", + "type": "code", + "location": "/RWKV-v2-RNN/train.py:30-56" + }, + "271": { + "file_id": 12, + "content": "This code sets the batch size, learning rate, training mini-epochs, and other parameters for the RWKV model. The batch size should be divisible by forward and backward group sizes in model.py. If encountering \"CUDA out of memory,\" reduce it within GPU limitations. Mini-epochs have a fixed length with saved models saved every 30 mini-epochs.", + "type": "comment" + }, + "272": { + "file_id": 12, + "content": "logging.basicConfig(format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n datefmt=\"%Y-%m-%d %H:%M:%S\", level=logging.INFO,)\ngrad_norm_clip = 1.0\nwarmup_tokens = 0\nbetas = (0.9, 0.99)\neps = 4e-9\nnum_workers = 0\n########################################################################################################\n# Load data\n########################################################################################################\nprint('loading data... 
' + datafile)\ntrain_dataset = Dataset(open(\n datafile, \"r\", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)\n########################################################################################################\n# Train model\n########################################################################################################\nif __name__ == '__main__':\n model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type,\n n_layer=n_layer, n_embd=n_embd)).cuda()", + "type": "code", + "location": "/RWKV-v2-RNN/train.py:57-82" + }, + "273": { + "file_id": 12, + "content": "Loading data for training GPT model...\nTraining the GPT model with specified configuration and parameters.", + "type": "comment" + }, + "274": { + "file_id": 12, + "content": " # # # load a trained model. remember to change random seed\n # m2 = torch.load('trained-61.pth')\n # model.load_state_dict(m2)\n print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas',\n betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, )\n tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size,\n learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, grad_norm_clip=grad_norm_clip,\n warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)\n trainer = Trainer(model, train_dataset, None, tconf)\n trainer.train()\n torch.save(model.state_dict(), 'trained-' + str(n_epoch) + '-' + trainer.get_run_name() +\n '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth')", + "type": "code", + "location": "/RWKV-v2-RNN/train.py:84-98" + }, + "275": { + "file_id": 12, + "content": "This code loads a trained model, sets up the Trainer and trains it, then saves the final model. 
It also prints out various parameters for the training session such as epochs, batch size, etc.", + "type": "comment" + }, + "276": { + "file_id": 13, + "content": "/RWKV-v3/cuda/timex_cuda.cu", + "type": "filepath" + }, + "277": { + "file_id": 13, + "content": "The CUDA code computes dot products between weight matrix and input data for an RNN model, utilizing shared memory and optimized access. It performs forward pass with iterative dot products and updates variables using input matrices g and k. The code calculates RWKV-v2-RNN time step output and configures backward propagation kernel dimensions in `cuda_backward`.", + "type": "summary" + }, + "278": { + "file_id": 13, + "content": "#include \n// require T <= Tmax, T % 4 == 0, B % BF == 0, B % BB === 0 (Tmax and BF and BB are passed by compiler)\n#define F4(A, B) ((float4 *)(A))[(B) >> 2]\ntemplate \n__global__ void kernel_forward(const F *__restrict__ const __w, const F *__restrict__ const __k, F *__restrict__ const x,\n const F eps, const int B, const int C, const int T) {\n const int i = blockIdx.y;\n const int ij = (B * C) / BF;\n const int t = threadIdx.x << 2;\n __shared__ F ww[Tmax];\n __shared__ F kk[Tmax * BF];\n F4(ww, t) = F4(__w, t + T * (i % C));\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j));\n }\n __syncthreads();\n float4 s[BF];\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n s[j] = {eps, eps, eps, eps};\n }\n const F *__restrict__ const w = ww + T - t - 4;\n for (int u = 0; u <= t; u++) {\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n const F x = kk[u + Tmax * j];", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_cuda.cu:1-33" + }, + "279": { + "file_id": 13, + "content": "Kernel function for forward pass in RWKV-v2-RNN, with CUDA implementation. Uses shared memory to optimize access time. Requires T <= Tmax, B % BF == 0, and B % BB === 0. 
Initializes ww and kk arrays using w and k parameters, then sets s array to eps for each thread's j in BF. Performs a forward pass on the RNN using shared memory for efficiency.", + "type": "comment" + }, + "280": { + "file_id": 13, + "content": " s[j].x += w[u + 3] * x;\n s[j].y += w[u + 2] * x;\n s[j].z += w[u + 1] * x;\n s[j].w += w[u + 0] * x;\n }\n }\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n const F *__restrict__ const k = kk + Tmax * j;\n s[j].y += w[t + 3] * k[t + 1];\n s[j].z += w[t + 2] * k[t + 1];\n s[j].z += w[t + 3] * k[t + 2];\n s[j].w += w[t + 1] * k[t + 1];\n s[j].w += w[t + 2] * k[t + 2];\n s[j].w += w[t + 3] * k[t + 3];\n F4(x, t + T * (i + ij * j)) = s[j];\n }\n}\ntemplate \n__global__ void kernel_backward_W(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk,\n F *__restrict__ const gw, F *__restrict__ const gk,\n const int B, const int C, const int T) {\n const int i = blockIdx.y;\n const int t = threadIdx.x << 2;\n __shared__ F k[Tmax];\n __shared__ F gg[Tmax];\n F4(k, t) = F4(__k, t + T * i);\n F4(gg, t) = F4(__gwk, t + T * i);", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_cuda.cu:34-63" + }, + "281": { + "file_id": 13, + "content": "This code calculates the dot product between the weight matrix and input data, then updates the output. It performs this operation for each thread and stores the results in shared memory. The kernel function is defined to operate on a specific block of threads, where i represents the block index, and t represents the thread index within that block. 
The code uses CUDA programming features such as __restrict__ pointers, __global__ functions, and shared memory to optimize performance.", + "type": "comment" + }, + "282": { + "file_id": 13, + "content": " __syncthreads();\n float4 s = {0, 0, 0, 0};\n const F *__restrict__ const g = gg + T - t - 4;\n for (int u = 0; u <= t; u++) {\n F x = k[u];\n s.x += g[u + 3] * x;\n s.y += g[u + 2] * x;\n s.z += g[u + 1] * x;\n s.w += g[u + 0] * x;\n }\n s.y += g[t + 3] * k[t + 1];\n s.z += g[t + 2] * k[t + 1];\n s.z += g[t + 3] * k[t + 2];\n s.w += g[t + 1] * k[t + 1];\n s.w += g[t + 2] * k[t + 2];\n s.w += g[t + 3] * k[t + 3];\n F4(gw, t + T * i) = s;\n}\nvoid cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T) {\n dim3 gridDim(1, B * C / BF);\n dim3 blockDim(T >> 2);\n kernel_forward<< >>(w, k, x, eps, B, C, T);\n}\ntemplate \n__global__ void kernel_backward(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk,\n F *__restrict__ const gw, F *__restrict__ const gk,\n const int B, const int C, const int T) {", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_cuda.cu:64-93" + }, + "283": { + "file_id": 13, + "content": "This code performs a forward pass of an RNN model using CUDA. It calculates the output by summing up the contributions from each time step, taking into account the input sequence and the hidden state. 
The function `cuda_forward` sets up the grid and block dimensions for the kernel launch, while the `kernel_forward` kernel itself performs the actual computation on the GPU.", + "type": "comment" + }, + "284": { + "file_id": 13, + "content": " const int i = blockIdx.y;\n const int ij = (B * C) / BB;\n const int t = threadIdx.x << 2;\n __shared__ F w[Tmax];\n __shared__ F kk[Tmax * BB];\n __shared__ F gg[Tmax * BB];\n F4(w, t) = F4(__w, t + T * (i % C));\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j));\n F4(gg, t + Tmax * j) = F4(__gwk, t + T * (i + ij * j));\n }\n __syncthreads();\n float4 s[BB];\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n s[j] = {0, 0, 0, 0};\n }\n for (int u = 0; u <= t; u++) {\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const g = gg + Tmax * j + T - t - 4;\n F x = kk[u + Tmax * j];\n s[j].x += g[u + 3] * x;\n s[j].y += g[u + 2] * x;\n s[j].z += g[u + 1] * x;\n s[j].w += g[u + 0] * x;\n }\n }\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const k = kk + Tmax * j;\n const F *__restrict__ const g = gg + Tmax * j + T - t - 4;", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_cuda.cu:94-130" + }, + "285": { + "file_id": 13, + "content": "Code initializes shared memory arrays for weights, kernel, and input-kernel product. It then calculates thread-specific weight tensor, loads kernel and input-kernel product into shared memory, and synchronizes threads. 
Finally, it iteratively performs dot product between shared kernel and input-kernel product tensors to accumulate output tensor values in shared memory.", + "type": "comment" + }, + "286": { + "file_id": 13, + "content": " s[j].y += g[t + 3] * k[t + 1];\n s[j].z += g[t + 2] * k[t + 1];\n s[j].z += g[t + 3] * k[t + 2];\n s[j].w += g[t + 1] * k[t + 1];\n s[j].w += g[t + 2] * k[t + 2];\n s[j].w += g[t + 3] * k[t + 3];\n F4(gw, t + T * (i + ij * j)) = s[j];\n }\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n s[j] = {0, 0, 0, 0};\n }\n for (int u = t + 3; u < T; u++) {\n F x = w[u];\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const g = gg + Tmax * j + T + t - 3;\n s[j].x += g[2 - u] * x;\n s[j].y += g[3 - u] * x;\n s[j].z += g[4 - u] * x;\n s[j].w += g[5 - u] * x;\n } \n }\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const g = gg + Tmax * j + T + t - 3;\n s[j].x += g[2 - t] * w[t + 0];\n s[j].x += g[1 - t] * w[t + 1];\n s[j].x += g[0 - t] * w[t + 2];\n s[j].y += g[2 - t] * w[t + 1];\n s[j].y += g[1 - t] * w[t + 2];", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_cuda.cu:131-163" + }, + "287": { + "file_id": 13, + "content": "This code updates the values of a set of variables (s[j].x, s[j].y, s[j].z, s[j].w) based on different input matrices g and k. It utilizes unroll to optimize performance by performing multiple calculations simultaneously.", + "type": "comment" + }, + "288": { + "file_id": 13, + "content": " s[j].z += g[2 - t] * w[t + 2];\n F4(gk, t + T * (i + ij * j)) = s[j];\n }\n}\nvoid cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T) {\n dim3 gridDim(1, B * C / BB);\n dim3 blockDim(T >> 2);\n kernel_backward<< >>(w, k, gwk, gw, gk, B, C, T);\n}", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_cuda.cu:164-172" + }, + "289": { + "file_id": 13, + "content": "This code snippet is part of the RWKV-v2-RNN implementation in CUDA. 
It calculates the output of a time step and assigns it to the corresponding location in memory for gradient computation. The `cuda_backward` function configures the grid and block dimensions for a GPU kernel that performs backward propagation on a given dataset.", + "type": "comment" + }, + "290": { + "file_id": 14, + "content": "/RWKV-v3/cuda/timex_op.cpp", + "type": "filepath" + }, + "291": { + "file_id": 14, + "content": "This code defines a CUDA kernel for timex forward and backward operations in PyTorch, with the given snippet being the closing brace of a function or class definition.", + "type": "summary" + }, + "292": { + "file_id": 14, + "content": "#include \nvoid cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T);\nvoid cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T);\nvoid forward(torch::Tensor &w, const torch::Tensor &k, torch::Tensor &x, double eps, int64_t B, int64_t C, int64_t T) {\n cuda_forward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (float *)x.data_ptr(), eps, B, C, T);\n}\nvoid backward(torch::Tensor &w, const torch::Tensor &k, const torch::Tensor &gwk, torch::Tensor &gw, torch::Tensor &gk, int64_t B, int64_t C, int64_t T) {\n cuda_backward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (const float *)gwk.data_ptr(), (float *)gw.data_ptr(), (float *)gk.data_ptr(), B, C, T);\n}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"timex forward\");\n m.def(\"backward\", &backward, \"timex backward\");\n}\nTORCH_LIBRARY(timex, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_op.cpp:1-20" + }, + "293": { + "file_id": 14, + "content": "This code defines a CUDA kernel for the timex forward and backward operations, which are then exposed to PyTorch through extension modules. 
The functions take in torch tensors, perform computations on them using the CUDA kernels, and return the results.", + "type": "comment" + }, + "294": { + "file_id": 14, + "content": "}", + "type": "code", + "location": "/RWKV-v2-RNN/cuda/timex_op.cpp:21-21" + }, + "295": { + "file_id": 14, + "content": "This code snippet is the closing brace (}) for a function or class definition, marking the end of a block.", + "type": "comment" + }, + "296": { + "file_id": 15, + "content": "/RWKV-v3/run.py", + "type": "filepath" + }, + "297": { + "file_id": 15, + "content": "The code sets up an RWKV model, tokenizes prompts, and handles unknown characters using rarest tokens from vocab.json. It saves/loads the model, generates text with logit sampling, updates context, measures time per iteration, and provides debug info.", + "type": "summary" + }, + "298": { + "file_id": 15, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math\nimport time\nimport types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nfrom src.utils import TOKENIZER, Dataset\nfrom src.model_run import RWKV_RNN\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\n### Step 1: set model ##################################################################################\nctx_len = 1024\nn_layer = 6\nn_embd = 512\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'\n# your trained model\nMODEL_NAME = 'trained-1'\nWORD_NAME = 'vocab' # the .json vocab (generated by train.py\n# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <--", + "type": "code", + "location": "/RWKV-v3/run.py:1-30" + }, + "299": 
{ + "file_id": 15, + "content": "Code imports necessary libraries, sets up configurations for the RWKV language model, and defines variables to control its behavior. It also ensures that tensor computations will utilize TF32 for faster performance if possible. The code specifies a context length (ctx_len), number of layers (n_layer), and embeddings size (n_embd). The type of model is determined by the value of model_type, either 'RWKV' or 'RWKV-ffnPre'. The trained model name and vocabulary file are also defined for later use. Unknown characters in input are set to be the rarest token from the vocab.json file.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/3.json b/docs/data/3.json new file mode 100644 index 00000000..af7e7076 --- /dev/null +++ b/docs/data/3.json @@ -0,0 +1,547 @@ +{ + "300": { + "file_id": 15, + "content": "# --> all unknown tokens in your context will be denoted by it <--\nUNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity\nRUN_DEVICE = 'cpu' # 'cpu' (already very fast) or 'cuda'\nDEBUG_DEBUG = False # True False - show softmax output\n### Step 2: set context ################################################################################\ncontext = \"\\nIn the\" # ==> this is your prompt\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 500\nTEMPERATURE = 1.0\ntop_p = 0.7\ntop_p_newline = 0.9\n########################################################################################################\nprint(f'Loading {MODEL_NAME}...')\nmodel = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\n########################################################################################################\ncontext = tokenizer.refine_context(context)\nprint('\\nYour prompt has ' + str(len(context)) + ' tokens.')\nprint('\\n--> Currently the firs", + "type": "code", + "location": "/RWKV-v3/run.py:31-58" + }, + "301": { + "file_id": 15, + 
"content": "Loading the RWKV model with specified configuration and tokenizing the prompt for further processing.", + "type": "comment" + }, + "302": { + "file_id": 15, + "content": "t run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\\n')\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n t_begin = time.time_ns()\n src_len = len(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\n print(('-' * 30) + context, end='')\n model.clear()\n if TRIAL == 0:\n init_state = types.SimpleNamespace()\n for i in range(src_len):\n x = ctx[:i+1]\n if i == src_len - 1:\n init_state.out = model.run(x)\n else:\n model.run(x)\n model.save(init_state)\n else:\n model.load(init_state)\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[:i+1]\n x = x[-ctx_len:]\n if i == src_len:\n out = copy.deepcopy(init_state.out)\n else:\n out = model.run(x)\n if DEBUG_DEBUG:\n print('model', np.array(x), '==>', np.array(", + "type": "code", + "location": "/RWKV-v3/run.py:58-89" + }, + "303": { + "file_id": 15, + "content": "This code is preparing a model for processing a given context. It initializes the model, clears its state, and then runs it on the input context to generate an output. The model is saved after initialization and loaded when needed. The process repeats for each trial length specified, building upon the previous hidden state. 
The debug mode prints additional information for troubleshooting.", + "type": "comment" + }, + "304": { + "file_id": 15, + "content": " out), np.max(out), np.min(out))\n char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,\n top_p_usual=top_p, top_p_newline=top_p_newline)\n char = char.item()\n print(tokenizer.itos[int(char)], end='', flush=True)\n ctx += [char]\n t_end = time.time_ns()\n print(\"\\n----------\", round((t_end - t_begin) / (10 ** 9), 2), end='s ')", + "type": "code", + "location": "/RWKV-v3/run.py:90-98" + }, + "305": { + "file_id": 15, + "content": "This code generates text by sampling logits from a tokenizer, then prints the corresponding character and updates the context. It also measures the time taken for each iteration and outputs it in seconds.", + "type": "comment" + }, + "306": { + "file_id": 16, + "content": "/RWKV-v3/src/model.py", + "type": "filepath" + }, + "307": { + "file_id": 16, + "content": "This code utilizes PyTorch and CUDA to implement RWKV Language Model and GPT model with time-based mixing, layer normalization, attention heads, Adam optimizer, weight decay, and efficient attention retrieval for forward computations.", + "type": "summary" + }, + "308": { + "file_id": 16, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.cpp_extension import load\nimport math\nimport numpy as np\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nlogger = logging.getLogger(__name__)\nRWKV_K_CLAMP = 60 # e^60 = 1e26\nRWKV_K_EPS = 1e-8\nRWKV_HEAD_QK_DIM = 256\nprint(f'\\nRWKV_K_CLAMP {RWKV_K_CLAMP} RWKV_K_EPS {RWKV_K_EPS} RWKV_HEAD_QK_DIM 
{RWKV_HEAD_QK_DIM}\\n')\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = 1024 # increase this if your ctx_len > 1024\nB_GROUP_FORWARD = 4 # set to 8 for best performance\nB_GROUP_BACKWARD = 2 # set to 2 for best performance (sometimes 8 is faster)", + "type": "code", + "location": "/RWKV-v3/src/model.py:1-25" + }, + "309": { + "file_id": 16, + "content": "The code is for the RWKV Language Model, implemented in PyTorch with CUDA kernel. It defines constants such as `RWKV_K_CLAMP`, `RWKV_K_EPS`, and `RWKV_HEAD_QK_DIM`. The CUDA kernel limits the maximum context length to 1024, allows grouping for forward and backward passes.", + "type": "comment" + }, + "310": { + "file_id": 16, + "content": "timex_cuda = load(name=\"timex\", sources=[\"cuda/timex_op.cpp\", \"cuda/timex_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}', f'-DBF={B_GROUP_FORWARD}', f'-DBB={B_GROUP_BACKWARD}'])\nclass TimeX(torch.autograd.Function):\n @staticmethod\n def forward(ctx, w, k, B, C, T, eps):\n ctx.B = B\n ctx.C = C\n ctx.T = T\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w = w.contiguous()\n k = k.contiguous()\n ctx.save_for_backward(w, k)\n wk = torch.empty((B, C, T), device='cuda',\n memory_format=torch.contiguous_format)\n timex_cuda.forward(w, k, wk, eps, B, C, T)\n return wk\n @staticmethod\n def backward(ctx, gwk):\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w, k = ctx.saved_tensors\n gw = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda',", + "type": "code", + "location": "/RWKV-v3/src/model.py:27-50" + }, + "311": { + "file_id": 16, + "content": "This code defines a TimeX class that extends 
torch.autograd.Function for the TimeX operation. It includes a forward method to calculate the output and a backward method for gradient computation. The class initializes context variables (B, C, T) based on input arguments, performs assertions on T, B, and checks contiguity of input tensors. It then calls the timex_cuda.forward function with saved tensors, an empty tensor for wk output, and other input arguments. The backward method performs similar assertions and uses saved tensors and gradient input gwk to calculate gradients for w and k. T_MAX, B_GROUP_FORWARD, and B_GROUP_BACKWARD are constants used in assertions.", + "type": "comment" + }, + "312": { + "file_id": 16, + "content": " memory_format=torch.contiguous_format)\n gk = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda',\n memory_format=torch.contiguous_format)\n timex_cuda.backward(w, k, gwk.contiguous(), gw,\n gk, ctx.B, ctx.C, ctx.T)\n return (gw.sum(dim=0), gk, None, None, None, None)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\ndef RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module\n for m in module.modules():\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n with torch.no_grad():\n name = '[unknown weight]'\n for name, parameter in module.named_parameters(): # find the name of the weight\n if id(m.weight) == id(parameter):", + "type": "code", + "location": "/RWKV-v3/src/model.py:51-69" + }, + "313": { + "file_id": 16, + "content": "This code snippet initializes a model's weights using RWKV-v3's time and channel mixing techniques. It creates empty tensors, performs backward calculations on the weight matrix (w) and key matrix (k), and returns gradients for further processing. 
The `RWKV_Init` function initializes the linear and embedding layers of a module using specific naming conventions.", + "type": "comment" + }, + "314": { + "file_id": 16, + "content": " break\n shape = m.weight.data.shape\n gain = 1.0\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb?\n scale = 1e-4\n else:\n scale = 0\n if isinstance(m, nn.Linear):\n if m.bias is not None:\n m.bias.data.zero_()\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection?\n scale = 0.5\n if hasattr(m, 'scale_init'):\n scale = m.scale_init\n # print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name)\n gain *= scale\n if scale == -999:\n nn.init.eye_(m.weight)", + "type": "code", + "location": "/RWKV-v3/src/model.py:70-98" + }, + "315": { + "file_id": 16, + "content": "This code checks the type of a layer in the neural network model and applies different weight initialization depending on its type. If it is an Embedding or Linear layer, it adjusts the gain and scale accordingly. If it has a scale_init attribute, that value is used for initialization. 
If scale is -999, it initializes weights with identity matrix.", + "type": "comment" + }, + "316": { + "file_id": 16, + "content": " elif gain == 0:\n # zero init is great for some RWKV matrices\n nn.init.zeros_(m.weight)\n elif gain > 0:\n nn.init.orthogonal_(m.weight, gain=gain)\n else:\n nn.init.normal_(m.weight, mean=0.0, std=-scale)\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_embd = config.n_embd\n attn_sz = config.n_embd\n with torch.no_grad(): # fancy init\n self.time_curve = torch.tensor([-(config.ctx_len - 2 - i) for i in range(config.ctx_len-1)]).unsqueeze(0)\n self.time_curve = self.time_curve.to('cuda')\n ratio_0_to_1 = (layer_id / (config.n_layer - 1)) # 0 to 1\n ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n # fancy time_decay\n decay_speed = torch.ones(attn_sz, 1)\n for h in range(attn_sz):\n decay_speed[h][0] = -5 + 8 * (h / (attn_sz-1)) ** (0.7 + 1.3 * ratio_0_to_1)", + "type": "code", + "location": "/RWKV-v3/src/model.py:99-127" + }, + "317": { + "file_id": 16, + "content": "This code initializes the weights of matrices in the RWKV_TimeMix module based on a gain value. If the gain is zero, it sets the weights to zero. If the gain is positive, it uses orthogonal initialization with the specified gain. Otherwise, it uses normal initialization with a mean of 0 and a negative standard deviation. The class RWKV_TimeMix is a custom module that takes a configuration and layer ID as inputs and performs time-based curve operations for attention scores. 
It also initializes a time_curve tensor and calculates a time_decay based on the layer ID.", + "type": "comment" + }, + "318": { + "file_id": 16, + "content": " self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n # fancy time_first\n zigzag = (torch.tensor([(i+1)%3 - 1 for i in range(attn_sz)]) * 0.5).unsqueeze(1)\n self.time_first = nn.Parameter(torch.ones(attn_sz, 1) * math.log(0.3) + zigzag)\n # fancy time_mix\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(x, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(x, 0.5 * ratio_1_to_almost0))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.key = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.value = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False)", + "type": "code", + "location": "/RWKV-v3/src/model.py:128-148" + }, + "319": { + "file_id": 16, + "content": "This code initializes and sets various parameters for an attention mechanism in a model. It includes time-related parameters such as `time_decay`, `time_first`, `time_mix_k`, `time_mix_v`, `time_mix_r` and `time_shift`. The `key`, `value`, and `receptance` layers are also defined, each with a specified number of input/output dimensions. 
These parameters will be used to calculate attention scores between queries and keys, allowing for more effective information retrieval from the input sequence.", + "type": "comment" + }, + "320": { + "file_id": 16, + "content": " self.output = nn.Linear(attn_sz, config.n_embd, bias=False)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n # Mix x with the previous timestep to produce xk, xv, xr\n xx = self.time_shift(x) # self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n # Use xk, xv, xr to produce k, v, r\n k = self.key(xk).transpose(-1, -2)\n v = self.value(xv).transpose(-1, -2)\n r = self.receptance(xr)\n # RWKV_K_CLAMP can be removed if the CUDA kernel substracts the correct k_max for each k (I will do this later)\n k = torch.clamp(k, max=RWKV_K_CLAMP) # clamp k to avoid overflow\n k = torch.exp(k)\n kv = k * v\n # Compute the W-curve = [e^(-n * e^time_decay), e^(-(n-1) * e^time_decay), ..., 1, e^(time_first)]", + "type": "code", + "location": "/RWKV-v3/src/model.py:150-175" + }, + "321": { + "file_id": 16, + "content": "This code initializes the model and defines the forward function, which performs time-mixing operations on input data, then uses this mixed data to produce attention keys, values, and receptances. These are then clamped and transformed into exponential form before being combined in a weighted sum. A W-curve is also calculated for some future use that may eliminate the need for clamping.", + "type": "comment" + }, + "322": { + "file_id": 16, + "content": " self.time_w = torch.cat(\n [torch.exp(self.time_decay) * self.time_curve, self.time_first], dim=-1)\n w = torch.exp(self.time_w)\n # Use W to mix kv and k respectively. 
Add K_EPS to wk to avoid divide-by-zero\n wkv = TimeX.apply(w, kv, B, C, T, 0)\n # RWKV_K_EPS can be removed if the CUDA kernel sets 0/0 = 0 (I will do this later)\n wk = TimeX.apply(w, k, B, C, T, RWKV_K_EPS)\n rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\n rwkv = self.output(rwkv)\n return rwkv\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix\n ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))", + "type": "code", + "location": "/RWKV-v3/src/model.py:176-204" + }, + "323": { + "file_id": 16, + "content": "This code defines a RWKV_ChannelMix module that performs time-based mixing of kv and k tensors using the TimeX function. It also calculates wkv and wk tensors, applies sigmoid function, and outputs the resulting rwkv tensor. The class inherits from nn.Module and initializes with configuration parameters and layer ID. 
It includes a time_shift operation and sets fancy init for time_mix_k using torch.pow.", + "type": "comment" + }, + "324": { + "file_id": 16, + "content": " self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n hidden_sz = 4 * config.n_embd\n self.key = nn.Linear(config.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, config.n_embd, bias=False)\n self.value.scale_init = 0\n self.receptance.scale_init = 0\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv\n return rkv\n########################################################################################################\n# The GPT Model with our blocks\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):", + "type": "code", + "location": "/RWKV-v3/src/model.py:205-233" + }, + "325": { + "file_id": 16, + "content": "This code is for a GPT model implementation with time-based mixing. It initializes parameters, defines forward pass calculations, and contains a configuration class for the model. 
The model uses time shifting, mixing, key/value calculations, and sigmoid activation functions in its operations.", + "type": "comment" + }, + "326": { + "file_id": 16, + "content": " self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(config, layer_id+1000)\n else:\n self.att = RWKV_TimeMix(config, layer_id)\n self.ffn = RWKV_ChannelMix(config, layer_id)\n def forward(self, x):\n if self.layer_id == 0:\n x = self.ln0(x) \n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x)) # better in some cases\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))", + "type": "code", + "location": "/RWKV-v3/src/model.py:234-266" + }, + "327": { + "file_id": 16, + "content": "This code defines a class for the RWKV model's Block, which is an instance of nn.Module. It contains layer normalization layers and two modules: RWKV_TimeMix and RWKV_ChannelMix. The forward function applies these modules sequentially to input x after layer normalization. 
If this is the first block (layer_id == 0), it also includes an additional layer normalization and, if a specific model type is specified, applies the ffnPre module before the other modules.", + "type": "comment" + }, + "328": { + "file_id": 16, + "content": " return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.step = 0\n self.config = config\n self.emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i)\n for i in range(config.n_layer)])\n self.ln_out = nn.LayerNorm(config.n_embd)\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0\n self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n RWKV_Init(self, config)\n logger.info(\"number of parameters: %e\", sum(p.numel()\n for p in self.parameters()))", + "type": "code", + "location": "/RWKV-v3/src/model.py:267-297" + }, + "329": { + "file_id": 16, + "content": "The code defines a class called \"GPT\" which inherits from nn.Module and initializes its parameters based on the given configuration. It includes an embedding layer, multiple blocks, a normalization layer, and different linear layers for output. If RWKV_HEAD_QK_DIM is greater than 0, it also initializes extra head layers for Q and K. 
The code ends by printing the total number of parameters in the model and logging it.", + "type": "comment" + }, + "330": { + "file_id": 16, + "content": " def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, (nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=1e-5)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n # separate out all parameters to those that will and won't experience regularizing weight decay\n decay = set()\n no_decay = set()\n for mn, m in self.named_modules(): # here we disable weight_decay\n for pn, p in m.named_parameters():\n fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n no_decay.add(fpn)\n param_dict = {pn: p for pn, p in self.named_parameters()}\n inter_params = decay & no_decay\n union_params = decay | no_decay\n assert len(\n inter_params) == 0, \"parameters %s made it into both decay/no_decay sets!\" % (str(inter_params), )", + "type": "code", + "location": "/RWKV-v3/src/model.py:299-324" + }, + "331": { + "file_id": 16, + "content": "This code defines a model with functions for getting the context length, initializing weights, and configuring optimizers. 
It separates parameters into those subject to weight decay and those not, and ensures no parameter is included in both sets.", + "type": "comment" + }, + "332": { + "file_id": 16, + "content": " assert len(param_dict.keys() - union_params) == 0, \"parameters %s were not separated into either decay/no_decay set!\" \\\n % (str(param_dict.keys() - union_params), )\n optim_groups = [\n {\"params\": [param_dict[pn]\n for pn in sorted(list(no_decay))], \"weight_decay\": 0.0},\n ]\n optimizer = torch.optim.Adam(\n optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n self.step += 1\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).float()", + "type": "code", + "location": "/RWKV-v3/src/model.py:325-354" + }, + "333": { + "file_id": 16, + "content": "This code defines a model and an optimization function. The model has embedding layers, blocks, and a layer normalization layer. It also includes optional attention heads for query-key calculations. The forward function performs the model's computations based on input indexes and optionally produces output from attention heads. 
The optimizer sets up the Adam optimizer for training with specified learning rate and betas.", + "type": "comment" + }, + "334": { + "file_id": 16, + "content": " x = self.head(x) + c\n else:\n x = self.head(x)\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1))\n return x, loss", + "type": "code", + "location": "/RWKV-v3/src/model.py:355-363" + }, + "335": { + "file_id": 16, + "content": "This code calculates the output of a model and optionally computes a cross-entropy loss if targets are provided. If targets are None, it just returns the output. The head layer is used to process the input 'x'.", + "type": "comment" + }, + "336": { + "file_id": 17, + "content": "/RWKV-v3/src/model_run.py", + "type": "filepath" + }, + "337": { + "file_id": 17, + "content": "This code initializes a transformer model for RWKV language implementation with time-shifted inputs, utilizing convolutions and attention mechanisms. It performs layer normalization, self-attention, feed-forward operations on input 'x' using layers from the 'w' object and returns results after applying block-specific weights and calculations for context-aware generation in RWKV-v3 model.", + "type": "summary" + }, + "338": { + "file_id": 17, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport types\nimport copy\nimport torch\nimport math\nfrom torch.nn import functional as F\nimport torch.nn as nn\nRWKV_K_CLAMP = 60\nRWKV_K_EPS = 1e-8\nRWKV_HEAD_QK_DIM = 256\nprint(f'\\nRWKV_K_CLAMP {RWKV_K_CLAMP} RWKV_K_EPS {RWKV_K_EPS} RWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\\n')\nDEBUG_TIME = False # True False - show trained 
time-coeffs\n############################################################################################################\nRWKV_CFG = types.SimpleNamespace()\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.time_mix_k = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))\n self.time_mix_r = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:1-30" + }, + "339": { + "file_id": 17, + "content": "The code snippet is part of the RWKV language model implementation. It defines constants and a class for channel mixing operations within the model. The `RWKV_CFG` namespace holds various configuration values, and the `RWKV_ChannelMix` class represents a module with time-based channel mixing functionality using time shift, key, and query mixing parameters.", + "type": "comment" + }, + "340": { + "file_id": 17, + "content": " hidden_sz = 4 * RWKV_CFG.n_embd\n self.key = nn.Linear(RWKV_CFG.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv\n return rkv\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_decay = nn.Parameter(torch.ones(RWKV_CFG.n_embd, 1))\n self.time_curve = torch.tensor([-(RWKV_CFG.ctx_len - 2 - i) for i in range(RWKV_CFG.ctx_len-1)]).unsqueeze(0)\n self.time_first = nn.Parameter(torch.ones(RWKV_CFG.n_embd, 1) * math.log(0.3))\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))", + "type": "code", + "location": 
"/RWKV-v3/src/model_run.py:32-57" + }, + "341": { + "file_id": 17, + "content": "RWKV-v3 model's forward function computes key-value pairs based on input tensor, using linear layers and element-wise operations. RWKV_TimeMix class initializes parameters for time-related operations in the model.", + "type": "comment" + }, + "342": { + "file_id": 17, + "content": " self.time_mix_k = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.time_mix_v = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.time_mix_r = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.key = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.output = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk).transpose(-1, -2)\n v = self.value(xv).transpose(-1, -2)\n r = self.receptance(xr)\n k = torch.clamp(k, max=RWKV_K_CLAMP)\n k = torch.exp(k)\n kv = k * v\n sel", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:58-85" + }, + "343": { + "file_id": 17, + "content": "This code initializes parameters for a transformer model and defines its forward pass. 
It uses time-shifted inputs, applies multiplication with learnable mixing factors, and feeds them into separate key, value, and receptance linear layers before clamping the keys, applying exponential function, and element-wise multiplying with values to obtain kv outputs.", + "type": "comment" + }, + "344": { + "file_id": 17, + "content": "f.time_w = torch.cat([torch.exp(self.time_decay) * self.time_curve.to(self.time_decay.device), self.time_first], dim=-1)\n w = torch.exp(self.time_w)\n w = w[:,-T:].unsqueeze(1)\n wkv = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(kv), w, groups=C)\n wk = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w, groups=C) + RWKV_K_EPS\n rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\n rwkv = self.output(rwkv)\n return rwkv\nclass Block(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(RWKV_CFG.n_embd)\n self.ln2 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(layer_id+1000)\n else:\n self.att = RWKV_TimeMix(layer_id)\n self.ffn = RWKV_ChannelMix(layer_id)\n def forward(self, x):\n if self.layer_id == 0:", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:85-115" + }, + "345": { + "file_id": 17, + "content": "Code snippet defines a `Block` class and its forward pass for a transformer model. The block consists of layer normalizations, an attention mechanism (`RWKV_TimeMix`), feed-forward network (`RWKV_ChannelMix`), and optional pre-feed-forward layer (`RWKV_ffnPre`) for the first block only. The time dimension is handled by `time_decay`, `time_curve`, and `time_first` variables, which are used to compute the weights for the convolutions. These weights are then applied to the input through 1D convolutions (`F.conv1d`) after padding the inputs with `nn.ZeroPad2d`. 
Finally, the output is passed through an activation function (`torch.sigmoid`) and a final layer normalization before being returned.", + "type": "comment" + }, + "346": { + "file_id": 17, + "content": " x = self.ln0(x)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n return x\nclass RWKV_GPT(nn.Module):\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, ctx_len):\n global RWKV_CFG\n super().__init__()\n RWKV_CFG.RUN_DEVICE = RUN_DEVICE\n RWKV_CFG.model_type = model_type\n RWKV_CFG.vocab_size = vocab_size\n RWKV_CFG.n_layer = n_layer\n RWKV_CFG.n_embd = n_embd\n RWKV_CFG.ctx_len = ctx_len\n print('\\nloading RWKV-GPT', MODEL_NAME)\n self.emb = nn.Embedding(vocab_size, n_embd)\n self.blocks = nn.Sequential(*[Block(i) for i in range(n_layer)])\n self.ln_out = nn.LayerNorm(n_embd)\n self.head = nn.Linear(n_embd, vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:116-146" + }, + "347": { + "file_id": 17, + "content": "The code snippet is a part of the RWKV-GPT class, which inherits from nn.Module in PyTorch. The class defines the architecture of the RWKV model, including embedding layer, layers with residual connections and normalization, and output layers. It takes inputs like MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, and ctx_len as parameters. 
The code block defines the initialization of the model components and applies layer normalization and linear transformations for the input and output layers.", + "type": "comment" + }, + "348": { + "file_id": 17, + "content": " self.head_q.scale_init = 0\n self.head_k = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(ctx_len, ctx_len)))\n self.ctx_len = ctx_len\n self.eval()\n self.load_state_dict(torch.load(MODEL_NAME + '.pth'))\n self.eval()\n def forward(self, idx):\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).float()\n x = self.head(x) + c\n else:\n x = self.head(x) \n return x", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:147-177" + }, + "349": { + "file_id": 17, + "content": "This code initializes a model for the RWKV-v3 language model. It sets the head_q scale init and head_k's scale init, registers a copy mask, assigns ctx_len, loads state from a model file, and defines a forward function that performs forward propagation on input idx. 
If RWKV_HEAD_QK_DIM is greater than 0, it computes the context vector c using attention between query q and key k, masks self-attention with copy_mask, and adds c to head output before returning x.", + "type": "comment" + }, + "350": { + "file_id": 17, + "content": "############################################################################################################\nclass RWKV_RNN():\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len):\n self.RUN_DEVICE = RUN_DEVICE\n self.model_type = model_type\n self.n_layer = n_layer\n self.n_embd = n_embd\n self.ctx_len = ctx_len\n self.w = types.SimpleNamespace()\n w = torch.load(MODEL_NAME + '.pth',\n map_location=torch.device(RUN_DEVICE))\n for x in w.keys():\n if '.time_' in x:\n w[x] = w[x].squeeze()\n if '.time_decay' in x:\n w[x] = torch.exp(-torch.exp(w[x]))\n if '.time_first' in x:\n w[x] = torch.exp(w[x])\n if DEBUG_TIME and '.time_' in x:\n print(x, w[x].squeeze().cpu().numpy())\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):\n if xx[i].isdigit():\n ii = int(xx[i])", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:179-207" + }, + "351": { + "file_id": 17, + "content": "The code defines a class called `RWKV_RNN` and initializes its attributes with provided parameters. It loads the model weights from the specified file, performing transformations on certain keys if required. 
Debugging options are also available for time-related variables.", + "type": "comment" + }, + "352": { + "file_id": 17, + "content": " if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):\n if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.clear()\n def clear(self):\n self.xx = {}\n self.aa = {}\n self.bb = {}\n self.hk = None\n def save(self, target):\n target.xx = copy.deepcopy(self.xx)\n target.aa = copy.deepcopy(self.aa)\n target.bb = copy.deepcopy(self.bb)\n target.hk = copy.deepcopy(self.hk)\n def load(self, target):\n self.xx = copy.deepcopy(target.xx)\n self.aa = copy.deepcopy(target.aa)\n self.bb = copy.deepcopy(target.bb)", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:208-238" + }, + "353": { + "file_id": 17, + "content": "This code creates a hierarchical object structure using SimpleNamespace and dictionaries. It can be used to store and retrieve data in a nested manner. 
The clear method resets the stored data, while save and load methods allow copying the state of one instance to another.", + "type": "comment" + }, + "354": { + "file_id": 17, + "content": " self.hk = copy.deepcopy(target.hk)\n def LN(self, xx, w):\n return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)\n def FF(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)\n k = torch.square(torch.relu(w.key.weight @ xk))\n kv = w.value.weight @ k\n return r * kv\n def SA(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xv = xx * w.time_mix_v + self.xx[name] * (1 - w.time_mix_v)", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:239-264" + }, + "355": { + "file_id": 17, + "content": "The code defines three functions: `hk`, `LN`, and `FF`. The `hk` function copies the target's hk attribute. The `LN` function performs layer normalization on the input `xx` with provided weights `w`. The `FF` function implements a feed-forward layer, where it mixes `xx` with previous `name` values and applies sigmoid and relu functions before multiplying by weights. 
It also initializes `sa`, `aa`, and `bb` variables for subsequent SA operation.", + "type": "comment" + }, + "356": { + "file_id": 17, + "content": " xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)\n k = torch.exp(torch.clamp(w.key.weight @ xk, max=RWKV_K_CLAMP))\n v = w.value.weight @ xv\n kv = k * v\n a = self.aa[name] + w.time_first * kv\n b = self.bb[name] + w.time_first * k\n self.aa[name] = w.time_decay * self.aa[name] + kv\n self.bb[name] = w.time_decay * self.bb[name] + k\n rwkv = r * a / (b + RWKV_K_EPS)\n return w.output.weight @ rwkv\n def run(self, ctx):\n w = self.w\n x = w.emb.weight[ctx[-1]]\n for i in range(self.n_layer):\n if i == 0:\n x = self.LN(x, w.blocks[i].ln0)\n if i == 0 and self.model_type == 'RWKV-ffnPre':\n x = x + self.FF(self.LN(x, w.blocks[i].ln1), w.blocks[i].ffnPre, f'ffnPre.{i}')\n else:\n x = x + self.SA(self.LN(x, w.blocks[i].ln1), w.blocks[i].att, f'att.{i}')\n x = x + self.FF(self.LN(x, w.blocks[i].ln2), w.blocks[i].ffn, f'ffn.{i}')", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:265-294" + }, + "357": { + "file_id": 17, + "content": "This code performs a sequence of operations on the input 'x' using layers from the 'w' object. It applies layer normalization (LN), self-attention (SA), and feed-forward network (FF) for each block in the model. 
The result is then returned after applying weights and calculations specific to each block and layer.", + "type": "comment" + }, + "358": { + "file_id": 17, + "content": " x = self.LN(x, w.ln_out)\n if RWKV_HEAD_QK_DIM > 0:\n if self.hk == None:\n self.hk = (w.head_k.weight @ x).unsqueeze(0)\n else:\n self.hk = torch.cat(\n [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0)\n if self.hk.shape[0] > self.ctx_len:\n self.hk = self.hk[-self.ctx_len:, :]\n q = w.head_q.weight @ x\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n c = (self.hk @ q) / RWKV_HEAD_QK_DIM\n for i in range(len(c)):\n x[ctx[i]] += c[i]\n else:\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n return x", + "type": "code", + "location": "/RWKV-v3/src/model_run.py:296-319" + }, + "359": { + "file_id": 17, + "content": "This code is part of the RWKV-v3 model and performs attention calculations for context-aware generation. It uses a linear layer (LN) to normalize input x with w.ln_out, calculates attention vectors q and x, and stores them in self.hk. If RWKV_HEAD_QK_DIM is greater than 0, it performs attention calculations; otherwise, it skips the process. The output x is converted to a list and returned.", + "type": "comment" + }, + "360": { + "file_id": 18, + "content": "/RWKV-v3/src/trainer.py", + "type": "filepath" + }, + "361": { + "file_id": 18, + "content": "The Trainer class enables CUDA optimizations and handles data loading for training a language model. 
It iterates over the data, updates parameters, decays learning rate if necessary, logs progress, saves checkpoints, and manages learning rate decay with a tokens counter.", + "type": "summary" + }, + "362": { + "file_id": 18, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.data.dataloader import DataLoader\nfrom torch.optim.lr_scheduler import LambdaLR\nfrom torch.nn import functional as F\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch\nfrom tqdm.auto import tqdm\nimport numpy as np\nimport logging\nimport os\nimport datetime\nimport sys\nimport math\n# import wandb # comment this if you don't have wandb\n# print('logging to wandb... (comment it if you don\\'t have wandb)')\nlogger = logging.getLogger(__name__)\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nlog_file = open(\"mylog.txt\", \"a\")\nclass TrainerConfig:\n max_epochs = 10\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:1-36" + }, + "363": { + "file_id": 18, + "content": "The code imports necessary libraries for training a language model, sets some default hyperparameters such as maximum epochs and batch size, and creates a TrainerConfig class to manage these settings. 
The code also enables CUDA optimizations and opens a log file for output.", + "type": "comment" + }, + "364": { + "file_id": 18, + "content": " lr_decay = True # linear warmup followed by cosine decay\n warmup_tokens = 0\n final_tokens = 0\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Trainer:\n def __init__(self, model, train_dataset, test_dataset, config):\n self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.steps = 0\n if 'wandb' in sys.modules:\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' +\n datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)\n self.device = 'cpu'\n if torch.cuda.is_available(): # take over whatever gpus are on the system", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:37-67" + }, + "365": { + "file_id": 18, + "content": "This code initializes a Trainer class with parameters for model, train and test datasets, and config. It also includes options for learning rate decay, warmup and final tokens, epoch save frequency, number of data loader workers, and device usage. 
The code checks if the WandB module is available and sets up wandb initialization with combined configuration from model and user input.", + "type": "comment" + }, + "366": { + "file_id": 18, + "content": " self.device = torch.cuda.current_device()\n def get_run_name(self):\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \\\n cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd)\n return run_name\n def train(self):\n model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n if config.num_workers > 0:\n loader = DataLoader(data, shuffle=False, pin_memory=True,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n else:", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:68-92" + }, + "367": { + "file_id": 18, + "content": "The code initializes the device for CUDA operations, generates a unique run name based on model configuration, and defines a function to run an epoch. 
It also configures optimizers according to the model's parameters and handles data loading for training and testing with specified configurations.", + "type": "comment" + }, + "368": { + "file_id": 18, + "content": " loader = DataLoader(data, shuffle=False,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n pbar = tqdm(enumerate(loader), total=len(\n loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n for it, (x, y) in pbar:\n x = x.to(self.device) # place data on the correct device\n y = y.to(self.device)\n with torch.set_grad_enabled(is_train):\n _, loss = model(x, y) # forward the model\n if is_train: # backprop and update the parameters\n model.zero_grad()\n loss.backward()\n if config.grad_norm_clip > 0:\n torch.nn.utils.clip_grad_norm_(\n model.parameters(), config.grad_norm_clip)\n optimizer.step()\n if config.lr_decay: # decay the learning rate based on our progress", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:93-117" + }, + "369": { + "file_id": 18, + "content": "This code initializes a DataLoader for loading data in batches, creating a progress bar (pbar) to track progress, and iterates over the data. It then places the data on the correct device, forwards the model, computes loss, backpropagates if training, updates parameters, clips gradients if necessary, and optionally decays learning rate based on progress.", + "type": "comment" + }, + "370": { + "file_id": 18, + "content": " # number of tokens processed this step (i.e. 
label is not -100)\n self.tokens += (y >= 0).sum()\n lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + \\\n (1 - lr_final_factor) * float(self.tokens) / \\\n float(config.warmup_tokens)\n progress = 0\n else:\n # exponential learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))\n if progress >= 1:\n lr_mult = lr_final_factor\n else:\n lr_mult = math.exp(math.log(lr_final_factor) * pow(progress, 1))", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:118-133" + }, + "371": { + "file_id": 18, + "content": "This code segment is responsible for determining the learning rate multiplier during training. It first checks if the number of processed tokens is less than the warmup token count. If so, it performs linear warmup by interpolating between the final learning rate and the initial learning rate based on the proportion of processed tokens to warmup tokens. 
Otherwise, it calculates the exponential learning rate decay by using a progress variable representing the fraction of completed tokens beyond the warmup phase.", + "type": "comment" + }, + "372": { + "file_id": 18, + "content": " lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n else:\n lr = config.learning_rate\n now_loss = loss.item() # report progress\n self.lr = lr\n if 'wandb' in sys.modules:\n wandb.log({\"loss\": now_loss},\n step=self.steps * self.config.batch_size)\n self.steps += 1\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * \\\n (1.0 - factor) + now_loss * factor\n pbar.set_description(\n f\"mini-epoch {epoch+1} prog {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}\")", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:134-155" + }, + "373": { + "file_id": 18, + "content": "This code updates the learning rate (lr) based on a config file and adjusts the loss, average loss, and progress during training. 
It also logs the loss to WandB and updates the progress bar description with relevant information like loss, lr, etc.", + "type": "comment" + }, + "374": { + "file_id": 18, + "content": " self.tokens = 0 # counter used for learning rate decay\n for epoch in range(config.max_epochs):\n run_epoch('train')\n log_file.write(\n f'{epoch+1} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} \\n')\n log_file.flush()\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n # DataParallel wrappers keep raw model object in .module\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n torch.save(raw_model.state_dict(),\n self.config.epoch_save_path + str(epoch+1) + '.pth')", + "type": "code", + "location": "/RWKV-v3/src/trainer.py:157-171" + }, + "375": { + "file_id": 18, + "content": "This code initializes a tokens counter for learning rate decay, trains the model for specified epochs, logs progress, and saves model checkpoints at user-specified intervals or at the end of training.", + "type": "comment" + }, + "376": { + "file_id": 19, + "content": "/RWKV-v3/src/utils.py", + "type": "filepath" + }, + "377": { + "file_id": 19, + "content": "This code defines a custom dataset class and tokenizer for RWKV language model, initializing objects with dictionaries and functions. 
It handles text generation using top-p sampling with temperature parameter and includes conversion functions for input values.", + "type": "summary" + }, + "378": { + "file_id": 19, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport json\nimport random\nimport time\nimport math\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset\nclass Dataset(Dataset):\n def __init__(self, data, ctx_len, epoch_length_fixed):\n print('building token list...', end=' ')\n unique = sorted(list(set(data)))\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n data_size, vocab_size = len(data), len(unique)\n print('data has %d tokens, %d unique.' % (data_size, vocab_size))", + "type": "code", + "location": "/RWKV-v3/src/utils.py:1-34" + }, + "379": { + "file_id": 19, + "content": "The code defines a custom dataset class for the RWKV language model, which reads in data and builds a token list. It then stores the unique tokens in a dictionary-like format and writes it to a 'vocab.json' file. 
The function prints the total number of tokens in the data and the number of unique tokens.", + "type": "comment" + }, + "380": { + "file_id": 19, + "content": " self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n self.ctx_len = ctx_len\n self.epoch_length_fixed = epoch_length_fixed\n self.vocab_size = vocab_size\n self.data = data\n def __len__(self):\n return self.epoch_length_fixed\n def __getitem__(self, idx):\n # cheat: pick a random spot in dataset\n i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))\n chunk = self.data[i:i+self.ctx_len+1]\n dix = [self.stoi[s] for s in chunk]\n x = torch.tensor(dix[:-1], dtype=torch.long,\n device=torch.device('cuda'))\n y = torch.tensor(dix[1:], dtype=torch.long,\n device=torch.device('cuda'))\n return x, y\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)\n self.vocab_size = len(self.word_table)", + "type": "code", + "location": "/RWKV-v3/src/utils.py:35-62" + }, + "381": { + "file_id": 19, + "content": "The code above initializes an object for a tokenizer that converts text data into numerical representations. The object contains dictionaries mapping characters to indices (stoi) and indices to characters (itos), context length, fixed epoch length, vocabulary size, and the actual text data. It also provides functions for getting elements at specific index and calculating lengths of the tokenizer instance. 
The tokenizer class is initialized with a Word Name file and an optional Unknown Character.", + "type": "comment" + }, + "382": { + "file_id": 19, + "content": " self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(torch.tensor(out), dim=-1)\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n sorted_probs, s_index = torch.sort(probs, descending=True)\n # for j in range(30):\n # pp = sorted_probs[j].item()", + "type": "code", + "location": "/RWKV-v3/src/utils.py:64-95" + }, + "383": { + "file_id": 19, + "content": "Function `refine_context` strips and filters context strings.\n\"sample\\_logits\" calculates softmax probs, applies top\\_p if last char is newline, sorts probs, then...", + "type": "comment" + }, + "384": { + "file_id": 19, + "content": " # if pp < 0.005:\n # break\n # ss = self.itos[int(s_index[j])].replace('\\n','_')\n # print(f'{math.floor(pp*100):>3.0f}{ss}', end='')\n # print('')\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n # print(\"[\" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + \"]\", end = \"\")\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n return torch.multinomial(probs, num_samples=1)[0]\ndef to_float(x):\n return 
x.cpu().detach().numpy().flatten()[0].astype(float)\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)", + "type": "code", + "location": "/RWKV-v3/src/utils.py:96-122" + }, + "385": { + "file_id": 19, + "content": "This code snippet is from the RWKV-LM project's \"utils.py\" file, and it appears to handle text generation using top-p sampling with a temperature parameter. The function generates a single token based on the given input and calculates cumulative probabilities. It then determines a cutoff value for the probabilities and sets any values below that cutoff to zero. If the temperature is not set to 1.0, it applies power normalization to the probabilities. Finally, it uses torch.multinomial to select one token based on the modified probabilities and returns it. The code also includes a to_float function for converting input values to floats and a set_seed function for setting random number generator seeds.", + "type": "comment" + }, + "386": { + "file_id": 20, + "content": "/RWKV-v3/train.py", + "type": "filepath" + }, + "387": { + "file_id": 20, + "content": "This code trains an RWKV-LM and GPT language model, optimizes GPU memory usage, saves partial models, and trains with specified parameters. The trained model's state dictionary is saved with identifiers for file identification.", + "type": "summary" + }, + "388": { + "file_id": 20, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\n# if False: # True False ---> Set to False if you don't understand it\n# print(\"\\n\\n[[[ SPECIAL DEBUG MODE FOR MYSELF. 
DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\\n\\n\")\n# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n# import src.utils\n# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)\nimport logging\nimport datetime\nfrom src.model import GPT, GPTConfig\nfrom src.trainer import Trainer, TrainerConfig\nfrom src.utils import Dataset\nimport torch\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nlogging.basicConfig(format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",", + "type": "code", + "location": "/RWKV-v3/train.py:1-22" + }, + "389": { + "file_id": 20, + "content": "The code is importing necessary modules and setting up the environment for training a language model called RWKV-LM. It also includes a special debug mode option that can be enabled but is currently set to False, and it sets seed for deterministic training. The code imports classes from other files, including GPT model and TrainerConfig for configuration settings. 
The logging module is configured to display specific information in the console.", + "type": "comment" + }, + "390": { + "file_id": 20, + "content": " datefmt=\"%Y-%m-%d %H:%M:%S\", level=logging.INFO,)\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\n### Step 1: set training data ##########################################################################\ndatafile = \"../data/enwik8\" # your data\ndatafile_encoding = 'utf-8'\n# datafile_encoding = 'utf-16le'\n### Step 2: set model size #############################################################################\n# ----> test deeper models (n_layer at least 12) to see the advantage of RWKV-3 over RWKV-2\nctx_len = 1024 # increase T_MAX in model.py if your ctx_len > 1024\nn_layer = 6\nn_embd = 512\n# 'RWKV' (better for English) or 'RWKV-ffnPre' (better in some cases)\nmodel_type = 'RWKV'\n# ---> there is a RWKV_HEAD_QK_DIM in model.py and model_run.py\n# set it to 256, then it's using my headQK trick (similar to a tiny attention) to improve loss\n# set it to 0, then it's a pure RNN (attention-free)\n### Step 3: set batch size #############################################################################", + "type": "code", + "location": "/RWKV-v3/train.py:23-48" + }, + "391": { + "file_id": 20, + "content": "This code sets up training parameters for the RWKV-v3 language model. It specifies the data file, model size, batch size, and enables CUDA features for efficient GPU utilization. The model type can be either 'RWKV' or 'RWKV-ffnPre', and there is an optional headQK dimension to improve loss. 
This configuration allows for training deeper models with better performance.", + "type": "comment" + }, + "392": { + "file_id": 20, + "content": "# ---> batch_size must be divisible by B_GROUP_FORWARD and B_GROUP_BACKWARD in model.py\n# for example, if your batch_size = 20, you can set B_GROUP_FORWARD = 4, B_GROUP_BACKWARD = 2\n# if you see \"CUDA out of memory\", reduce batch_size. Use nvidia-smi to find the highest value for your GPU.\nbatch_size = 12\n### Step 4: set learning rate, number of mini-epochs #######################################################\n# By default we are using exponential LR decay.\n#\n# Here are my suggestions for training a good model.\n# Let's say you will train a L6-D512 model.\n# 1) Set lr_init = lr_final = 8e-4. Let it run for some mini-epochs, until the improvement of loss become slow.\n# 2) Check epoch_save_frequency and make sure the partially-trained model is saved. Ctrl+C to stop the run.\n# 3) Set lr_init = 8e-4, lr_final = 1e-5, warmup_tokens = ctx_len * batch_size * 50, betas = (0.9, 0.999).\n# 4) Search for \"torch.load\" here and modify it to load the partially-trained model. Continue the training.\n# \n# For L12-D768, set lr_init = 6e-4. For L24-D1024, set lr_init = 4e-4. For L24-D2048, set lr_init = 3e-4.", + "type": "code", + "location": "/RWKV-v3/train.py:50-65" + }, + "393": { + "file_id": 20, + "content": "This code snippet provides guidance for setting batch_size, learning rate, and the number of mini-epochs in RWKV-v3's training script. It suggests adjusting batch_size to optimize GPU memory usage and offers suggestions on how to set learning rate values for different model sizes. 
The comments also advise saving a partially trained model for resuming later, with instructions on how to implement this.", + "type": "comment" + }, + "394": { + "file_id": 20, + "content": "lr_init = 8e-4 # we can use larger lr because of preLN\nlr_final = 1e-5\n# the mini-epoch is very short and of fixed length (length = ctx_len * epoch_length_fixed tokens)\nn_epoch = 500\nepoch_length_fixed = 10000\n# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...\nepoch_save_frequency = 10\nepoch_save_path = 'trained-'\n########################################################################################################\ngrad_norm_clip = 1.0\nwarmup_tokens = ctx_len * batch_size * 0\nbetas = (0.9, 0.99)\neps = 4e-9\nnum_workers = 0\n########################################################################################################\n# Load data\n########################################################################################################\nprint('loading data... ' + datafile)\ntrain_dataset = Dataset(open(\n datafile, \"r\", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)\n########################################################################################################", + "type": "code", + "location": "/RWKV-v3/train.py:67-96" + }, + "395": { + "file_id": 20, + "content": "This code initializes learning rate, defines training parameters, loads the dataset, and sets up data processing for training RWKV language model. 
It specifies learning rate initialization, final value, number of epochs, fixed batch length, save frequency, gradient norm clipping, warmup tokens, betas for Adam optimizer, epsilon for stability, and number of workers for data loading.", + "type": "comment" + }, + "396": { + "file_id": 20, + "content": "# Train model\n########################################################################################################\nif __name__ == '__main__':\n model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type,\n n_layer=n_layer, n_embd=n_embd)).cuda()\n ### ---> load a trained model <---\n # m2 = torch.load('trained-61.pth')\n # model.load_state_dict(m2)\n print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas',\n betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, )\n tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size,\n learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, grad_norm_clip=grad_norm_clip,\n warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)", + "type": "code", + "location": "/RWKV-v3/train.py:97-112" + }, + "397": { + "file_id": 20, + "content": "This code snippet is responsible for training a model using the GPT architecture. 
It loads a previously trained model, specifies the trainer configuration, and then proceeds to train the model with the specified number of epochs, batch size, learning rate, and other hyperparameters.", + "type": "comment" + }, + "398": { + "file_id": 20, + "content": " trainer = Trainer(model, train_dataset, None, tconf)\n trainer.train()\n torch.save(model.state_dict(), 'trained-' + str(n_epoch) + '-' + trainer.get_run_name() +\n '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth')", + "type": "code", + "location": "/RWKV-v3/train.py:113-118" + }, + "399": { + "file_id": 20, + "content": "Saving the model's state dictionary after training, including epoch number and run name, with a timestamp for file identification.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/4.json b/docs/data/4.json new file mode 100644 index 00000000..3c84aa61 --- /dev/null +++ b/docs/data/4.json @@ -0,0 +1,545 @@ +{ + "400": { + "file_id": 21, + "content": "/RWKV-v3/verify.py", + "type": "filepath" + }, + "401": { + "file_id": 21, + "content": "RWKV Language Model is verified using GPT architecture, creating RWKV-GPT and RWKV-RNN models. Context data, input length, and model forward execution are performed on context tensor. Padding and batching done for compatibility with forward and backward groups. 
Model training forward pass executed, detaching and moving results to CPU before printing.", + "type": "summary" + }, + "402": { + "file_id": 21, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\n# this is for verifying the results of different models and make sure they agree with each other\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nimport os\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\nRUN_DEVICE = 'cuda'\nimport torch\nfrom src.model_run import RWKV_RNN, RWKV_GPT\nfrom src.model import GPT, GPTConfig\nctx_len = 1024\nn_layer = 6\nn_embd = 512\nmodel_type = 'RWKV'\nmodel_name = 'trained-1'\nfrom src.utils import TOKENIZER\ntokenizer = TOKENIZER('vocab', UNKNOWN_CHAR=' ')\n########################################################################################################\nmodel_train = GPT(GPTConfig(tokenizer.vocab_size, ctx_len, model_type=model_type, n_layer=n_layer, n_embd=n_embd)).cuda()\nprint('loading ' + model_name)", + "type": "code", + "location": "/RWKV-v3/verify.py:1-31" + }, + "403": { + "file_id": 21, + "content": "Loading and configuring the RWKV Language Model (RWKV-LM) for verification, using GPT architecture with specific configurations and tokenizer.", + "type": "comment" + }, + "404": { + "file_id": 21, + "content": "m2 = torch.load(model_name + '.pth', map_location=RUN_DEVICE)\nmodel_train.load_state_dict(m2)\nmodel_rnn = RWKV_RNN(model_name, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)\nmodel_gpt = RWKV_GPT(model_name, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()\n########################################################################################################\ncontext = '\\nIn a'\nctx = [tokenizer.stoi.get(s, 
tokenizer.UNKNOWN_CHAR) for s in context]\nprint(f'input len {len(ctx)} data {ctx}')\n########################################################################################################\nprint('\\nRWKV-GPT output')\nout = model_gpt.forward(torch.tensor(ctx).unsqueeze(0).cuda())[0].detach().cpu().numpy()\nprint(out)\nprint('\\nRWKV-RNN output')\nmodel_rnn.clear()\nsrc_len = len(ctx)\nfor i in range(src_len):\n x = ctx[:i+1]\n out = model_rnn.run(x)\n if i < 3 or i >= src_len - 3:\n print(torch.tensor(out).detach().cpu().numpy())\n if i == 2:\n print('...')\nprint('\\nRWKV-train output')", + "type": "code", + "location": "/RWKV-v3/verify.py:32-61" + }, + "405": { + "file_id": 21, + "content": "Loading the model from a checkpoint file and creating both RWKV-GPT and RWKV-RNN models.\nPrinting input length and data for context.\nOutput of RWKV-GPT model using forward function on context tensor.\nOutput of RWKV-RNN model running on context with select indices printed.\nOutput of the train model running on context with select indices printed.", + "type": "comment" + }, + "406": { + "file_id": 21, + "content": "ctx += [0] * (ctx_len - src_len) # pad to ctx_len\nctx = [ctx] * 4 # increase batch size (to make it work with B_GROUP_FORWARD & B_GROUP_BACKWARD)\nout = model_train.forward(torch.tensor(ctx).cuda())[0][0][:src_len].detach().cpu().numpy()\nprint(out, '\\n')", + "type": "code", + "location": "/RWKV-v3/verify.py:62-65" + }, + "407": { + "file_id": 21, + "content": "Padding the context list to match the length of source, then duplicating it to increase batch size for compatibility with forward and backward groups. 
Performs model training forward pass on padded context, detaches and moves result to CPU before printing.", + "type": "comment" + }, + "408": { + "file_id": 22, + "content": "/RWKV-v4/cuda/wkv_cuda.cu", + "type": "filepath" + }, + "409": { + "file_id": 22, + "content": "RWKV-v4's CUDA kernel performs RNN operations, gradient calculation, and avoids overflow with matrix multiplication. Efficient parallel execution techniques are used in cuda_forward and cuda_backward functions, optimizing register usage and ensuring proper block and thread configurations for GPU execution.", + "type": "summary" + }, + "410": { + "file_id": 22, + "content": "#include \n#include \n#define MIN_VALUE (-1e38)\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,\n F *__restrict__ const _y) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n F *__restrict__ const y = _y + _offset;\n F p = 0, q = 0, o = MIN_VALUE;\n // p and q are running sums divided by exp(o) (to avoid overflows)\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n F no = max(o, u + k[ii]);\n F A = exp(o - no);\n F B = exp(u + k[ii] - no);\n y[ii] = (A * p + B * v[ii]) / (A * q + B);", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_cuda.cu:1-29" + }, + "411": { + "file_id": 22, + "content": "This code snippet defines a CUDA kernel function for the forward pass of a recurrent neural network (RNN). The function takes input and weight tensors as arguments, performs matrix multiplications, and accumulates results using running sums to avoid overflows. 
The result is stored in output tensor y.", + "type": "comment" + }, + "412": { + "file_id": 22, + "content": " no = max(w + o, k[ii]);\n A = exp(w + o - no);\n B = exp(k[ii] - no);\n p = A * p + B * v[ii];\n q = A * q + B;\n o = no;\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _gy,\n F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, F *__restrict__ const _gv) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n const F *__restrict__ const gy = _gy + _offset;\n F *__restrict__ const gk = _gk + _offset;\n F *__restrict__ const gv = _gv + _offset;", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_cuda.cu:31-56" + }, + "413": { + "file_id": 22, + "content": "Kernel function for backward propagation in RWKV-v4. Calculates gradients for weight matrix w, input u and key matrix k. 
Uses blockIdx and threadIdx to calculate offsets for accessing data.", + "type": "comment" + }, + "414": { + "file_id": 22, + "content": " F y[Tmax], z[Tmax], zexp[Tmax];\n F gw = 0, gu = 0;\n F p = 0, q = 0;\n F dpdw = 0, dqdw = 0;\n F o = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n F no = max(o, k[ii] + u);\n F A = exp(o - no);\n F B = exp(k[ii] + u - no);\n F num = A * p + B * v[ii];\n F iden = 1 / (A * q + B);\n y[i] = num * iden;\n z[i] = iden;\n zexp[i] = k[ii] + u - no;\n gw += gy[ii] * (dpdw - dqdw * y[i]) * iden * A;\n gu += gy[ii] * (v[ii] - y[i]) * B * iden;\n no = max(w + o, k[ii]);\n A = exp(w + o - no);\n B = exp(k[ii] - no);\n dpdw = A * (p + dpdw);\n dqdw = A * (q + dqdw);\n p = A * p + B * v[ii];\n q = A * q + B;\n o = no;\n }\n F gp = 0, gq = 0;\n o = MIN_VALUE;\n for (int i = T - 1; i >= 0; i--) {\n const int ii = i * C;\n F A = gy[ii] * z[i] * exp(zexp[i]);\n F B = exp(k[ii] + o);\n gk[ii] = A * (v[ii] - y[i]) + B * (gp * v[ii] + gq);\n gv[ii] = A + B * gp;", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_cuda.cu:58-97" + }, + "415": { + "file_id": 22, + "content": "This code calculates the gradients of model parameters by iterating through a dataset, updating intermediate variables, and storing gradients in gk and gv arrays. 
It uses elementwise multiply-accumulate and exponential operations.", + "type": "comment" + }, + "416": { + "file_id": 22, + "content": " F no = max(w + o, zexp[i] - k[ii] - u);\n A = exp(w + o - no);\n B = gy[ii] * z[i] * exp(zexp[i] - k[ii] - u - no);\n gp = A * gp + B;\n gq = A * gq - B * y[i];\n o = no;\n }\n // Multiply by w because the w -> -exp(w) preprocessing is halfway in the backwards pass, even though it's not in the forward pass\n const int _offsetBC = _b * C + _c;\n _gw[_offsetBC] += gw * _w[_c];\n _gu[_offsetBC] += gu;\n}\nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_forward<< >>(B, T, C, w, u, k, v, y);\n}\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *gy, float *gw, float *gu, float *gk, float *gv) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_cuda.cu:99-121" + }, + "417": { + "file_id": 22, + "content": "This code is a part of the RWKV-v4 library, specifically the cuda_forward and cuda_backward functions for CUDA implementation. It involves setting up blocks and threads based on the given dimensions B, T, and C. The kernels kernel_forward and kernel_backward are launched with appropriate block and thread configurations to perform matrix operations in parallel. 
This code utilizes an optimal number of registers and asserts that B * C is divisible by the chosen number of threads per block for efficient execution.", + "type": "comment" + }, + "418": { + "file_id": 22, + "content": " assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_backward<< >>(B, T, C, w, u, k, v, gy, gw, gu, gk, gv);\n}", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_cuda.cu:122-125" + }, + "419": { + "file_id": 22, + "content": "This code ensures that the number of blocks is an integer multiple of threads per block, then launches a kernel function for backward computation. It uses B, C, and threadsPerBlock parameters to determine the number of blocks and threads per block for efficient GPU execution.", + "type": "comment" + }, + "420": { + "file_id": 23, + "content": "/RWKV-v4/cuda/wkv_op.cpp", + "type": "filepath" + }, + "421": { + "file_id": 23, + "content": "The code defines C++ functions for the forward and backward passes of a WKV model in RWKV-LM/RWKV-v4, utilizing PyTorch's torch::Tensor class and CUDA kernels on GPU.", + "type": "summary" + }, + "422": { + "file_id": 23, + "content": "#include \nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y);\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *gy, float *gw, float *gu, float *gk, float *gv);\nvoid forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {\n cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {\n cuda_backward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), gy.data_ptr (), gw.data_ptr (), gu.data_ptr (), 
gk.data_ptr (), gv.data_ptr ());\n}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n    m.def(\"forward\", &forward, \"wkv forward\");", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_op.cpp:1-14" + }, + "423": { + "file_id": 23, + "content": "This code defines a C++ function for the forward pass of the RWKV WKV operator, and a backward pass for gradient calculation. It uses the torch::Tensor class from PyTorch library to handle tensors and wraps the CUDA kernels with torch::extension to enable GPU computation.", + "type": "comment" + }, + "424": { + "file_id": 23, + "content": " m.def(\"backward\", &backward, \"wkv backward\");\n}\nTORCH_LIBRARY(wkv, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}", + "type": "code", + "location": "/RWKV-v4/cuda/wkv_op.cpp:15-21" + }, + "425": { + "file_id": 23, + "content": "The code is defining C++ functions for the forward and backward passes of the WKV (weighted key-value) operator of the RWKV model in the RWKV-LM/RWKV-v4/cuda/wkv_op.cpp file, which are then exported as bindings through Torch library with names \"forward\" and \"backward\".", + "type": "comment" + }, + "426": { + "file_id": 24, + "content": "/RWKV-v4/run.py", + "type": "filepath" + }, + "427": { + "file_id": 24, + "content": "This code utilizes the RWKV Language Model for text generation, setting up parameters and initializing the model. 
It tokenizes prompts, performs multiple trials, calculates logits, handles special cases, prints tokens, and saves/loads model states.", + "type": "summary" + }, + "428": { + "file_id": 24, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math, os\nimport time\nimport types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nfrom src.utils import TOKENIZER, Dataset\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\n########################################################################################################\n# Step 1: set model\n# \n# Set TOKEN_MODE to 'char' or 'bpe' if the model is trained by 'train.py' from scratch.\n#\n# Set TOKEN_MODE to 'pile' if you want to test pre-trained pile models.\n########################################################################################################\nTOKEN_MODE = 'char' # char / bpe / pile", + "type": "code", + "location": "/RWKV-v4/run.py:1-26" + }, + "429": { + "file_id": 24, + "content": "This code is importing necessary libraries and modules for the RWKV Language Model. It sets the TOKEN_MODE variable to either 'char', 'bpe', or 'pile' depending on whether the model is trained from scratch, pre-trained pile models are being tested, etc. 
This helps set up the appropriate configuration for running the model.", + "type": "comment" + }, + "430": { + "file_id": 24, + "content": "n_layer = 6\nn_embd = 512\nctx_len = 1024\nif TOKEN_MODE == 'char':\n MODEL_NAME = 'trained-500' # your trained model\n WORD_NAME = 'vocab' # the .json vocab (generated by train.py)\n # set UNKNOWN_CHAR to the rarest token in your vocab.json, and all unknown tokens in your prompt will be denoted by it\n UNKNOWN_CHAR = ' ' # here we just set it to ' ' for simplicity\nelif TOKEN_MODE == 'bpe':\n MODEL_NAME = 'trained-500' # your trained model\n WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge] for your BPE model\n UNKNOWN_CHAR = None\nelif TOKEN_MODE == 'pile':\n WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']\n UNKNOWN_CHAR = None\n #---> you can set MODEL_NAME to your fine-tuned model <---\n MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'\n # MODEL_NAME = 'trained-11'\n n_layer = 12\n n_embd = 768\n ctx_len = 1024\n # MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'\n # n_layer = 24\n # n_embd = 1024\n # ctx_len = 1024\n # MODEL_NAME = 'RWKV-4-Pile-1B5-20220903-8040'", + "type": "code", + "location": "/RWKV-v4/run.py:28-60" + }, + "431": { + "file_id": 24, + "content": "This code is responsible for setting the necessary parameters and model name based on the tokenization mode. The modes include character, byte-pair encoding (BPE), and Pile. If using a pre-trained RWKV model, it allows specifying a fine-tuned model for better performance. Different RWKV model sizes (layer count and embedding width) are selected depending on the mode. 
The parameters n_layer, n_embd, and ctx_len define the number of layers, embedding dimensions, and context length respectively for the chosen model.", + "type": "comment" + }, + "432": { + "file_id": 24, + "content": " # n_layer = 24\n # n_embd = 2048\n # ctx_len = 1024 \nos.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment)\nos.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'\n########################################################################################################\n# Step 2: set prompt & sampling stuffs\n########################################################################################################\n# context = 'A'\n# context = \"\\nIn the\"\n# context = '\\nSugar:'\ncontext = '\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 333\nTEMPERATURE = 1.0\ntop_p = 0.7\ntop_p_newline = 0.9 # only used in TOKEN_MODE = char\nDEBUG_DEBUG = False # True False --> show softmax output\n####", + "type": "code", + "location": "/RWKV-v4/run.py:61-87" + }, + "433": { + "file_id": 24, + "content": "The code is setting the environment variables for the RWKV model, specifying the model type, and providing the prompt and sampling parameters. The model is currently using fp32 (floating-point arithmetic) and running on the CPU. The prompt provided is a text describing the discovery of Chinese-speaking dragons in Tibet. 
The code will perform 999 trials, each producing output of length 333, with temperature set to 1.0 for sampling and top_p set to 0.7 and 0.9 for softmax output.", + "type": "comment" + }, + "434": { + "file_id": 24, + "content": "####################################################################################################\nprint(f'Loading {MODEL_NAME}...')\nfrom src.model_run import RWKV_RNN\nmodel = RWKV_RNN(MODEL_NAME, os.environ['RWKV_RUN_DEVICE'], model_type, n_layer, n_embd, ctx_len)\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\n########################################################################################################\nif tokenizer.charMode:\n context = tokenizer.refine_context(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nelse:\n ctx = tokenizer.tokenizer.encode(context)\nsrc_len = len(ctx)\nsrc_ctx = ctx.copy()\nprint('\\nYour prompt has ' + str(src_len) + ' tokens.')\nprint('\\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. 
<--\\n')\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n t_begin = time.time_ns()\n print(('-' * 30) + context, end='')", + "type": "code", + "location": "/RWKV-v4/run.py:87-109" + }, + "435": { + "file_id": 24, + "content": "Loading the specified RWKV model, creating a tokenizer for text processing, refining context if in character mode, converting context to tokens, printing the number of tokens in prompt, informing about the time taken to process long prompts with RNN or using GPT for faster speed, and starting a loop for multiple trials.", + "type": "comment" + }, + "436": { + "file_id": 24, + "content": " ctx = src_ctx.copy()\n model.clear()\n if TRIAL == 0:\n init_state = types.SimpleNamespace()\n for i in range(src_len):\n x = ctx[:i+1]\n if i == src_len - 1:\n init_state.out = model.run(x)\n else:\n model.run(x)\n model.save(init_state)\n else:\n model.load(init_state)\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[:i+1]\n x = x[-ctx_len:]\n if i == src_len:\n out = copy.deepcopy(init_state.out)\n else:\n out = model.run(x)\n if DEBUG_DEBUG:\n print('model', np.array(x), '==>', np.array(\n out), np.max(out), np.min(out))\n if TOKEN_MODE == 'pile':\n out[0] = -999999999 # disable <|endoftext|>\n char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,\n top_p_usual=top_p, top_p_newline=top_p_newline)\n char = char.item()\n if tokenizer.charMode:", + "type": "code", + "location": "/RWKV-v4/run.py:110-142" + }, + "437": { + "file_id": 24, + "content": "This code initializes a model and its state, then generates text based on the input sequence. It saves and loads the model's state for subsequent trials, calculates logits for character sampling, and handles special cases like disabling <|endoftext|>. 
The DEBUG_DEBUG print statement displays output statistics if enabled.", + "type": "comment" + }, + "438": { + "file_id": 24, + "content": " print(tokenizer.itos[int(char)], end='', flush=True)\n else:\n print(tokenizer.tokenizer.decode(int(char)), end='', flush=True)\n ctx += [char]\n t_end = time.time_ns()\n print(\"\\n----------\", round((t_end - t_begin) / (10 ** 9), 2), end='s ')", + "type": "code", + "location": "/RWKV-v4/run.py:143-149" + }, + "439": { + "file_id": 24, + "content": "The code is printing each token in a sequence using either the integer representation or the decoded version from the tokenizer, and storing each character in the ctx list. It also measures the time taken for this process and prints it in seconds at the end.", + "type": "comment" + }, + "440": { + "file_id": 25, + "content": "/RWKV-v4/src/binidx.py", + "type": "filepath" + }, + "441": { + "file_id": 25, + "content": "The code defines a class \"MMapIndexedDataset\" with indexing methods and binary indexing functions for datasets, loading binary data from file paths using numpy's memmap. 
It supports setting document index and checking file existence.", + "type": "summary" + }, + "442": { + "file_id": 25, + "content": "from lib2to3.pgen2 import token\nimport os\nimport torch\nimport numpy as np\nimport shutil\nimport struct\nfrom functools import lru_cache\nfrom itertools import accumulate\ndef print_rank_0(*message):\n \"\"\"If distributed is initialized print only on rank 0.\"\"\"\n if torch.distributed.is_initialized():\n if torch.distributed.get_rank() == 0:\n print(*message, flush=True)\n else:\n print(*message, flush=True)\ndef _warmup_mmap_file(path):\n pass\n # with open(path, \"rb\") as stream:\n # while stream.read(100 * 1024 * 1024):\n # pass\ndtypes = {\n 1: np.uint8,\n 2: np.int8,\n 3: np.int16,\n 4: np.int32,\n 5: np.int64,\n 6: float,\n 7: np.double,\n 8: np.uint16,\n}\ndef code(dtype):\n for k in dtypes.keys():\n if dtypes[k] == dtype:\n return k\n raise ValueError(dtype)\ndef index_file_path(prefix_path):\n return prefix_path + \".idx\"\ndef data_file_path(prefix_path):\n return prefix_path + \".bin\"\nclass MMapIndexedDataset(torch.utils.data.Dataset):\n class Index(object):", + "type": "code", + "location": "/RWKV-v4/src/binidx.py:1-48" + }, + "443": { + "file_id": 25, + "content": "This code is importing necessary libraries and defining functions for indexed datasets. It defines a class \"MMapIndexedDataset\" with an inner class \"Index\". The file includes utility functions like _warmup_mmap_file, print_rank_0, index_file_path, data_file_path, and code which handle reading and manipulating binary data from files. It also defines dtypes dictionary mapping numerical types to their respective codes.", + "type": "comment" + }, + "444": { + "file_id": 25, + "content": " _HDR_MAGIC = b\"MMIDIDX\\x00\\x00\"\n def __init__(self, path, skip_warmup=False):\n with open(path, \"rb\") as stream:\n magic_test = stream.read(9)\n assert self._HDR_MAGIC == magic_test, (\n \"Index file doesn't match expected format. 
\"\n \"Make sure that --dataset-impl is configured properly.\"\n )\n # Little endian unsigned 64 Bit integer\n version = struct.unpack(\" first run, init model params (very slow for large models) <--]\")\n print(\"[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\\n\")\n for mm in model.modules():\n if \"RecursiveScriptModule\" in str(type(mm)):\n if mm.original_name not in [\"Linear\"]:\n continue\n ww = None\n for name, param in mm.named_parameters():\n if name == \"weight\":\n ww = param\n else:\n m = mm\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n ww = m.weight\n with torch.no_grad():\n name = \"[unknown weight]\"\n for name, parameter in model.named_parameters(): # find the name of the weight", + "type": "code", + "location": "/RWKV-v4/src/model.py:104-126" + }, + "467": { + "file_id": 26, + "content": "This code initializes all linear and embedding layers in a model using fancy initialization. This is done by iterating through all modules of the model, skipping non-linear and non-embedding layers. It finds the weight parameters for these layers and performs some operations to initialize them, including finding their names. 
The code also provides some information about the process, such as it being slow for large models and needing to be run on a single GPU before loading onto others.", + "type": "comment" + }, + "468": { + "file_id": 26, + "content": " if id(ww) == id(parameter):\n break\n shape = ww.shape\n gain = 1.0\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == args.vocab_size and shape[1] == args.n_embd: # token emb?\n scale = 1e-4\n else:\n scale = 0\n if isinstance(m, nn.Linear):\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == args.vocab_size and shape[1] == args.n_embd: # final projection?\n scale = 0.5\n if hasattr(m, \"scale_init\"):\n scale = m.scale_init\n # print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {str(scale).ljust(4)} {name}\")\n gain *= scale\n if scale == -999:\n nn.init.eye_(ww)\n elif gain == 0:\n # zero init is great for some RWKV matrices", + "type": "code", + "location": "/RWKV-v4/src/model.py:127-156" + }, + "469": { + "file_id": 26, + "content": "This code is adjusting the weight matrix (`ww`) initializer of various neural network layers based on their shapes and types. It sets the gain and scale factors accordingly to optimize the model's performance. If `scale` is -999, it initializes with eye initialization. If `gain` is 0, it uses zero initialization. 
This process helps in setting up the weight matrices efficiently for RWKV models.", + "type": "comment" + }, + "470": { + "file_id": 26, + "content": " nn.init.zeros_(ww)\n elif gain > 0:\n nn.init.orthogonal_(ww, gain=gain)\n else:\n nn.init.normal_(ww, mean=0.0, std=-scale)\nclass RWKV_TimeMix(torch.jit.ScriptModule):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_embd = config.n_embd\n attn_sz = config.n_embd\n with torch.no_grad(): # fancy init\n ratio_0_to_1 = (layer_id / (config.n_layer - 1)) # 0 to 1\n ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n # fancy time_decay\n decay_speed = torch.ones(attn_sz)\n for h in range(attn_sz):\n decay_speed[h] = -5 + 8 * (h / (attn_sz-1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n # fancy time_first", + "type": "code", + "location": "/RWKV-v4/src/model.py:157-184" + }, + "471": { + "file_id": 26, + "content": "This code is initializing a layer of the RWKV_TimeMix model with fancy initialization for time decay parameter. 
It calculates the time decay speed based on the current layer and attenuation size, and assigns it to the `self.time_decay` parameter in the class.", + "type": "comment" + }, + "472": { + "file_id": 26, + "content": " zigzag = (torch.tensor([(i+1)%3 - 1 for i in range(attn_sz)]) * 0.5)\n self.time_first = nn.Parameter(torch.ones(attn_sz) * math.log(0.3) + zigzag)\n # fancy time_mix\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(x, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(x, 0.5 * ratio_1_to_almost0))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.key = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.value = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.output = nn.Linear(attn_sz, config.n_embd, bias=False)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0", + "type": "code", + "location": "/RWKV-v4/src/model.py:185-206" + }, + "473": { + "file_id": 26, + "content": "The code above initializes various layers for an attention mechanism in a transformer model. It defines parameters for time-based mixing, shifting, and linear transformations for keys, values, and output. 
The key, value, and output layers are initialized with zero scaling.", + "type": "comment" + }, + "474": { + "file_id": 26, + "content": " @torch.jit.script_method\n def jit_func(self, x):\n # Mix x with the previous timestep to produce xk, xv, xr\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n # Use xk, xv, xr to produce k, v, r\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n sr = torch.sigmoid(r)\n return sr, k, v\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n sr, k, v = self.jit_func(x)\n rwkv = sr * RUN_CUDA(B, T, C, self.time_decay, self.time_first, k, v)\n rwkv = self.output(rwkv)\n return rwkv\nclass RWKV_ChannelMix(torch.jit.ScriptModule):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix", + "type": "code", + "location": "/RWKV-v4/src/model.py:208-242" + }, + "475": { + "file_id": 26, + "content": "The code defines a model that performs channel-wise mixing and applies a time shift. The jit_func method takes an input tensor x, mixes it with the previous timestep to produce xk, xv, xr, and then calculates k, v, r using these mixed tensors. The forward method takes an input tensor x, calls the jit_func to obtain sr, k, v, applies a time decay, and returns the output from an output layer. 
RWKV_ChannelMix is a ScriptModule class that initializes the time shift and uses fancy initialization for time_mix.", + "type": "comment" + }, + "476": { + "file_id": 26, + "content": " ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n hidden_sz = 4 * config.n_embd\n self.key = nn.Linear(config.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, config.n_embd, bias=False)\n self.value.scale_init = 0\n self.receptance.scale_init = 0\n @torch.jit.script_method\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv", + "type": "code", + "location": "/RWKV-v4/src/model.py:243-270" + }, + "477": { + "file_id": 26, + "content": "Code snippet initializes two parameters for time-mixing, sets hidden size and defines linear layers for key, receptance, and value in a transformer model. 
The forward method applies time-shifting, mixing, passing through key and value layers, and calculates the final output using sigmoid activation and multiplication.", + "type": "comment" + }, + "478": { + "file_id": 26, + "content": " return rkv\n########################################################################################################\n# The GPT Model with our blocks\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):\n self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(config, 0)\n else:\n self.att = RWKV_TimeMix(config, layer_id)\n self.ffn = RWKV_ChannelMix(config, layer_id)", + "type": "code", + "location": "/RWKV-v4/src/model.py:271-303" + }, + "479": { + "file_id": 26, + "content": "The code defines a GPT model with customizable blocks and config parameters. The GPTConfig class holds the vocabulary size, context length, and additional keyword-value pairs. 
The Block class is a module for these customizable blocks, using layer normalization, attention, and feed-forward layers, depending on the block type and position.", + "type": "comment" + }, + "480": { + "file_id": 26, + "content": " def forward(self, x):\n if self.layer_id == 0:\n x = self.ln0(x) \n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x)) # better in some cases\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.step = 0\n self.config = config\n self.emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i)\n for i in range(config.n_layer)])\n self.ln_out = nn.LayerNorm(config.n_embd)\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0\n self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)", + "type": "code", + "location": "/RWKV-v4/src/model.py:305-333" + }, + "481": { + "file_id": 26, + "content": "This code defines a GPT model class with layer normalization, embedding, and multiple blocks. 
The forward function performs feed-forward and attention mechanisms, and the __init__ function initializes the model parameters based on the given configuration.", + "type": "comment" + }, + "482": { + "file_id": 26, + "content": " self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n try:\n if os.environ['RWKV_LOAD_MODEL'] == str(False):\n RWKV_Init(self, config) \n except:\n pass\n logger.info(\"number of parameters: %e\", sum(p.numel()\n for p in self.parameters()))\n def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, (nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=1e-5)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n no_decay = set()\n for mn, m in self.named_modules(): # here we disable weight_decay\n for pn, p in m.named_parameters():", + "type": "code", + "location": "/RWKV-v4/src/model.py:334-364" + }, + "483": { + "file_id": 26, + "content": "This code initializes the model, sets parameters such as head_k scale and copy mask, checks if the model should be loaded, and logs the number of parameters. 
It also defines methods for getting ctx_len, initializing weights with specific values, and configuring optimizers.", + "type": "comment" + }, + "484": { + "file_id": 26, + "content": " fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n no_decay.add(fpn)\n param_dict = {pn: p for pn, p in self.named_parameters()}\n optim_groups = [\n {\"params\": [param_dict[pn]\n for pn in sorted(list(no_decay))], \"weight_decay\": 0.0},\n ]\n try:\n optimizer = FusedAdam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)\n except:\n print('\\n\\nDeepSpeed not found. Using torch optimizer instead (probably slower)\\n\\n')\n optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n idx = idx.to(self.emb.weight.device)\n self.step += 1\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"", + "type": "code", + "location": "/RWKV-v4/src/model.py:365-387" + }, + "485": { + "file_id": 26, + "content": "This code initializes an optimizer for a model, either using DeepSpeed's FusedAdam if available or falling back to torch.optim.Adam. It defines the full param name and creates optimization groups based on whether or not weight decay should be applied. 
The forward function performs forward pass of the model and asserts that the input length is less than or equal to the context length.", + "type": "comment" + }, + "486": { + "file_id": 26, + "content": " x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).bfloat16()\n x = self.head(x) + c\n else:\n x = self.head(x)\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.to(x.device).view(-1))\n return L2Wrap.apply(loss, x)", + "type": "code", + "location": "/RWKV-v4/src/model.py:389-414" + }, + "487": { + "file_id": 26, + "content": "This code calculates the attention scores using head layers, and then applies them to the output. If RWKV_HEAD_QK_DIM is greater than 0, it performs multi-head attention by computing the attention scores and scaling them based on the number of heads. Then, it adds a one-hot vector to the output depending on the RWKV_FLOAT_MODE environment variable, and finally calculates the cross-entropy loss between the output and targets (if provided), and returns the L2Wrap applied output and the loss.", + "type": "comment" + }, + "488": { + "file_id": 27, + "content": "/RWKV-v4/src/model_run.py", + "type": "filepath" + }, + "489": { + "file_id": 27, + "content": "The RWKV Language Model employs CUDA support, time-shifted operations, and forward passes for efficient execution. 
It trains a transformer model with head-multihead attention using RWKV-v4, organizes weights in namespaced structure, and performs layer normalization, feed-forward operations, and applies layers like LN and FFN.", + "type": "summary" + }, + "490": { + "file_id": 27, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport types\nimport copy\nimport torch\nimport math, os\nfrom torch.nn import functional as F\nimport torch.nn as nn\nRWKV_HEAD_QK_DIM = 0\nprint(f'\\nRWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\\n')\nDEBUG_TIME = False # True False - show trained time-coeffs\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nif os.environ['RWKV_RUN_DEVICE'] == 'cuda':\n T_MAX = 1024 # increase this if your ctx_len is long [NOTE: TAKES LOTS OF VRAM!]\n # it's possible to go beyond CUDA limitations if you slice the ctx and pass the hidden state in each slice\n from torch.utils.cpp_extension import load", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:1-25" + }, + "491": { + "file_id": 27, + "content": "This code snippet is from the RWKV Language Model. It begins with importing necessary libraries and defines some constants like RWKV_HEAD_QK_DIM, DEBUG_TIME, and T_MAX. The code checks if the execution environment is CUDA and imports a CUDA kernel if it is. 
This language model is designed to perform natural language processing tasks with potential VRAM usage limitations.", + "type": "comment" + }, + "492": { + "file_id": 27, + "content": " wkv_cuda = load(name=\"wkv\", sources=[\"cuda/wkv_op.cpp\", \"cuda/wkv_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['-res-usage', '--maxrregcount 60', '--use_fast_math', '-O3', '-Xptxas -O3', f'-DTmax={T_MAX}'])\n class WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C\n assert T <= T_MAX\n assert B * C % min(C, 1024) == 0\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n w = -torch.exp(w.contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()\n else:\n w = -torch.exp(w.float().contiguous())\n u = u.float().contiguous()\n k = k.float().contiguous()\n v = v.float().contiguous()\n ctx.save_for_backward(w, u, k, v)\n y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:26-48" + }, + "493": { + "file_id": 27, + "content": "This code initializes a WKV object, loads the model from CUDA, and handles float mode conversion. It asserts certain conditions, such as T being less than or equal to a maximum value, and B*C being divisible by min(C, 1024). The code also checks the environment variable 'RWKV_FLOAT_MODE' and converts the float types accordingly for compatibility. 
The model parameters are stored in the ctx object for backward propagation.", + "type": "comment" + }, + "494": { + "file_id": 27, + "content": " wkv_cuda.forward(B, T, C, w, u, k, v, y)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n return y\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n return y.half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n return y.bfloat16()\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 1024) == 0\n w, u, k, v = ctx.saved_tensors\n gw = torch.zeros((B, C), device='cuda').contiguous()\n gu = torch.zeros((B, C), device='cuda').contiguous()\n gk = torch.zeros((B, T, C), device='cuda').contiguous()\n gv = torch.zeros((B, T, C), device='cuda').contiguous()\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)\n else:\n wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:49-72" + }, + "495": { + "file_id": 27, + "content": "This code defines a function `forward` for a model that performs operations on input tensors B, T, C, w, u, k, and v. It also includes a backward function for gradient calculations using saved tensors. The `forward` function returns the output y, which is modified based on the environment variable RWKV_FLOAT_MODE. 
The backward function performs gradient calculations based on the input tensor gy and saves gradients in gw, gu, gk, and gv.", + "type": "comment" + }, + "496": { + "file_id": 27, + "content": " gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n return (None, None, None, gw, gu, gk, gv)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())\n def RUN_CUDA(B, T, C, w, u, k, v):\n return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())\n############################################################################################################\nRWKV_CFG = types.SimpleNamespace()\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.time_mix_k = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))\n self.time_mix_r = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:73-96" + }, + "497": { + "file_id": 27, + "content": "This code snippet is part of a model training and inference process. It defines a function `RUN_CUDA` for running the model on CUDA devices, and initializes a module `RWKV_ChannelMix`. The code also sets up various tensor operations such as summations, and environment variable checks for floating point precision modes. 
The module is part of the RWKV language model framework.", + "type": "comment" + }, + "498": { + "file_id": 27, + "content": " hidden_sz = 4 * RWKV_CFG.n_embd\n self.key = nn.Linear(RWKV_CFG.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv\n return rkv\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_decay = nn.Parameter(torch.ones(RWKV_CFG.n_embd))\n self.time_first = nn.Parameter(torch.ones(RWKV_CFG.n_embd) * math.log(0.3))\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.time_mix_k = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.time_mix_v = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:98-124" + }, + "499": { + "file_id": 27, + "content": "Class \"RWKV_TimeMix\" initializes with layer id, and contains parameters for time decay, first position correction, shift operation, key mix, and value mix. 
The forward function applies time shifting, mixes with key and value mix parameters, applies square and relu operations on keys, multiplies by sigmoid-transformed values, and returns the result.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/5.json b/docs/data/5.json new file mode 100644 index 00000000..977e238d --- /dev/null +++ b/docs/data/5.json @@ -0,0 +1,547 @@ +{ + "500": { + "file_id": 27, + "content": " self.time_mix_r = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.key = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.output = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n rwkv = torch.sigmoid(r) * RUN_CUDA(B, T, C, self.time_decay, self.time_first, k, v)\n rwkv = self.output(rwkv)\n return rwkv\nclass Block(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(RWKV_CFG.n_embd)", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:125-155" + }, + "501": { + "file_id": 27, + "content": "The code initializes a module with time-based mixing parameters and applies a series of linear transformations to the input. The forward function performs time-shifted operations, then multiplies with learned coefficients for key, value, and receptance components. The results are passed through a sigmoid activation, another layer normalization, and a final linear transformation before returning the final output. 
This block is part of the RWKV model implementation.", + "type": "comment" + }, + "502": { + "file_id": 27, + "content": " self.ln2 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(layer_id+1000)\n else:\n self.att = RWKV_TimeMix(layer_id)\n self.ffn = RWKV_ChannelMix(layer_id)\n def forward(self, x):\n if self.layer_id == 0:\n x = self.ln0(x)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n return x\nclass RWKV_GPT(nn.Module):\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, ctx_len):\n global RWKV_CFG\n super().__init__()\n RWKV_CFG.RUN_DEVICE = RUN_DEVICE\n RWKV_CFG.model_type = model_type\n RWKV_CFG.vocab_size = vocab_size\n RWKV_CFG.n_layer = n_layer\n RWKV_CFG.n_embd = n_embd", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:156-186" + }, + "503": { + "file_id": 27, + "content": "This code initializes a RWKV-GPT model instance with specified configuration, and includes layer normalization and different forward pass depending on the layer ID and model type. 
The forward method performs layer normalization and adds the outputs of specific layers or modules, resulting in the final output.", + "type": "comment" + }, + "504": { + "file_id": 27, + "content": " RWKV_CFG.ctx_len = ctx_len\n print('\\nloading RWKV-GPT', MODEL_NAME)\n self.emb = nn.Embedding(vocab_size, n_embd)\n self.blocks = nn.Sequential(*[Block(i) for i in range(n_layer)])\n self.ln_out = nn.LayerNorm(n_embd)\n self.head = nn.Linear(n_embd, vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0\n self.head_k = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(ctx_len, ctx_len)))\n self.ctx_len = ctx_len\n self.eval()\n self.load_state_dict(torch.load(MODEL_NAME + '.pth'))\n self.eval()\n def forward(self, idx):\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:187-217" + }, + "505": { + "file_id": 27, + "content": "This code initializes an RWKV model, sets its context length, and loads the state dictionary from a saved file. It also includes optional head layers for QK vectors and a copy mechanism. 
The forward pass performs embedding, passes through blocks, applies layer normalization, and returns the output.", + "type": "comment" + }, + "506": { + "file_id": 27, + "content": " if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).bfloat16()\n x = self.head(x) + c\n else:\n x = self.head(x) \n return x\n############################################################################################################\nclass RWKV_RNN(): # this is running in FP32 at this moment\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len):\n self.RUN_DEVICE = RUN_DEVICE", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:219-242" + }, + "507": { + "file_id": 27, + "content": "This code segment is part of a model training process. It performs a head-multihead attention mechanism and then applies the result to the input. If RWKV_HEAD_QK_DIM is greater than 0, it calculates and applies the attention matrix, otherwise it simply passes through the input. The floating point mode is set based on the environment variable, with options for FP32, FP16, or BF16 precision. 
The class RWKV_RNN initializes a model with specified parameters like device, model type, number of layers, embedding size, and context length.", + "type": "comment" + }, + "508": { + "file_id": 27, + "content": " self.model_type = model_type\n self.n_layer = n_layer\n self.n_embd = n_embd\n self.ctx_len = ctx_len\n self.w = types.SimpleNamespace()\n w = torch.load(MODEL_NAME + '.pth',\n map_location=torch.device(RUN_DEVICE))\n for x in w.keys():\n w[x] = w[x].float()\n if '.time_' in x:\n w[x] = w[x].squeeze()\n if '.time_decay' in x:\n w[x] = -torch.exp(w[x])\n if DEBUG_TIME and '.time_' in x:\n print(x, w[x].squeeze().cpu().numpy())\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):\n if xx[i].isdigit():\n ii = int(xx[i])\n if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:243-272" + }, + "509": { + "file_id": 27, + "content": "This code loads a pre-trained RWKV-v4 model from a file and assigns the weights to the corresponding layers. It also performs some processing on time-related parameters, such as squeezing them and applying exponential decay. 
The code then organizes the loaded weights into a nested namespace structure based on their names, following a hierarchical pattern with layer numbers and layer types.", + "type": "comment" + }, + "510": { + "file_id": 27, + "content": " if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.clear()\n def clear(self):\n self.xx = {}\n self.aa = {}\n self.bb = {}\n self.pp = {}\n self.hk = None\n def save(self, target):\n target.xx = copy.deepcopy(self.xx)\n target.aa = copy.deepcopy(self.aa)\n target.bb = copy.deepcopy(self.bb)\n target.pp = copy.deepcopy(self.pp)\n target.hk = copy.deepcopy(self.hk)\n def load(self, target):\n self.xx = copy.deepcopy(target.xx)\n self.aa = copy.deepcopy(target.aa)\n self.bb = copy.deepcopy(target.bb)\n self.pp = copy.deepcopy(target.pp)\n self.hk = copy.deepcopy(target.hk)\n def LN(self, xx, w):\n return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)\n def FF(self, xx, w, name):\n if name not in self.xx:", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:273-306" + }, + "511": { + "file_id": 27, + "content": "This code represents a class that can load, clear, and save various variables (xx, aa, bb, pp). It also contains methods to perform layer normalization (LN) and feed-forward operations (FF). The code uses the `getattr` function to dynamically access attributes based on input, and it initializes certain attributes as SimpleNamespace or empty dictionaries. 
The `clear`, `save`, and `load` functions are used to manage the state of the class variables.", + "type": "comment" + }, + "512": { + "file_id": 27, + "content": " self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)\n k = torch.square(torch.relu(w.key.weight @ xk))\n kv = w.value.weight @ k\n return r * kv\n def SA(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.pp[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) - 1e30\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xv = xx * w.time_mix_v + self.xx[name] * (1 - w.time_mix_v)\n xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:307-330" + }, + "513": { + "file_id": 27, + "content": "Code is a part of an attention mechanism in a transformer model. It calculates the key, value and returns a weighted sum. 
The SA function initializes variables for each name.", + "type": "comment" + }, + "514": { + "file_id": 27, + "content": " k = w.key.weight @ xk\n v = w.value.weight @ xv\n pp = self.pp[name]\n aa = self.aa[name]\n bb = self.bb[name]\n ww = w.time_first + k\n p = torch.maximum(pp, ww)\n e1 = torch.exp(pp - p)\n e2 = torch.exp(ww - p)\n a = e1 * aa + e2 * v\n b = e1 * bb + e2\n ww = pp + w.time_decay\n p = torch.maximum(ww, k)\n e1 = torch.exp(ww - p)\n e2 = torch.exp(k - p)\n self.aa[name] = e1 * aa + e2 * v\n self.bb[name] = e1 * bb + e2\n self.pp[name] = p\n rwkv = r * a / b\n return w.output.weight @ rwkv\n def run(self, ctx):\n w = self.w\n x = w.emb.weight[ctx[-1]]\n for i in range(self.n_layer):\n if i == 0:\n x = self.LN(x, w.blocks[i].ln0)\n if i == 0 and self.model_type == 'RWKV-ffnPre':\n x = x + self.FF(self.LN(x, w.blocks[i].ln1), w.blocks[i].ffnPre, f'ffnPre.{i}')\n else:\n x = x + self.SA(self.LN(x, w.blocks[i].ln1), w.blocks[i].att, f'att.{i}')", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:332-366" + }, + "515": { + "file_id": 27, + "content": "Function defines the operation of a RWKV model. It applies layers like LN, FFN, and SA in a loop to transform input x. The function uses variables pp, aa, bb, and ww for intermediate calculations related to time-decaying weights and exponential operations. 
Output is the weighted sum of input x transformed by the applied layers.", + "type": "comment" + }, + "516": { + "file_id": 27, + "content": " x = x + self.FF(self.LN(x, w.blocks[i].ln2), w.blocks[i].ffn, f'ffn.{i}')\n x = self.LN(x, w.ln_out)\n if RWKV_HEAD_QK_DIM > 0:\n if self.hk == None:\n self.hk = (w.head_k.weight @ x).unsqueeze(0)\n else:\n self.hk = torch.cat(\n [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0)\n if self.hk.shape[0] > self.ctx_len:\n self.hk = self.hk[-self.ctx_len:, :]\n q = w.head_q.weight @ x\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n c = (self.hk @ q) / RWKV_HEAD_QK_DIM\n for i in range(len(c)):\n x[ctx[i]] += c[i]\n else:\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n return x", + "type": "code", + "location": "/RWKV-v4/src/model_run.py:367-392" + }, + "517": { + "file_id": 27, + "content": "This code applies a feed-forward network (FFN) and layer normalization (LN) to the input 'x' and updates it based on the context length ('ctx_len'). It also handles the case when RWKV_HEAD_QK_DIM is greater than 0, calculating the head key matrix ('hk') and updating 'x' accordingly. 
Finally, it returns the updated 'x'.", + "type": "comment" + }, + "518": { + "file_id": 28, + "content": "/RWKV-v4/src/trainer.py", + "type": "filepath" + }, + "519": { + "file_id": 28, + "content": "The code establishes the RWKV Language Model trainer, trains a model with specified parameters, updates optimizer, handles multi-GPU, averages losses, logs progress, and saves states at each epoch.", + "type": "summary" + }, + "520": { + "file_id": 28, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\nNUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])\nUSE_WANDB = (int(os.environ['USE_WANDB']) == 1)\nfrom torch.utils.data.dataloader import DataLoader\nimport torch\nfrom tqdm.auto import tqdm\nimport logging\nimport datetime\nimport math\nfrom pytorch_lightning.lite import LightningLite\nimport gc\nlogger = logging.getLogger(__name__)\ntorch.backends.cudnn.benchmark = True\nif os.environ['RWKV_FLOAT_MODE'] == 'fp32':\n torch.backends.cudnn.allow_tf32 = False\n torch.backends.cuda.matmul.allow_tf32 = False\nelse:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\nclass TrainerConfig:\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0\n warmup_tokens = 0", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:1-34" + }, + "521": { + "file_id": 28, + "content": "This code sets up the trainer configuration for the RWKV Language Model. It defines the batch size, learning rate, optimizer parameters, gradient norm clip, and warmup tokens. 
The code also ensures proper CUDA backend configurations based on the environment variables.", + "type": "comment" + }, + "522": { + "file_id": 28, + "content": " final_tokens = 0\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k, v in kwargs.items():\n setattr(self, k, v)\nfrom src.model import GPT, GPTConfig\nclass Trainer(LightningLite):\n def get_run_name(self):\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \\\n cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd)\n return run_name\n def run(self, m_cfg, train_dataset, test_dataset, config):\n self.cuda_id = int(str(self.device).strip('cuda:'))\n print('[0]')\n model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=m_cfg.model_type,\n n_layer=m_cfg.n_layer, n_embd=m_cfg.n_embd))\n print('[1]')\n with torch.no_grad():\n if m_cfg.LOAD_MODEL:", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:35-63" + }, + "523": { + "file_id": 28, + "content": "The code defines a Trainer class that initializes various variables and contains methods for model training. 
The `get_run_name` method generates the run name based on the model's configuration, `run` method prepares the model, and in this snippet, it checks if a pre-trained model should be loaded.", + "type": "comment" + }, + "524": { + "file_id": 28, + "content": " print('loading', m_cfg.MODEL_NAME)\n m2 = torch.load(m_cfg.MODEL_NAME + '.pth', map_location='cpu')\n model.load_state_dict(m2)\n del m2\n model.to(self.device)\n self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.EPOCH_BEGIN = m_cfg.EPOCH_BEGIN\n self.steps = self.EPOCH_BEGIN * (len(self.train_dataset) // (config.batch_size // NUM_GPUS))\n if self.cuda_id == 0:\n log_file = open(\"mylog.txt\", \"a\")\n if USE_WANDB:\n print('logging to wandb... (comment it if you don\\'t have wandb)')\n import wandb # comment this if you don't have wandb\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:64-87" + }, + "525": { + "file_id": 28, + "content": "Loading model, transferring it to GPU, and initializing logging for training.\nThe code loads the model from a specified file path, transfers it to the device's GPU, and opens a log file if necessary. If WandB is enabled, it initializes WandB with project details and a unique run name based on the current date and time. 
The configuration is combined and saved without saving the code itself.", + "type": "comment" + }, + "526": { + "file_id": 28, + "content": " model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n model, optimizer = self.setup(model, optimizer)\n print('[3]')\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n data.idx_begin = self.steps * config.batch_size + 1\n data.cuda_id = self.cuda_id\n if config.num_workers > 0:\n loader = DataLoader(data, shuffle=False, pin_memory=True,\n batch_size=config.batch_size // NUM_GPUS,\n num_workers=config.num_workers)\n else:\n loader = DataLoader(data, shuffle=False,\n batch_size=config.batch_size // NUM_GPUS,\n num_workers=config.num_workers)", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:89-109" + }, + "527": { + "file_id": 28, + "content": "This code defines a function \"run_epoch\" that takes in a split (train or test) and performs the necessary configurations for training or testing. It sets the model to train mode if split is 'train'. Then, it assigns the corresponding dataset (train or test) to the variable data. The data's idx_begin is set to the current steps multiplied by config.batch_size + 1, and cuda_id is set to self.cuda_id. 
Finally, it creates a DataLoader for the dataset with specified batch size and number of workers based on config settings.", + "type": "comment" + }, + "528": { + "file_id": 28, + "content": " pbar = tqdm(enumerate(loader), total=len(\n loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n loader = self.setup_dataloaders(loader)\n gc.collect()\n torch.cuda.empty_cache()\n for it, (x, y) in pbar:\n with torch.set_grad_enabled(is_train):\n loss = model(x, y) # forward the model\n if os.environ['RWKV_DEEPSPEED'] == '0':\n all_loss = [loss.clone()]\n else:\n all_loss = [loss.clone() for _ in range(NUM_GPUS)]\n torch.distributed.all_gather(all_loss, loss)\n if is_train: # backprop and update the parameters\n model.zero_grad()\n self.backward(loss)\n # deepspeed will handle gradient_clipping\n optimizer.step()\n # decay the learning rate based on our progress\n self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:111-136" + }, + "529": { + "file_id": 28, + "content": "This code sets up a dataloader and trains an RWKV model. It iterates over the dataloader, forwards data through the model, calculates loss, and performs backpropagation if training. Depending on the DEEPSPEED environment variable, it handles all_gather for multi-GPU scenarios. 
Finally, it updates the optimizer and decay the learning rate based on progress.", + "type": "comment" + }, + "530": { + "file_id": 28, + "content": " lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + \\\n (1 - lr_final_factor) * float(self.tokens) / \\\n float(config.warmup_tokens)\n progress = 0\n else:\n # exponential learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))\n if progress >= 1:\n lr_mult = lr_final_factor\n else:\n lr_mult = math.exp(math.log(lr_final_factor) * pow(progress, 1))\n lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n self.lr = lr", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:137-156" + }, + "531": { + "file_id": 28, + "content": "This code determines the learning rate (lr) for training a model using the RWKV algorithm. It uses config parameters such as lr_final, learning_rate, warmup_tokens, and final_tokens to calculate the learning rate based on whether the current token count is in the warm-up phase or not. If in the warm-up phase (tokens < warmup_tokens), it performs linear interpolation. If past warm-up phase, it does exponential decay. 
The calculated lr is then applied to optimizer's param_groups and stored in self.lr.", + "type": "comment" + }, + "532": { + "file_id": 28, + "content": " self.steps += 1\n now_loss = 0\n for gg in range(NUM_GPUS):\n now_loss += all_loss[gg].item()\n now_loss = now_loss / NUM_GPUS # report progress \n if USE_WANDB and self.cuda_id == 0:\n wandb.log({\"loss\": now_loss}, step = self.steps)\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * (1.0 - factor) + now_loss * factor\n pbar.set_description(f\"miniE {epoch+1+self.EPOCH_BEGIN} s {self.steps} prog {progress*100.0:.2f}% : ppl {math.exp(self.avg_loss):.6f} loss {self.avg_loss:.6f} lr {lr:e}\")\n self.tokens = 0 # counter used for learning rate decay\n for epoch in range(99999999):\n run_epoch('train')\n if math.isnan(self.avg_loss):\n exit(0)", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:157-179" + }, + "533": { + "file_id": 28, + "content": "The code calculates the average loss over multiple GPUs, updates a moving average of the loss, logs the current loss to Wandb (if applicable), and sets the progress description. 
It also resets the tokens counter for learning rate decay and runs an epoch.", + "type": "comment" + }, + "534": { + "file_id": 28, + "content": " if self.cuda_id == 0:\n log_file.write(f'{epoch+1+self.EPOCH_BEGIN} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} {epoch+1} \\n')\n log_file.flush()\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n raw_model = self.model.module if hasattr(self.model, \"module\") else self.model\n torch.save(raw_model.state_dict(), self.config.epoch_save_path + str(epoch+1+self.EPOCH_BEGIN) + '.pth')", + "type": "code", + "location": "/RWKV-v4/src/trainer.py:181-187" + }, + "535": { + "file_id": 28, + "content": "This code snippet saves the model's state every time an epoch ends, or if the current epoch is a multiple of `config.epoch_save_frequency`. If using GPU, it saves the model's state dict as a .pth file with the epoch number in the filename and path specified by `config.epoch_save_path`. 
Additionally, it logs loss values during each epoch.", + "type": "comment" + }, + "536": { + "file_id": 29, + "content": "/RWKV-v4/src/utils.py", + "type": "filepath" + }, + "537": { + "file_id": 29, + "content": "The code imports libraries, defines a Dataset class for data handling, sets vocabulary size, generates unique tokens, maps characters to integers, prints data and token sizes, initializes a tokenizer, calculates dataset length, samples logits, applies softmax with soft constraints on newlines, includes \"probs_sample\" function, converts tensor to float value, and sets random seeds for Python, numpy, and PyTorch.", + "type": "summary" + }, + "538": { + "file_id": 29, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\ntry:\n NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])\nexcept:\n NUM_GPUS = 1\nimport json\nimport random\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset\nclass Dataset(Dataset):\n def __init__(self, data, ctx_len, epoch_length_fixed):\n self.ctx_len = ctx_len\n self.epoch_length_fixed = epoch_length_fixed\n self.data = data\n if 'MMapIndexedDataset' in str(type(self.data)):\n self.vocab_size = int(os.environ['VOCAB_SIZE'])\n print('current vocab size =', self.vocab_size, \"(make sure it's correct)\")\n self.data_size = len(self.data._bin_buffer) // 2\n print(f'data has {self.data_size} tokens.')\n elif 'numpy' in str(type(self.data)):", + "type": "code", + "location": "/RWKV-v4/src/utils.py:1-29" + }, + "539": { + "file_id": 29, + "content": "The code is importing necessary libraries and defining a class Dataset for handling data. It checks the number of GPUs, reads input data, and determines the vocabulary size. 
It prints current vocabulary size and data token count for verification.", + "type": "comment" + }, + "540": { + "file_id": 29, + "content": " self.vocab_size = int(os.environ['VOCAB_SIZE'])\n print('current vocab size =', self.vocab_size, \"(make sure it's correct)\")\n self.data_size = len(self.data)\n print(f'data has {self.data_size} tokens.')\n else:\n print('building token list...', end=' ')\n unique = sorted(list(set(data)))\n self.vocab_size = len(unique)\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n self.data_size = len(self.data)\n print('data has %d tokens, %d unique.' % (self.data_size, self.vocab_size))\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}", + "type": "code", + "location": "/RWKV-v4/src/utils.py:30-53" + }, + "541": { + "file_id": 29, + "content": "This code sets the vocabulary size based on environment variable 'VOCAB_SIZE'. If the size is not specified, it generates a unique token list from data and stores it in 'vocab.json', then calculates the vocabulary and data sizes. It also maps characters to unique integers and inverse mapping. 
Finally, it prints the data size and number of unique tokens.", + "type": "comment" + }, + "542": { + "file_id": 29, + "content": " def __len__(self):\n return self.epoch_length_fixed // NUM_GPUS\n def __getitem__(self, idx):\n #\n # we are cheating: pick a random spot in dataset\n #\n i = np.random.randint(0, self.data_size - (self.ctx_len + 1))\n if 'MMapIndexedDataset' in str(type(self.data)):\n dix = self.data.get(idx=0, offset=i, length=self.ctx_len + 1).astype(int)\n elif 'numpy' in str(type(self.data)):\n dix = self.data[i:i+self.ctx_len+1]\n else:\n dix = [self.stoi[s] for s in self.data[i:i+self.ctx_len+1]]\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n return x, y\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n if 'list' in str(type(WORD_NAME)):\n self.charMode = False\n if WORD_NAME[0] == WORD_NAME[1]:\n from transformers import PreTrainedTokenizerFast\n self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])", + "type": "code", + "location": "/RWKV-v4/src/utils.py:55-81" + }, + "543": { + "file_id": 29, + "content": "This code defines a class that initializes a tokenizer using pre-trained word embeddings. It also includes methods to calculate the length and retrieve items from the dataset, handling different data types like MMapIndexedDataset or numpy arrays. 
The tokenizer is initialized with a list of words or a PreTrainedTokenizerFast instance based on the input argument.", + "type": "comment" + }, + "544": { + "file_id": 29, + "content": " else:\n from transformers import GPT2TokenizerFast\n self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])\n self.vocab_size = len(self.tokenizer)\n else:\n self.charMode = True\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)\n self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context", + "type": "code", + "location": "/RWKV-v4/src/utils.py:82-106" + }, + "545": { + "file_id": 29, + "content": "This code checks if a tokenizer or word table is provided, initializes them accordingly and sets the vocabulary size. 
It also refines the context input by stripping unnecessary characters and returning an empty string if no content is found.", + "type": "comment" + }, + "546": { + "file_id": 29, + "content": " def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(torch.tensor(out), dim=-1)\n if self.charMode:\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n else:\n top_p = top_p_usual\n sorted_probs, s_index = torch.sort(probs, descending=True)\n # for j in range(30):\n # pp = sorted_probs[j].item()\n # if pp < 0.005:\n # break\n # ss = self.itos[int(s_index[j])].replace('\\n','_')\n # print(f'{math.floor(pp*100):>3.0f}{ss}', end='')\n # print('')\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n # print(\"[\" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + \"]\", end = \"\")", + "type": "code", + "location": "/RWKV-v4/src/utils.py:108-137" + }, + "547": { + "file_id": 29, + "content": "This function samples logits from the output of the model and applies softmax to obtain probabilities. It handles newlines by changing the top_p value when last character is a newline, otherwise it uses top_p_usual. 
It then sorts probabilities in descending order, finds the cutoff for sampling, and sets probabilities below this cutoff to 0.", + "type": "comment" + }, + "548": { + "file_id": 29, + "content": " if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n return torch.multinomial(probs, num_samples=1)[0]\ndef to_float(x):\n return x.cpu().detach().numpy().flatten()[0].astype(float)\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)", + "type": "code", + "location": "/RWKV-v4/src/utils.py:139-153" + }, + "549": { + "file_id": 29, + "content": "This code snippet contains three functions: \"probs_sample\" which samples one sample from the multinomial distribution if temperature is not 1, \"to_float\" that converts a tensor to a float value, and \"set_seed\" for setting random seeds in Python, numpy, and PyTorch.", + "type": "comment" + }, + "550": { + "file_id": 30, + "content": "/RWKV-v4/train.py", + "type": "filepath" + }, + "551": { + "file_id": 30, + "content": "The code sets up a RWKV model trainer, configures the Trainer object with DeepSpeed, and enables FP16 mode for efficient RWKV-v4 language model training.", + "type": "summary" + }, + "552": { + "file_id": 30, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\nimport logging, types\nfrom src.utils import Dataset\nimport torch\nimport numpy as np\nfrom src.binidx import MMapIndexedDataset\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nlogging.basicConfig(format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n datefmt=\"%Y-%m-%d %H:%M:%S\", level=logging.INFO,)\n# if False: # True False ---> Set to False if you don't understand it\n# print(\"\\n\\n[[[ 
SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\\n\\n\")\n# import src.utils\n# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)", + "type": "code", + "location": "/RWKV-v4/train.py:1-19" + }, + "553": { + "file_id": 30, + "content": "This code is an import section for a language model, specifically RWKV-v4. It sets up logging, imports necessary modules, and includes options to set debug mode or seed the training process for determinism. The code is part of the 'train.py' file in the RWKV-LM repository.", + "type": "comment" + }, + "554": { + "file_id": 30, + "content": "########################################################################################################\n# Step 1: set training data & cfg\n########################################################################################################\nEXPRESS_PILE_MODE = False # True: express mode for fine-tuning a pile model // False: usual training\nEXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'\nEXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-169M'\n# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'\n# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-430M'\n# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-1B5-20220903-8040'\n# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-1B5'\n########################################################################################################\ndatafile = \"../data/enwik8\" # your data\ndatafile_encoding = 'utf-8' # 'utf-8' / 'utf-16le' / 'numpy' (for fine-tuning pile models) / 'binidx' (the Megatron-LM 'binidx' format)\n# datafile = 'my-gpt_seq_document'\n# datafile_encoding = 'binidx'\nif EXPRESS_PILE_MODE:", + "type": "code", + "location": "/RWKV-v4/train.py:21-42" + }, + "555": { + "file_id": 30, + "content": "This code sets the training data and configuration for a text generation model. 
It uses the RWKV-LM's Pile model and can be fine-tuned with different model names and types. The datafile specifies where to find the training data, and datafile_encoding specifies the file encoding format. If EXPRESS_PILE_MODE is True, the code uses a specific pile model for fine-tuning.", + "type": "comment" + }, + "556": { + "file_id": 30, + "content": " datafile = 'train.npy' # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy\n datafile_encoding = 'numpy'\n#\n# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch\n# set VOCAB_SIZE = 50277 for fine-tuning pile models\n# set VOCAB_SIZE = your_vocab_size for 'binidx' data\n#\nos.environ['VOCAB_SIZE'] = '0'\nif EXPRESS_PILE_MODE:\n os.environ['VOCAB_SIZE'] = '50277'\n#\n# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:\n# 1) set RWKV_NUM_GPUS = '1' and let it run for 1 miniEpoch and it will save a trained-1.pth\n# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = single_gpu_batchsz * RWKV_NUM_GPUS,\n# EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training from it\n#\nos.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use\n#\n# 'bf16' (fast & stable)\n# 'fp16' (fast & will overflow after training a large model for very long. can be solved in the future)", + "type": "code", + "location": "/RWKV-v4/train.py:43-65" + }, + "557": { + "file_id": 30, + "content": "This code sets the datafile, datafile_encoding, and VOCAB_SIZE environment variables for RWKV-v4 training. It also suggests a procedure for multi-GPU training involving setting RWKV_NUM_GPUS and other parameters. The supported precisions are 'bf16' and 'fp16'.", + "type": "comment" + }, + "558": { + "file_id": 30, + "content": "# 'tf32' (decent speed & stable)\n# 'fp32' (!!!very slow!!! 
only for verification)\nos.environ['RWKV_FLOAT_MODE'] = 'bf16'\nos.environ['RWKV_DEEPSPEED'] = '1' # Use DeepSpeed? 0 = False, 1 = True\nif int(os.environ['RWKV_NUM_GPUS']) == 1: # Usually you don't need DeepSpeed for 1 GPU training.\n os.environ['RWKV_DEEPSPEED'] = '0' # However, sometimes DeepSpeed saves VRAM even for 1 GPU training. So you shall try it.\nos.environ['USE_WANDB'] = '0' # wandb logging. 0 = False, 1 = True\n########################################################################################################\n# Step 2: set model details\n########################################################################################################\nEPOCH_BEGIN = 0 # begins with miniEpoch = EPOCH_BEGIN\nLOAD_MODEL = False # shall we load the #EPOCH_BEGIN model and continue the training from it?\nn_layer = 6\nn_embd = 512\nctx_len = 1024 # increase T_MAX in src/model.py if your ctx_len is longer\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' (sometimes better)", + "type": "code", + "location": "/RWKV-v4/train.py:66-88" + }, + "559": { + "file_id": 30, + "content": "The code is setting the environment variables for the training process. It allows choosing different float modes and deciding whether to use DeepSpeed for improved efficiency or not. 
Additionally, it sets the model details such as number of layers, embedding size, and context length.", + "type": "comment" + }, + "560": { + "file_id": 30, + "content": "# there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py\n# set it to 256, then it's using my headQK trick (a tiny attention) to improve loss\n# set it to 0, then it's a pure RNN (attention-free)\nif EXPRESS_PILE_MODE:\n LOAD_MODEL = True\n if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':\n n_layer = 12\n n_embd = 768\n ctx_len = 1024\n elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-430M':\n n_layer = 24\n n_embd = 1024\n ctx_len = 1024\n elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-1B5':\n n_layer = 24\n n_embd = 2048\n ctx_len = 1024\n########################################################################################################\n# Step 3: set batch size & learning rate etc.\n########################################################################################################\n# if you see \"CUDA out of memory\", reduce batch_size. Use nvidia-smi to find the highest value for your GPU.\nbatch_size = 12 * int(os.environ['RWKV_NUM_GPUS'])", + "type": "code", + "location": "/RWKV-v4/train.py:90-114" + }, + "561": { + "file_id": 30, + "content": "The code is setting the hyperparameters and model configuration for the RWKV-v4 language model based on the chosen EXPRESS_PILE_MODEL_TYPE. It defines the number of layers, embedding dimension, context length, and batch size according to the selected model type. The code also advises reducing the batch size if encountering \"CUDA out of memory\" error.", + "type": "comment" + }, + "562": { + "file_id": 30, + "content": "assert (batch_size % int(os.environ['RWKV_NUM_GPUS']) == 0)\n# By default we are using exponential LR decay.\n# Here are my suggestions for training.\n# Let's say you are training a L6-D512 model.\n# 1) Set lr_init = lr_final = 8e-4. 
Let it run for some mini-epochs, until you feel like reducing LR.\n# 2) Check epoch_save_frequency and make sure the partially-trained model is saved. Ctrl+C to stop the run.\n# 3) Set lr_init = 8e-4, lr_final = 1e-5, betas = (0.9, 0.999).\n# 4) Set EPOCH_BEGIN & LOAD_MODEL to load the partially-trained model. Continue the training.\n# \n# For L12-D768, set lr_init = 6e-4. For L24-D1024, set lr_init = 4e-4. For L24-D2048, set lr_init = 3e-4.\nlr_init = 8e-4\nlr_final = 1e-5\n# the mini-epoch is very short and of fixed length (length = ctx_len * epoch_length_fixed tokens)\nn_epoch = 500\nepoch_length_fixed = (10000 // batch_size) * batch_size # feel free to increase it if you have lots of GPU\n# epoch_save_frequency 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...\nepoch_save_frequency = 10", + "type": "code", + "location": "/RWKV-v4/train.py:115-135" + }, + "563": { + "file_id": 30, + "content": "The code provides suggestions for training an RWKV-v4 model. It recommends setting the initial and final learning rates (lr_init and lr_final) based on the model size, saving partially trained models with epoch_save_frequency, and adjusting the epoch length and batch size according to available GPU resources.", + "type": "comment" + }, + "564": { + "file_id": 30, + "content": "epoch_save_path = 'trained-'\nif EXPRESS_PILE_MODE:\n if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':\n lr_init = 2e-5\n else:\n lr_init = 1e-5\n lr_final = 1e-5\n n_epoch = 100000\n### misc stuffs ########################################################################################\nif LOAD_MODEL and EPOCH_BEGIN > 0: # we are not saving gradients, so let's have some warmup if we load a model\n warmup_tokens = 50 * ctx_len * batch_size // NUM_GPUS\nelse:\n warmup_tokens = 0\nbetas = (0.9, 0.99) # set betas = (0.9, 0.999) if your model has been trained for a while\neps = 1e-8\nnum_workers = 1 # DataLoader worker. 
I only tested num_workers = 1\nNUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])\nos.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)\nMODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)\nif EXPRESS_PILE_MODE:\n betas = (0.9, 0.999)\n MODEL_NAME = EXPRESS_PILE_MODEL_NAME\ntorch.backends.cudnn.benchmark = True\nif os.environ['RWKV_FLOAT_MODE'] == 'fp32':\n torch.backends.cudnn.allow_tf32 = False", + "type": "code", + "location": "/RWKV-v4/train.py:136-168" + }, + "565": { + "file_id": 30, + "content": "This code sets up various parameters for model training, such as learning rate, epoch number, warmup tokens, betas for optimizer, epsilon, number of data loader workers, and number of GPUs. It also configures some environment variables and enables CUDA benchmarking based on the floating point mode. The code then assigns a model name based on the epoch number or the express pile mode if enabled.", + "type": "comment" + }, + "566": { + "file_id": 30, + "content": " torch.backends.cuda.matmul.allow_tf32 = False\nelse:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\n########################################################################################################\n# Load data\n########################################################################################################\nprint(f'loading {datafile_encoding} data... 
' + datafile)\nif datafile_encoding == 'binidx':\n train_dataset = Dataset(MMapIndexedDataset(datafile), ctx_len, epoch_length_fixed)\nelif datafile_encoding == 'numpy':\n train_dataset = Dataset(np.load(datafile).astype('int'), ctx_len, epoch_length_fixed)\nelse:\n train_dataset = Dataset(open(datafile, \"r\", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)\n########################################################################################################\n# Train model\n########################################################################################################\nif __name__ == '__main__':", + "type": "code", + "location": "/RWKV-v4/train.py:169-190" + }, + "567": { + "file_id": 30, + "content": "Loading and preparing data for training the model, with support for different data file formats (binidx, numpy, or text file). Ensures CUDA and cudnn settings are properly configured based on the environment.", + "type": "comment" + }, + "568": { + "file_id": 30, + "content": " from src.trainer import Trainer, TrainerConfig\n print('\\nmodel', model_type, os.environ['RWKV_FLOAT_MODE'], 'epoch', n_epoch, 'batchsz', batch_size, 'betas',\n betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, '\\n')\n tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size,\n learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps,\n warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)\n m_cfg = types.SimpleNamespace()\n m_cfg.model_type = model_type\n m_cfg.n_layer = n_layer\n m_cfg.n_embd = n_embd\n m_cfg.EPOCH_BEGIN = EPOCH_BEGIN\n m_cfg.LOAD_MODEL = LOAD_MODEL\n m_cfg.MODEL_NAME = MODEL_NAME\n if os.environ['RWKV_DEEPSPEED'] == '0':\n if os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n trainer = Trainer(devices=NUM_GPUS, accelerator=\"gpu\", precision=16) ", + "type": 
"code", + "location": "/RWKV-v4/train.py:191-209" + }, + "569": { + "file_id": 30, + "content": "This code sets up a trainer for the RWKV model. It prints out information such as the model type, float mode, epoch count, and other relevant parameters before configuring the Trainer object with these details. The code also checks if DeepSpeed should be used based on an environment variable and sets up the Trainer accordingly, using either FP16 precision for GPU acceleration or regular floating point precision for CPU-only execution.", + "type": "comment" + }, + "570": { + "file_id": 30, + "content": " elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n trainer = Trainer(devices=NUM_GPUS, accelerator=\"gpu\", precision='bf16')\n elif '32' in os.environ['RWKV_FLOAT_MODE']:\n trainer = Trainer(devices=NUM_GPUS, accelerator=\"gpu\", precision=32)\n else:\n from pytorch_lightning.strategies import DeepSpeedStrategy\n DEEPSPEED_CFG = {\n \"zero_allow_untested_optimizer\":True,\n \"zero_optimization\":{\n \"stage\":2,\n \"contiguous_gradients\":True,\n \"overlap_comm\":True,\n \"allgather_partitions\":True,\n \"reduce_scatter\":True,\n \"allgather_bucket_size\":200000000,\n \"reduce_bucket_size\":200000000,\n \"sub_group_size\":1000000000000\n },\n \"activation_checkpointing\":{\n \"partition_activations\":False,\n \"cpu_checkpointing\":False,\n \"contiguous_memory_optimization\":False,\n \"synchronize_checkpoint_boundary\":False", + "type": "code", + "location": "/RWKV-v4/train.py:210-233" + }, + "571": { + "file_id": 30, + "content": "This code checks the value of the environment variable \"RWKV_FLOAT_MODE\" to determine the trainer type and precision for training. If it's 'bf16', a GPU trainer with bf16 precision is used, otherwise if it contains '32', a GPU trainer with 32-bit precision is used. 
Otherwise, a DeepSpeedStrategy is imported, and its configuration is set up for further optimization during the training process.", + "type": "comment" + }, + "572": { + "file_id": 30, + "content": " },\n \"aio\":{\n \"block_size\":1048576,\n \"queue_depth\":8,\n \"single_submit\":False,\n \"overlap_events\":True,\n \"thread_count\":1\n },\n \"gradient_clipping\": 1.0,\n \"gradient_accumulation_steps\": 1,\n }\n if NUM_GPUS == 1:\n DEEPSPEED_CFG['zero_optimization'] = {\n \"stage\":1, # saves some VRAM\n \"contiguous_gradients\":False,\n \"overlap_comm\":False,\n \"allgather_partitions\":False,\n \"reduce_scatter\":False,\n \"allgather_bucket_size\":200000000,\n \"reduce_bucket_size\":200000000,\n \"sub_group_size\":1000000000000\n }\n if os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n DEEPSPEED_CFG[\"fp16\"] = {\n \"fp16\": True,\n \"enabled\": True,\n \"loss_scale\": 0,\n \"initial_scale_power\": 12,\n \"loss_scale_window\": 1000,", + "type": "code", + "location": "/RWKV-v4/train.py:234-263" + }, + "573": { + "file_id": 30, + "content": "This code configures DeepSpeed settings for RWKV-v4 model training. It includes various options such as block size, queue depth, and overlap events for the aio section, gradient clipping and accumulation steps, and DeepSpeed optimization settings like stage, contiguous gradients, and bucket sizes when using 1 GPU. 
Additionally, it enables FP16 mode if RWKV_FLOAT_MODE is set to 'fp16'.", + "type": "comment" + }, + "574": { + "file_id": 30, + "content": " \"hysteresis\": 2,\n \"min_loss_scale\": 1\n }\n trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator=\"gpu\", precision=16)\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n DEEPSPEED_CFG[\"bf16\"] = {\n \"enabled\": True\n }\n trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator=\"gpu\", precision='bf16')\n elif '32' in os.environ['RWKV_FLOAT_MODE']:\n trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator=\"gpu\", precision=32)\n print(trainer._strategy.config)\n trainer.run(m_cfg, train_dataset, None, tconf)", + "type": "code", + "location": "/RWKV-v4/train.py:264-280" + }, + "575": { + "file_id": 30, + "content": "The code checks the environment variable 'RWKV_FLOAT_MODE' and configures the DeepSpeed strategy accordingly. If it is set to 'fp16', it enables fp16 mode with hysteresis and minimum loss scale. If it is set to 'bf16', it enables bf16 mode. If '32' is present in the environment variable, it uses 32-bit precision. The trainer is then initialized with these configurations and the training process starts using the specified strategy.", + "type": "comment" + }, + "576": { + "file_id": 31, + "content": "/RWKV-v4/verify.py", + "type": "filepath" + }, + "577": { + "file_id": 31, + "content": "This code initializes and compares the outputs of RWKV-GPT and RWKV-RNN models, tokenizing input context and training model_train on GPU with tensor data. 
It sets environment variables for hardware mode and floating-point precision.", + "type": "summary" + }, + "578": { + "file_id": 31, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\n# this is for verifying the results of different models and make sure they agree with each other\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nimport os\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\nos.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future)\nos.environ['RWKV_RUN_DEVICE'] = 'cuda'\nRUN_DEVICE = os.environ['RWKV_RUN_DEVICE']\nimport torch\nfrom src.model_run import RWKV_RNN, RWKV_GPT\nfrom src.model import GPT, GPTConfig\nTOKEN_MODE = 'pile' # char / pile\nif TOKEN_MODE == 'char':\n MODEL_NAME = 'trained-1'\n WORD_NAME = 'vocab' # the .json vocab (generated by train.py)\n ctx_len = 1024\n n_layer = 6", + "type": "code", + "location": "/RWKV-v4/verify.py:1-26" + }, + "579": { + "file_id": 31, + "content": "This code verifies the results of different models to ensure consistency. 
It imports necessary libraries, sets environment variables for hardware mode and floating-point precision, defines the model to run (RWKV_RNN or GPT), specifies token mode as either character or pile, and defines variables for model name, vocabulary file, context length, and number of layers.", + "type": "comment" + }, + "580": { + "file_id": 31, + "content": " n_embd = 512\n UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity\nelif TOKEN_MODE == 'pile':\n WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']\n MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'\n ctx_len = 1024\n n_layer = 12\n n_embd = 768\n UNKNOWN_CHAR = None\nmodel_type = 'RWKV'\nfrom src.utils import TOKENIZER\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nif TOKEN_MODE == 'pile':\n tokenizer.vocab_size = 50277\n########################################################################################################\nmodel_train = GPT(GPTConfig(tokenizer.vocab_size, ctx_len, model_type=model_type, n_layer=n_layer, n_embd=n_embd)).cuda()\nif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n model_train = model_train.half()\nelif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n model_train = model_train.bfloat16()\nprint('loading ' + MODEL_NAME)\nm2 = torch.load(MODEL_NAME + '.pth', map_location=RUN_DEVICE)\nmodel_train.load_state_dict(m2)\nmodel_rnn = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)", + "type": "code", + "location": "/RWKV-v4/verify.py:27-57" + }, + "581": { + "file_id": 31, + "content": "The code is initializing a model for the RWKV language model, specifically the \"RWKV-v4\" variant. It checks the TOKEN_MODE and sets up the tokenizer and model accordingly. The tokenizer's vocab_size is set to 50277 if in 'pile' mode, and the model is loaded from a specific .pth file. 
The model is also converted to fp16 or bf16 depending on the environment variable RWKV_FLOAT_MODE.", + "type": "comment" + }, + "582": { + "file_id": 31, + "content": "model_gpt = RWKV_GPT(MODEL_NAME, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()\n########################################################################################################\n# context = '\\nIn a'\ncontext = '\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'\nif TOKEN_MODE == 'char':\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nelif TOKEN_MODE == 'pile':\n ctx = tokenizer.tokenizer.encode(context)\nprint(f'input len {len(ctx)} data {ctx}')\n########################################################################################################\nprint('\\nRWKV-GPT output')\nout = model_gpt.forward(torch.tensor(ctx).unsqueeze(0).cuda())[0].detach().cpu().numpy()\nprint(out)\nprint('\\nRWKV-RNN output')\nmodel_rnn.clear()\nsrc_len = len(ctx)\nfor i in range(src_len):\n x = ctx[:i+1]", + "type": "code", + "location": "/RWKV-v4/verify.py:58-81" + }, + "583": { + "file_id": 31, + "content": "The code initializes an RWKV-GPT model, tokenizes input context in either character or pile mode, and prints the output of both RWKV-GPT and RWKV-RNN models for the given input.", + "type": "comment" + }, + "584": { + "file_id": 31, + "content": " out = model_rnn.run(x)\n if i < 3 or i >= src_len - 3:\n print(torch.tensor(out).detach().cpu().numpy())\n if i == 2:\n print('...')\nprint('\\nRWKV-train output')\nout = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().float().numpy()\nprint(out, '\\n')", + "type": "code", + "location": "/RWKV-v4/verify.py:82-90" + }, + "585": { + "file_id": 31, + "content": "This code snippet is checking the output of RWKV-v4 model 
at specific indices and then prints the RWKV-train output. It first runs the model_rnn on input x and checks if i (index) is less than 3 or greater than src\\_len - 3, printing the detached output to CPU numpy array. If i equals 2, it prints '...'. Then, it trains the model_train on cuda with ctx tensor, gets the forward output and prints it as float numpy array.", + "type": "comment" + }, + "586": { + "file_id": 32, + "content": "/RWKV-v4neo/chat.py", + "type": "filepath" + }, + "587": { + "file_id": 32, + "content": "The code sets up an RWKV Language Model environment for English/Chinese, creates a multilingual chatbot in Python with response generation commands and independent question prompts, handling user input and generating text from the model.", + "type": "summary" + }, + "588": { + "file_id": 32, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nprint('Loading...')\nfrom src.model_run import RWKV_RNN\nimport numpy as np\nimport os, copy, types, gc, sys\nimport torch\nfrom src.utils import TOKENIZER\ntry:\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = sys.argv[1]\nexcept:\n pass\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nCHAT_LANG = 'English' # English Chinese\nWORD_NAME = [\n \"20B_tokenizer.json\",\n \"20B_tokenizer.json\",\n] # [vocab, vocab] for Pile model\nUNKNOWN_CHAR = None\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nargs = types.SimpleNamespace()\nargs.RUN_DEVICE = \"cuda\" # 'cpu' (already very fast) // 'cuda'\nargs.FLOAT_MODE = \"fp16\" # fp32 (good for CPU) // fp16 (recommended for GPU) // bf16 (less accurate)", + "type": "code", + "location": 
"/RWKV-v4neo/chat.py:1-31" + }, + "589": { + "file_id": 32, + "content": "Loading RWKV Language Model and setting up environment with specified device, float mode, and tokenizer for English/Chinese language.", + "type": "comment" + }, + "590": { + "file_id": 32, + "content": "args.vocab_size = 50277\nargs.head_qk = 0\nargs.pre_ffn = 0\nargs.grad_cp = 0\nargs.my_pos_emb = 0\nargs.MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-14b/RWKV-4-Pile-14B-20230108-5170'\nargs.n_layer = 40\nargs.n_embd = 5120\nargs.ctx_len = 1024\n# args.MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-7b/RWKV-4-Pile-7B-20221115-8047'\n# args.n_layer = 32\n# args.n_embd = 4096\n# args.ctx_len = 1024\n# args.MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221008-8023'\n# args.n_layer = 32\n# args.n_embd = 2560\n# args.ctx_len = 1024\nif CHAT_LANG == 'English':\n user = \"User\"\n bot = \"Bot\"\n interface = \":\"\n # The following is a verbose and detailed conversation between an AI assistant called {bot}, and a human user called {user}. {bot} is intelligent, knowledgeable, wise and polite.\n # The following is a conversation between a highly knowledgeable and intelligent AI called {bot}, and a human called {user}. In the following interactions, {user} and {bot} converse in natural language, and {bot}", + "type": "code", + "location": "/RWKV-v4neo/chat.py:32-59" + }, + "591": { + "file_id": 32, + "content": "Code is setting hyperparameters for RWKV-v4neo model, including vocabulary size, dimensions, and layers. It also has multiple conditionals to change these values based on the chat language, and provides aliases for user and bot.", + "type": "comment" + }, + "592": { + "file_id": 32, + "content": " do its best to answer {user}'s questions. {bot} is respectful, polite and inclusive. {bot} knows a lot, and always tells the truth.\n init_prompt = f'''\nThe following is a verbose and detailed conversation between an AI assistant called {bot}, and a human user called {user}. 
{bot} is intelligent, knowledgeable, wise and polite.\n{user}{interface} french revolution what year\n{bot}{interface} The French Revolution started in 1789, and lasted 10 years until 1799.\n{user}{interface} 3+5=?\n{bot}{interface} The answer is 8.\n{user}{interface} guess i marry who ?\n{bot}{interface} Only if you tell me more about yourself - what are your interests?\n{user}{interface} solve for a: 9-a=2\n{bot}{interface} The answer is a = 7, because 9 - 7 = 2.\n{user}{interface} wat is lhc\n{bot}{interface} LHC is a high-energy particle collider, built by CERN, and completed in 2008. They used it to confirm the existence of the Higgs boson in 2012.\n'''\n HELP_MSG = '''Commands:\nsay something --> chat with bot. use \\\\n for new line.", + "type": "code", + "location": "/RWKV-v4neo/chat.py:59-86" + }, + "593": { + "file_id": 32, + "content": "This code contains a sample conversation between an AI assistant named {bot} and a user. The assistant provides answers to questions, is respectful and polite, and always tells the truth. The code also includes instructions for using the chat functionality and commands like 'say something' to initiate the conversation with the bot.", + "type": "comment" + }, + "594": { + "file_id": 32, + "content": "+alt --> alternate chat reply\n+reset --> reset chat\n+gen YOUR PROMPT --> free generation with any prompt. use \\\\n for new line.\n+qa YOUR QUESTION --> free generation - ask any question (just ask the question). use \\\\n for new line.\n+more --> continue last free generation (only for +gen / +qa)\n+retry --> retry last free generation (only for +gen / +qa)\nNow talk with the bot and enjoy. Remember to +reset periodically to clean up the bot's memory. Use RWKV-4 14B for best results.\nThis is not instruct-tuned for conversation yet, so don't expect good quality. 
Better use +gen for free generation.\n'''\nelif CHAT_LANG == 'Chinese':\n args.MODEL_NAME = '/fsx/BlinkDL/CODE/_PUBLIC_/RWKV-LM/RWKV-v4neo/7-run3z/rwkv-293'\n args.n_layer = 32\n args.n_embd = 4096\n args.ctx_len = 1024\n user = \"Q\"\n bot = \"A\"\n interface = \":\"\n init_prompt = '''\nQ: 企鹅会飞吗?\nA: 企鹅是不会飞的。它们的翅膀主要用于游泳和平衡,而不是飞行。\nQ: 西瓜是什么\nA: 西瓜是一种常见的水果,是一种多年生蔓生藤本植物。西瓜的果实呈圆形或卵形,通常是绿色的,里面有红色或黄色的肉和很多的籽。西瓜味甜,多吃可以增加水分,是夏季非常受欢迎的水果之一。\n'''\n HELP_MSG = '''指令:", + "type": "code", + "location": "/RWKV-v4neo/chat.py:87-118" + }, + "595": { + "file_id": 32, + "content": "This code is for a chatbot implemented in Python using the RWKV-v4neo language model. It supports Chinese and English languages, allowing users to ask questions or generate free text. The code provides specific parameters and prompts for Chinese and English interactions, with the ability to reset the bot's memory, and instructions on how to use the chatbot effectively.", + "type": "comment" + }, + "596": { + "file_id": 32, + "content": "直接输入内容 --> 和机器人聊天,用\\\\n代表换行\n+alt --> 让机器人换个回答\n+reset --> 重置对话\n+gen 某某内容 --> 续写任何中英文内容,用\\\\n代表换行\n+qa 某某问题 --> 问独立的问题(忽略上下文),用\\\\n代表换行\n+more --> 继续 +gen / +qa 的回答\n+retry --> 换个 +gen / +qa 的回答\n现在可以输入内容和机器人聊天(注意它不怎么懂中文,它可能更懂英文)。请经常使用 +reset 重置机器人记忆。\n'''\n# Load Model\nos.environ[\"RWKV_RUN_DEVICE\"] = args.RUN_DEVICE\nMODEL_NAME = args.MODEL_NAME\nprint(f'loading... 
{MODEL_NAME}')\nmodel = RWKV_RNN(args)\nmodel_tokens = []\ncurrent_state = None\n########################################################################################################\ndef run_rnn(tokens, newline_adj = 0):\n global model_tokens, current_state\n for i in range(len(tokens)):\n model_tokens += [int(tokens[i])]\n if i == len(tokens) - 1:\n out, current_state = model.forward(model_tokens, current_state)\n else:\n current_state = model.forward(model_tokens, current_state, preprocess_only = True)\n # print(f'### model ###\\n[{tokenizer.tokenizer.decode(model_tokens)}]')\n out[0] = -999999999 # disable <|endoftext|>", + "type": "code", + "location": "/RWKV-v4neo/chat.py:119-156" + }, + "597": { + "file_id": 32, + "content": "This code is part of an interactive chatbot that uses the RWKV model for natural language processing. It supports various commands to generate or reset responses, and prompts for independent questions. The code loads the model and defines a function run_rnn() which takes input tokens, processes them through the RWKV model, and generates output.", + "type": "comment" + }, + "598": { + "file_id": 32, + "content": " out[187] += newline_adj\n # if newline_adj > 0:\n # out[15] += newline_adj / 2 # '.'\n return out\nall_state = {}\ndef save_all_stat(srv, name, last_out):\n n = f'{name}_{srv}'\n all_state[n] = {}\n all_state[n]['out'] = last_out\n all_state[n]['rnn'] = copy.deepcopy(current_state)\n all_state[n]['token'] = copy.deepcopy(model_tokens)\ndef load_all_stat(srv, name):\n global model_tokens, current_state\n n = f'{name}_{srv}'\n current_state = copy.deepcopy(all_state[n]['rnn'])\n model_tokens = copy.deepcopy(all_state[n]['token'])\n return all_state[n]['out']\n########################################################################################################\n# Run inference\nprint(f'\\nRun prompt...')\nout = run_rnn(tokenizer.tokenizer.encode(init_prompt))\ngc.collect()\ntorch.cuda.empty_cache()\nsave_all_stat('', 
'chat_init', out)\nsrv_list = ['dummy_server']\nfor s in srv_list:\n save_all_stat(s, 'chat', out)\nprint(f'### prompt ###\\n[{tokenizer.tokenizer.decode(model_tokens)}]\\n')\ndef reply_msg(msg):", + "type": "code", + "location": "/RWKV-v4neo/chat.py:157-194" + }, + "599": { + "file_id": 32, + "content": "This code is used for saving and loading all-state, running inference, and replying to messages. The all-state contains the RNN state, model tokens, and output. Inference is run on a prompt, and the output is saved and printed with decoded tokens. The reply_msg function can be used to reply to received messages.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/6.json b/docs/data/6.json new file mode 100644 index 00000000..2897a952 --- /dev/null +++ b/docs/data/6.json @@ -0,0 +1,543 @@ +{ + "600": { + "file_id": 32, + "content": " print(f'{bot}{interface} {msg}\\n')\ndef on_message(message):\n global model_tokens, current_state\n srv = 'dummy_server'\n msg = message.replace('\\\\n','\\n').strip()\n if len(msg) > 1000:\n reply_msg('your message is too long (max 1000 tokens)')\n return\n x_temp = 1.0\n x_top_p = 0.85\n if (\"-temp=\" in msg):\n x_temp = float(msg.split(\"-temp=\")[1].split(\" \")[0])\n msg = msg.replace(\"-temp=\"+f'{x_temp:g}', \"\")\n # print(f\"temp: {x_temp}\")\n if (\"-top_p=\" in msg):\n x_top_p = float(msg.split(\"-top_p=\")[1].split(\" \")[0])\n msg = msg.replace(\"-top_p=\"+f'{x_top_p:g}', \"\")\n # print(f\"top_p: {x_top_p}\")\n if x_temp <= 0.2:\n x_temp = 0.2\n if x_temp >= 5:\n x_temp = 5\n if x_top_p <= 0:\n x_top_p = 0\n if msg == '+reset':\n out = load_all_stat('', 'chat_init')\n save_all_stat(srv, 'chat', out)\n reply_msg(\"Chat reset.\")\n return\n elif msg[:5].lower() == '+gen ' or msg[:4].lower() == '+qa ' or msg.lower() == '+more' or msg.lower() == '+retry':", + "type": "code", + "location": "/RWKV-v4neo/chat.py:195-230" + }, + "601": { + "file_id": 32, + "content": "This code defines a function 
`on_message()` that processes incoming messages. It checks if the message is longer than 1000 tokens, applies temperature and top-p sampling parameters, handles reset requests, and possibly generates new text or asks questions based on message content. The processing includes loading and saving chat state, printing messages to console, and sending appropriate replies.", + "type": "comment" + }, + "602": { + "file_id": 32, + "content": " if msg[:5].lower() == '+gen ':\n new = '\\n' + msg[5:].strip()\n # print(f'### prompt ###\\n[{new}]')\n current_state = None\n out = run_rnn(tokenizer.tokenizer.encode(new))\n save_all_stat(srv, 'gen_0', out)\n elif msg[:4].lower() == '+qa ':\n out = load_all_stat('', 'chat_init')\n real_msg = msg[4:].strip()\n new = f\"{user}{interface} {real_msg}\\n\\n{bot}{interface}\"\n # print(f'### qa ###\\n[{new}]')\n out = run_rnn(tokenizer.tokenizer.encode(new))\n save_all_stat(srv, 'gen_0', out)\n # new = f\"\\nThe following is an excellent Q&A session consists of detailed and factual information.\\n\\nQ: What is 3+5?\\nA: The answer is 8.\\n\\nQ: {msg[9:].strip()}\\nA:\"\n # print(f'### prompt ###\\n[{new}]')\n # current_state = None\n # out = run_rnn(tokenizer.tokenizer.encode(new))\n # save_all_stat(srv, 'gen_0', out)\n elif msg.lower() == '+more':", + "type": "code", + "location": "/RWKV-v4neo/chat.py:232-255" + }, + "603": { + "file_id": 32, + "content": "This code checks if the message starts with \"+gen\", \"+qa\", or \"+more\" and performs corresponding actions. If \"+gen\" is found, it generates a response based on the given prompt. If \"+qa\" is found, it loads previous chat context and continues the conversation. If \"+more\" is found, it displays more content related to the current context. It utilizes tokenizer for encoding messages and run_rnn to generate responses. 
The generated responses are saved in 'gen_0' state for future reference.", + "type": "comment" + }, + "604": { + "file_id": 32, + "content": " try:\n out = load_all_stat(srv, 'gen_1')\n save_all_stat(srv, 'gen_0', out)\n except:\n return\n elif msg.lower() == '+retry':\n try:\n out = load_all_stat(srv, 'gen_0')\n except:\n return\n begin = len(model_tokens)\n out_last = begin\n for i in range(150):\n token = tokenizer.sample_logits(\n out,\n model_tokens,\n args.ctx_len,\n temperature=x_temp,\n top_p_usual=x_top_p,\n top_p_newline=x_top_p,\n )\n if msg[:4].lower() == '+qa ':\n out = run_rnn([token], newline_adj=-1)\n else:\n out = run_rnn([token])\n xxx = tokenizer.tokenizer.decode(model_tokens[out_last:])\n if '\\ufffd' not in xxx:\n print(xxx, end='', flush=True)\n out_last = begin + i + 1\n print('\\n')\n # send_msg = tokenizer.tokenizer.decode(model_tokens[begin:]).strip()", + "type": "code", + "location": "/RWKV-v4neo/chat.py:256-289" + }, + "605": { + "file_id": 32, + "content": "The code is performing the following tasks:\n1. Loading statistics from a server and saving them, with error handling for any exceptions that might occur.\n2. Handling user input, specifically \"+retry\" command, by loading saved statistics from an alternative source if an exception occurs during the initial load.\n3. Generating tokens using tokenizer's sample_logits function, considering various parameters like temperature and top_p values.\n4. Running RNN model on generated tokens to process them, handling different cases based on user input.\n5. 
Printing generated output, handling special characters, and updating the tracking variables for the next iteration.", + "type": "comment" + }, + "606": { + "file_id": 32, + "content": " # print(f'### send ###\\n[{send_msg}]')\n # reply_msg(send_msg)\n save_all_stat(srv, 'gen_1', out)\n else:\n if msg.lower() == '+alt':\n try:\n out = load_all_stat(srv, 'chat_pre')\n except:\n return\n else:\n out = load_all_stat(srv, 'chat')\n new = f\"{user}{interface} {msg}\\n\\n{bot}{interface}\"\n # print(f'### add ###\\n[{new}]')\n out = run_rnn(tokenizer.tokenizer.encode(new), newline_adj=-999999999)\n save_all_stat(srv, 'chat_pre', out)\n begin = len(model_tokens)\n out_last = begin\n print(f'{bot}{interface}', end='', flush=True)\n for i in range(999):\n if i <= 0:\n newline_adj = -999999999\n elif i <= 30:\n newline_adj = (i - 30) / 10\n elif i <= 130:\n newline_adj = 0\n else:\n newline_adj = (i - 130) * 0.25 # MUST END THE GENERATION\n token = tokenizer.sample_logits(", + "type": "code", + "location": "/RWKV-v4neo/chat.py:290-319" + }, + "607": { + "file_id": 32, + "content": "This code handles two cases: sending a message and adding a message to the chat history. If the message is \"+alt\", it loads the previous chat state. Otherwise, it loads the current chat state, generates a new message using an RNN model, saves the new message in the \"chat_pre\" file, and prints part of the output. The code also determines the appropriate newline adjustment for the generated text. 
It uses tokenizer.sample_logits to generate the next token based on the current state of the model.", + "type": "comment" + }, + "608": { + "file_id": 32, + "content": " out,\n model_tokens,\n args.ctx_len,\n temperature=x_temp,\n top_p_usual=x_top_p,\n top_p_newline=x_top_p,\n )\n out = run_rnn([token], newline_adj=newline_adj)\n xxx = tokenizer.tokenizer.decode(model_tokens[out_last:])\n if '\\ufffd' not in xxx:\n print(xxx, end='', flush=True)\n out_last = begin + i + 1\n send_msg = tokenizer.tokenizer.decode(model_tokens[begin:])\n if '\\n\\n' in send_msg:\n send_msg = send_msg.strip()\n break\n # send_msg = tokenizer.tokenizer.decode(model_tokens[begin:]).strip()\n # if send_msg.endswith(f'{user}{interface}'): # warning: needs to fix state too !!!\n # send_msg = send_msg[:-len(f'{user}{interface}')].strip()\n # break\n # if send_msg.endswith(f'{bot}{interface}'):\n # send_msg = send_msg[:-len(f'{bot}{interface}')].strip()", + "type": "code", + "location": "/RWKV-v4neo/chat.py:320-344" + }, + "609": { + "file_id": 32, + "content": "This code is responsible for generating text from a model, tokenizing the output, and printing it until a newline or specified context length is reached. It also handles breaking the loop when encountering a double newline or specific user/bot messages.", + "type": "comment" + }, + "610": { + "file_id": 32, + "content": " # break\n # print(f'{model_tokens}')\n # print(f'[{tokenizer.tokenizer.decode(model_tokens)}]')\n # print(f'### send ###\\n[{send_msg}]')\n # reply_msg(send_msg)\n save_all_stat(srv, 'chat', out)\nprint(HELP_MSG)\nwhile True:\n msg = input(f'{user}{interface} ')\n if len(msg.strip()) > 0:\n on_message(msg)\n else:\n print('Erorr: please say something')", + "type": "code", + "location": "/RWKV-v4neo/chat.py:345-361" + }, + "611": { + "file_id": 32, + "content": "The code handles user input, continuously prompts the user for messages, passes them to a function `on_message()`, and saves chat data if necessary. 
If no valid input is provided, it prints an error message.", + "type": "comment" + }, + "612": { + "file_id": 33, + "content": "/RWKV-v4neo/cuda/wkv5_cuda.cu", + "type": "filepath" + }, + "613": { + "file_id": 33, + "content": "This CUDA code optimizes neural network forward pass with shared memory, efficient matrix operations, and parallel computation. It performs convolution using kernel functions, shared memory, synchronization, and unrolled loops. Assertions ensure efficient GPU computation.", + "type": "summary" + }, + "614": { + "file_id": 33, + "content": "#include\n#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C, const int H,\n const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const F *__restrict__ _u,\n F *__restrict__ const _y)\n{\n const int b = blockIdx.x / H;\n const int h = blockIdx.x % H;\n const int i = threadIdx.x;\n _w += h*_N_;\n _u += h*_N_;\n __shared__ float r[_N_], k[_N_], u[_N_], w[_N_];\n float state[_N_] = {0};\n __syncthreads();\n w[i] = _w[i];\n u[i] = float(_u[i]);\n __syncthreads();\n for (int t = b*T*C + h*_N_ + i; t < (b+1)*T*C + h*_N_ + i; t += C)\n {\n __syncthreads();\n r[i] = float(_r[t]);\n k[i] = float(_k[t]);\n __syncthreads();\n const float v = float(_v[t]);\n float y = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j+=4)", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:1-36" + }, + "615": { + "file_id": 33, + "content": "Code implements a CUDA kernel for the forward pass of a neural network layer, where each thread calculates output values based on input data and pre-stored parameters. It uses shared memory to store intermediate results and synchronizes threads with `__syncthreads()`. 
The loop iterates over time steps, applying element-wise operations to calculate output values.", + "type": "comment" + }, + "616": { + "file_id": 33, + "content": " {\n const float4& r_ = (float4&)(r[j]);\n const float4& k_ = (float4&)(k[j]);\n const float4& w_ = (float4&)(w[j]);\n const float4& u_ = (float4&)(u[j]);\n float4& s = (float4&)(state[j]);\n float4 x;\n x.x = k_.x * v;\n x.y = k_.y * v;\n x.z = k_.z * v;\n x.w = k_.w * v;\n y += r_.x * (u_.x * x.x + s.x);\n y += r_.y * (u_.y * x.y + s.y);\n y += r_.z * (u_.z * x.z + s.z);\n y += r_.w * (u_.w * x.w + s.w);\n s.x = s.x * w_.x + x.x;\n s.y = s.y * w_.y + x.y;\n s.z = s.z * w_.z + x.z;\n s.w = s.w * w_.w + x.w;\n }\n _y[t] = F(y);\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C, const int H,\n const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const float *__restrict__ __w, const F *__restrict__ _u, const F *__restrict__ const _gy,", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:37-66" + }, + "617": { + "file_id": 33, + "content": "This code is performing a matrix multiplication operation using CUDA. It takes in four input matrices, calculates the dot product between two sets of vectors, and updates the state vector accordingly. 
The result is then passed to a function F for further processing.", + "type": "comment" + }, + "618": { + "file_id": 33, + "content": " F *__restrict__ const _gr, F *__restrict__ const _gk, F *__restrict__ const _gv, F *__restrict__ const _gw, F *__restrict__ const _gu)\n{\n const int b = blockIdx.x / H;\n const int h = blockIdx.x % H;\n const int i = threadIdx.x;\n _w += h*_N_;\n _u += h*_N_;\n __w += h*_N_;\n __shared__ float w_[_N_], u_[_N_];\n __shared__ float r[_N_], k[_N_], v[_N_], gy[_N_];\n __syncthreads();\n w_[i] = _w[i];\n u_[i] = float(_u[i]);\n __syncthreads();\n const float w = w_[i];\n const float ww = __w[i];\n const float u = u_[i];\n float state[_N_] = {0}, saaaa[_N_] = {0}, sbbbb[_N_] = {0}, scccc[_N_] = {0}, sdddd[_N_] = {0};\n float gw = 0, gu = 0;\n const int t000 = b*T*C + h*_N_ + i;\n const int t111 = (b+1)*T*C + h*_N_ + i;\n const int t222 = t111 - 2*C;\n for (int t = t000; t < t111; t += C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t]);\n __syncthreads();\n const float k = float(_k[t]);\n float gr = 0, gu_ = 0;\n #pragma unroll", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:67-104" + }, + "619": { + "file_id": 33, + "content": "This function calculates the recurrent weight updates in a neural network using CUDA. It uses shared memory for efficient parallel computation and synchronizes threads with `__syncthreads()`. The variables `w`, `u` represent input and output tensors, while `v` and `gy` store intermediate results. 
The loop iterates over the time dimension (T) and channel dimension (C).", + "type": "comment" + }, + "620": { + "file_id": 33, + "content": " for (int j = 0; j < _N_; j++)\n {\n float& s = state[j];\n float x = k * v[j];\n gr += (u * x + s) * gy[j];\n gu_ += x * gy[j];\n s = s * w + x;\n }\n _gr[t] = F(gr);\n gu += float(_r[t]) * gu_;\n }\n _gu[b*C + h*_N_ + i] = F(gu);\n for (int t = t000; t < t222; t += C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t + 2*C]);\n __syncthreads();\n const float k = float(_k[t]);\n float gw_ = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = saaaa[j];\n float& s2 = sbbbb[j];\n float x = k * v[j];\n float tmp = w * (x + s);\n s = tmp;\n s2 = tmp + w * s2;\n gw_ += s2 * gy[j];\n }\n gw += float(_r[t + 2*C]) * gw_;\n } \n _gw[b*C + h*_N_ + i] = F(ww * gw);\n for (int t = t111 - C; t >= t000; t -= C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t]);", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:105-149" + }, + "621": { + "file_id": 33, + "content": "This code appears to be performing element-wise operations on arrays, such as adding and multiplying elements. It uses a for loop to iterate through the array elements, with each iteration updating the state variables and accumulating results for later use. The code also includes synchronization points (__syncthreads) for thread coordination, and it utilizes arrays with names like _gr, _gu, and _gw for storing intermediate and final results. 
Overall, this code seems to be part of a larger computation involving array manipulation and synchronization.", + "type": "comment" + }, + "622": { + "file_id": 33, + "content": " __syncthreads();\n const float rr = float(_r[t]);\n float gk = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = scccc[j];\n float x = rr * gy[j];\n gk += (u * x + s) * v[j];\n s = x + s * w;\n }\n _gk[t] = F(gk);\n }\n for (int t = t111 - C; t >= t000; t -= C)\n {\n __syncthreads();\n r[i] = float(_r[t]);\n k[i] = float(_k[t]);\n __syncthreads();\n const float gyy = float(_gy[t]);\n float gv = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = sdddd[j];\n float x = gyy * r[j];\n gv += (u_[j] * x + s) * k[j];\n s = x + s * w_[j];\n }\n _gv[t] = F(gv);\n }\n}\nvoid cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y)\n{\n assert(H*_N_ == C);\n assert(_N_%4 == 0);\n kernel_forward<< >>(B, T, C, H, r, k, v, w, u, y);\n}\nvo", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:150-197" + }, + "623": { + "file_id": 33, + "content": "The code is performing a convolution operation in CUDA. It first executes a kernel function that processes the data in blocks and threads, then calls another kernel function to compute the final results. The code uses shared memory, synchronization, and unrolling of loops for efficient computation. 
The `cuda_forward` function initializes the necessary variables and launches the kernel functions with appropriate dimensions.", + "type": "comment" + }, + "624": { + "file_id": 33, + "content": "id cuda_backward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, float *ww, bf16 *u, bf16 *gy, bf16 *gr, bf16 *gk, bf16 *gv, bf16 *gw, bf16 *gu)\n{\n assert(H*_N_ == C);\n assert(_N_%4 == 0);\n kernel_backward<< >>(B, T, C, H, r, k, v, w, ww, u, gy, gr, gk, gv, gw, gu);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:197-202" + }, + "625": { + "file_id": 33, + "content": "Function `cuda_backward` is a CUDA kernel launcher, taking input/output parameters and invoking the `kernel_backward` kernel with appropriate grid and block dimensions. The assertions ensure correct memory layouts and alignment for efficient GPU computation.", + "type": "comment" + }, + "626": { + "file_id": 34, + "content": "/RWKV-v4neo/cuda/wkv5_op.cpp", + "type": "filepath" + }, + "627": { + "file_id": 34, + "content": "This C++ code implements forward and backward neural network operations using PyTorch tensors, optimized for CUDA execution. 
It includes functions for BFloat16 data type, with Python module \"wkv5\" for forward and backward operations.", + "type": "summary" + }, + "628": { + "file_id": 34, + "content": "#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\nvoid cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y);\nvoid cuda_backward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, float *ww, bf16 *u, bf16 *gy, bf16 *gr, bf16 *gk, bf16 *gv, bf16 *gw, bf16 *gu);\nvoid forward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {\n cuda_forward(B, T, C, H, r.data_ptr (), k.data_ptr (), v.data_ptr (), w.data_ptr (), u.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &ww, torch::Tensor &u, torch::Tensor &gy, torch::Tensor &gr, torch::Tensor &gk, torch::Tensor &gv, torch::Tensor &gw, torch::Tensor &gu) {\n cuda_backward(B, T, C, H, r.data_ptr (), k.data_ptr", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_op.cpp:1-12" + }, + "629": { + "file_id": 34, + "content": "This code is a C++ implementation of forward and backward passes for an unknown neural network operation. It includes functions `cuda_forward` and `cuda_backward`, which are called by the corresponding `forward` and `backward` wrapper functions. The wrapper functions handle memory allocation, type conversion, and data pointers for PyTorch tensors. 
The code uses BFloat16 as the floating-point data type and is optimized for CUDA execution.", + "type": "comment" + }, + "630": { + "file_id": 34, + "content": " (), v.data_ptr (), w.data_ptr (), ww.data_ptr (), u.data_ptr (), gy.data_ptr (), gr.data_ptr (), gk.data_ptr (), gv.data_ptr (), gw.data_ptr (), gu.data_ptr ());\n}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv5 forward\");\n m.def(\"backward\", &backward, \"wkv5 backward\");\n}\nTORCH_LIBRARY(wkv5, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_op.cpp:12-22" + }, + "631": { + "file_id": 34, + "content": "This code defines a Python module for the \"wkv5\" function, which includes forward and backward operations. The variables declared represent different data pointers used in these operations.", + "type": "comment" + }, + "632": { + "file_id": 35, + "content": "/RWKV-v4neo/cuda/wkv_cuda.cu", + "type": "filepath" + }, + "633": { + "file_id": 35, + "content": "The code performs matrix multiplication, computes gradients for neural network backward pass, and optimally initializes variables for CUDA implementation of RWKV model's forward and backward passes using efficient execution configuration.", + "type": "summary" + }, + "634": { + "file_id": 35, + "content": "#include \n#include \n#define MIN_VALUE (-1e38)\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,\n F *__restrict__ const _y) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n F *__restrict__ const y = _y + _offset;\n // aa and bb are running sums 
divided by exp(pp) (to avoid overflow)\n F aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const F kk = k[ii];\n const F vv = v[ii];\n F ww = u + kk;\n F p = max(pp, ww);\n F e1 = exp(pp - p);\n F e2 = exp(ww - p);", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda.cu:1-31" + }, + "635": { + "file_id": 35, + "content": "Kernel function for matrix multiplication with accumulation of running sums and avoiding overflow by dividing the sums by exp(pp).", + "type": "comment" + }, + "636": { + "file_id": 35, + "content": " y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2);\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,\n const F *__restrict__ const _y, const F *__restrict__ const _gy,\n F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, F *__restrict__ const _gv) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n const F *__restrict__ const y = _y + _offset;", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda.cu:32-58" + }, + "637": { + "file_id": 35, + "content": "This code performs a matrix multiplication and subsequent calculations to compute gradients for the backward pass in a neural network. The kernel function takes input weights, input activations, output activations, and their respective gradients as inputs, and computes gradients for the output weights and input weights. 
The comments should reflect this understanding of what the code is doing.", + "type": "comment" + }, + "638": { + "file_id": 35, + "content": " const F *__restrict__ const gy = _gy + _offset;\n F *__restrict__ const gk = _gk + _offset;\n F *__restrict__ const gv = _gv + _offset;\n F q[Tmax], r[Tmax];\n F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const F kk = k[ii];\n const F vv = v[ii];\n const F yy = y[ii];\n F ww = u + kk;\n F p = max(pp, ww);\n F e1 = exp(pp - p);\n F e2 = exp(ww - p);\n const F qq = gy[ii] / (e1 * bb + e2);\n gw += (ga - gb * yy) * e1 * qq;\n gu += (vv - yy) * e2 * qq;\n q[i] = qq;\n r[i] = ww - p;\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n ga = e1 * (aa + ga);\n gb = e1 * (bb + gb);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n const int _offsetBC = _b * C + _c;\n _gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward()\n _gu[_offsetBC] = gu;", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda.cu:59-94" + }, + "639": { + "file_id": 35, + "content": "This code segment initializes variables and iterates over the data to calculate values for gw, gu, q, and r. It then assigns these calculated values to their respective locations in memory. 
The multiplication by w is because of the transformation from forward pass in python.", + "type": "comment" + }, + "640": { + "file_id": 35, + "content": " aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = T - 1; i >= 0; i--) {\n const int ii = i * C;\n const F kk = k[ii];\n const F vv = v[ii];\n const F yy = y[ii];\n const F qq = q[i];\n const F rr = r[i];\n F e1 = qq * exp(rr);\n F e2 = exp(kk + pp);\n gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb);\n gv[ii] = e1 + e2 * aa;\n const F ww = w + pp;\n const F www = rr - u - kk;\n const F p = max(ww, www);\n e1 = exp(ww - p);\n e2 = qq * exp(www - p);\n aa = e1 * aa + e2;\n bb = e1 * bb - e2 * yy;\n pp = p;\n }\n}\nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_forward<< >>(B, T, C, w, u, k, v, y);\n}\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) {", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda.cu:96-128" + }, + "641": { + "file_id": 35, + "content": "The code initializes variables and performs a matrix operation for the CUDA implementation of RWKV model's forward and backward passes. The function `cuda_forward` launches a CUDA kernel that processes each block with 32 threads, while `cuda_backward` computes gradients for the same matrix operations as the forward pass, storing them in `gw`, `gu`, `gk`, `gv`. The computation involves element-wise multiplication and addition using `e1` and `e2`, respectively. 
The function `cuda_forward` requires `--maxrregcount 60` for optimal performance due to register usage.", + "type": "comment" + }, + "642": { + "file_id": 35, + "content": " dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_backward<< >>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda.cu:129-133" + }, + "643": { + "file_id": 35, + "content": "The code sets up the execution configuration for a CUDA kernel function named 'kernel_backward'. It determines the optimal number of threads per block (min(C, 32)) and ensures that the number of blocks is proportional to B*C, ensuring even distribution of tasks. It then launches the kernel function on the specified number of blocks and threads.", + "type": "comment" + }, + "644": { + "file_id": 36, + "content": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu", + "type": "filepath" + }, + "645": { + "file_id": 36, + "content": "This code performs a forward pass of a neural network on GPU with BFloat16 data type and softmax cross-entropy loss, calculating gradients using fixed-point arrays in CUDA. 
It defines two functions, cuda_forward and cuda_backward, which use CUDA to perform matrix operations on GPU.", + "type": "summary" + }, + "646": { + "file_id": 36, + "content": "#include \n#include \n#include \"ATen/ATen.h\"\n#define MIN_VALUE (-1e38)\ntypedef at::BFloat16 bf16;\n__global__ void kernel_forward(const int B, const int T, const int C,\n const float *__restrict__ const _w, const bf16 *__restrict__ const _u, const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,\n bf16 *__restrict__ const _y) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n float u = float(_u[_c]);\n float w = _w[_c];\n const bf16 *__restrict__ const k = _k + _offset;\n const bf16 *__restrict__ const v = _v + _offset;\n bf16 *__restrict__ const y = _y + _offset;\n // aa and bb are running sums divided by exp(pp) (to avoid overflow)\n float aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const float kk = float(k[ii]);\n const float vv = float(v[ii]);", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:1-26" + }, + "647": { + "file_id": 36, + "content": "This function implements the forward pass of a neural network operation using BFloat16 (bf16) data type on GPU. It takes input dimensions and pointers to weights, inputs, keys, and values arrays as parameters. 
The function then performs element-wise multiplications, accumulates results, and stores the result in an output array.", + "type": "comment" + }, + "648": { + "file_id": 36, + "content": " float ww = u + kk;\n float p = max(pp, ww);\n float e1 = exp(pp - p);\n float e2 = exp(ww - p);\n y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2));\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n}\n__global__ void kernel_backward(const int B, const int T, const int C,\n const float *__restrict__ const _w, const bf16 *__restrict__ const _u, const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,\n const bf16 *__restrict__ const _y, const bf16 *__restrict__ const _gy,\n bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu, bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n float u = float(_u[_c]);", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:28-53" + }, + "649": { + "file_id": 36, + "content": "Calculates gradients for weight, input, and kernel arrays using backward pass with softmax cross-entropy loss. Batch size, sequence length, number of channels, weight, input, kernel arrays, output gradients, and gradients for each array are passed as arguments to the kernel function. 
Gradient computation is performed per element in the arrays.", + "type": "comment" + }, + "650": { + "file_id": 36, + "content": " float w = _w[_c];\n const bf16 *__restrict__ const k = _k + _offset;\n const bf16 *__restrict__ const v = _v + _offset;\n const bf16 *__restrict__ const y = _y + _offset;\n const bf16 *__restrict__ const gy = _gy + _offset;\n bf16 *__restrict__ const gk = _gk + _offset;\n bf16 *__restrict__ const gv = _gv + _offset;\n float q[Tmax], r[Tmax];\n float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const float kk = float(k[ii]);\n const float vv = float(v[ii]);\n const float yy = float(y[ii]);\n float ww = u + kk;\n float p = max(pp, ww);\n float e1 = exp(pp - p);\n float e2 = exp(ww - p);\n const float qq = float(gy[ii]) / (e1 * bb + e2);\n gw += (ga - gb * yy) * e1 * qq;\n gu += (vv - yy) * e2 * qq;\n q[i] = qq;\n r[i] = ww - p;\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n ga = e1 * (aa + ga);", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:54-85" + }, + "651": { + "file_id": 36, + "content": "The code is calculating the softmax and element-wise multiplication of input vectors for matrix multiplication using bf16 data type in CUDA. 
It initializes variables, performs calculations using exp() and max(), stores results in q and r arrays, and updates gw and gu variables.", + "type": "comment" + }, + "652": { + "file_id": 36, + "content": " gb = e1 * (bb + gb);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n const int _offsetBC = _b * C + _c;\n _gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward()\n _gu[_offsetBC] = bf16(gu);\n aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = T - 1; i >= 0; i--) {\n const int ii = i * C;\n const float kk = float(k[ii]);\n const float vv = float(v[ii]);\n const float yy = float(y[ii]);\n const float qq = q[i];\n const float rr = r[i];\n float e1 = qq * exp(rr);\n float e2 = exp(kk + pp);\n gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb));\n gv[ii] = bf16(e1 + e2 * aa);\n const float ww = w + pp;\n const float www = rr - u - kk;\n const float p = max(ww, www);\n e1 = exp(ww - p);\n e2 = qq * exp(www - p);\n aa = e1 * aa + e2;\n bb = e1 * bb - e2 * yy;\n pp = p;\n }\n}\nvoid cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) {", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:86-120" + }, + "653": { + "file_id": 36, + "content": "This code computes the forward pass of a neural network using CUDA for efficient GPU computation. The input includes batch size B, time steps T, channels C, and floating-point w parameter, along with fixed-point u, k, v, and y arrays. 
It initializes gk and gv arrays and performs element-wise computations to calculate the gradients.", + "type": "comment" + }, + "654": { + "file_id": 36, + "content": " dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_forward<< >>(B, T, C, w, u, k, v, y);\n}\nvoid cuda_backward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_backward<< >>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:121-132" + }, + "655": { + "file_id": 36, + "content": "This code defines two functions, cuda_forward and cuda_backward, which use CUDA to perform a matrix operation on a GPU. The function sets the number of threads per block based on the C dimension and ensures optimal performance by setting --maxrregcount 60. It also asserts that B * C is divisible by threadsPerBlock.x to ensure even distribution of tasks among blocks, then assigns the total number of blocks accordingly. 
Finally, it calls a kernel function with these parameters.", + "type": "comment" + }, + "656": { + "file_id": 37, + "content": "/RWKV-v4neo/cuda/wkv_op.cpp", + "type": "filepath" + }, + "657": { + "file_id": 37, + "content": "This code defines CUDA functions wrapped with Torch extension for PyTorch's forward and backward passes, implemented as a PyBind11 module for seamless integration.", + "type": "summary" + }, + "658": { + "file_id": 37, + "content": "#include \nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y);\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv);\nvoid forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {\n cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {\n cuda_backward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr (), gy.data_ptr (), gw.data_ptr (), gu.data_ptr (), gk.data_ptr (), gv.data_ptr ());", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_op.cpp:1-10" + }, + "659": { + "file_id": 37, + "content": "This code defines two functions, `forward` and `backward`, which perform the forward and backward passes of a computation. These functions are implemented in CUDA and wrapped with Torch extension for seamless integration with PyTorch. 
The `forward` function takes in Tensor inputs and calls the CUDA `cuda_forward` function to perform the computation on GPU, while `backward` performs the backward pass using the corresponding CUDA function.", + "type": "comment" + }, + "660": { + "file_id": 37, + "content": "}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv forward\");\n m.def(\"backward\", &backward, \"wkv backward\");\n}\nTORCH_LIBRARY(wkv, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_op.cpp:11-21" + }, + "661": { + "file_id": 37, + "content": "This code defines a PyBind11 module for the Torch library, named \"TORCH_EXTENSION_NAME\". It includes forward and backward functions with their respective definitions. The module is then linked to the library using TORCH_LIBRARY.", + "type": "comment" + }, + "662": { + "file_id": 38, + "content": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp", + "type": "filepath" + }, + "663": { + "file_id": 38, + "content": "The code defines CUDA functions for forward and backward torch Tensor computations, utilizing BF16 data type. 
Python bindings are created for these functions.", + "type": "summary" + }, + "664": { + "file_id": 38, + "content": "#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\nvoid cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y);\nvoid cuda_backward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv);\nvoid forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {\n cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y,\n torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {\n cuda_backward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr (),\n gy.data_ptr (), gw.data_ptr (), gu.data_ptr (), gk.data_ptr (), gv.data_ptr ());", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp:1-14" + }, + "665": { + "file_id": 38, + "content": "This code defines a forward and backward function for torch Tensors using CUDA. The forward function calls a CUDA kernel function to compute the output tensor y based on inputs w, u, k, v. The backward function also calls a CUDA kernel function to compute gradients for w, u, k, v, gw, gu, gk, gv. 
BF16 (BFloat16) is used as a data type for some tensor operations.", + "type": "comment" + }, + "666": { + "file_id": 38, + "content": "}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv forward\");\n m.def(\"backward\", &backward, \"wkv backward\");\n}\nTORCH_LIBRARY(wkv, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp:15-25" + }, + "667": { + "file_id": 38, + "content": "Defining Python bindings for the forward and backward functions of the wkv extension module.", + "type": "comment" + }, + "668": { + "file_id": 39, + "content": "/RWKV-v4neo/img_demoAE.py", + "type": "filepath" + }, + "669": { + "file_id": 39, + "content": "This code imports libraries and defines a function ToBinary for converting images to binary format. It includes an R_ENCODER class with layers for image processing and backward operation, as well as forward pass and neural network class for image decoding. 
The code also defines a neural network model for image generation using convolution layers and Mish activation.", + "type": "summary" + }, + "670": { + "file_id": 39, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport torch, types, os\nimport numpy as np\nfrom PIL import Image\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport torchvision as vision\nimport torchvision.transforms as transforms\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nprint(f'loading...')\n########################################################################################################\nmodel_prefix = 'test/image_trained/out-v7c_d8_256-224-13bit-OB32x0.5-201'\ninput_img = 'test/img_ae_test/test0.png'\n########################################################################################################\nclass ToBinary(torch.autograd.Function):\n @staticmethod\n def forward(ctx, x):\n return torch.floor(x + 0.5) # no need for noise when we have plenty of data", + "type": "code", + "location": "/RWKV-v4neo/img_demoAE.py:1-25" + }, + "671": { + "file_id": 39, + "content": "This code imports necessary libraries and defines a function called ToBinary for converting input images to binary format. 
It uses RWKV Language Model and provides model_prefix and input_img variables for further processing.", + "type": "comment" + }, + "672": { + "file_id": 39, + "content": " @staticmethod\n def backward(ctx, grad_output):\n return grad_output.clone() # pass-through\nclass R_ENCODER(nn.Module):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.Bxx = nn.BatchNorm2d(dd*64)\n self.CIN = nn.Conv2d(3, dd, kernel_size=3, padding=1)\n self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*4)\n self.C00 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)", + "type": "code", + "location": "/RWKV-v4neo/img_demoAE.py:27-52" + }, + "673": { + "file_id": 39, + "content": "This code defines a class named R_ENCODER inheriting from nn.Module, which includes several convolutional and batch normalization layers for image processing or feature extraction. 
The backward function is defined as a pass-through operation for gradient computation during backpropagation.", + "type": "comment" + }, + "674": { + "file_id": 39, + "content": " self.B20 = nn.BatchNorm2d(dd*64)\n self.C20 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C23 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd*64, args.my_img_bit, kernel_size=3, padding=1)\n def forward(self, img):\n ACT = F.mish\n x = self.CIN(img)\n xx = self.Bxx(F.pixel_unshuffle(x, 8))\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n x = self.COUT(x + xx)\n return torch.sigmoid(x)", + "type": "code", + "location": "/RWKV-v4neo/img_demoAE.py:54-82" + }, + "675": { + "file_id": 39, + "content": "This code defines a forward pass function for a neural network layer. It applies various convolutions and batch normalization to input image 'img' after passing it through several activation functions, including Mish. 
The final result is passed through a sigmoid function before being returned.", + "type": "comment" + }, + "676": { + "file_id": 39, + "content": "class R_DECODER(nn.Module):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.CIN = nn.Conv2d(args.my_img_bit, dd*64, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*64)\n self.C00 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.B20 = nn.BatchNorm2d(dd*4)\n self.C20 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)", + "type": "code", + "location": "/RWKV-v4neo/img_demoAE.py:84-106" + }, + "677": { + "file_id": 39, + "content": "This code defines a class \"R_DECODER\" that inherits from the PyTorch's `nn.Module` and consists of multiple convolutional layers and batch normalization layers for image decoding. 
The class takes an argument \"args\", which contains information such as the number of image channels, and the kernel size of convolutions.", + "type": "comment" + }, + "678": { + "file_id": 39, + "content": " self.C23 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd, 3, kernel_size=3, padding=1)\n def forward(self, code):\n ACT = F.mish\n x = self.CIN(code)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = self.COUT(x)\n return torch.sigmoid(x)\n########################################################################################################\nprint(f'building model...')\nargs = types.SimpleNamespace()\nargs.my_img_bit = 13\nencoder = R_ENCODER(args).eval().cuda()", + "type": "code", + "location": "/RWKV-v4neo/img_demoAE.py:107-139" + }, + "679": { + "file_id": 39, + "content": "This code defines a neural network model for image generation. It has multiple convolution layers and uses the Mish activation function. The model takes an input code, performs several operations with different convolution layers and pixel shuffling, and outputs a final tensor. 
The code also builds the model using given arguments.", + "type": "comment" + }, + "680": { + "file_id": 39, + "content": "decoder = R_DECODER(args).eval().cuda()\nzpow = torch.tensor([2**i for i in range(0,13)]).reshape(13,1,1).cuda().long()\nencoder.load_state_dict(torch.load(f'{model_prefix}-E.pth'))\ndecoder.load_state_dict(torch.load(f'{model_prefix}-D.pth'))\n########################################################################################################\nprint(f'test image...')\nimg_transform = transforms.Compose([\n transforms.PILToTensor(),\n transforms.ConvertImageDtype(torch.float),\n transforms.Resize((224, 224))\n])\nwith torch.no_grad():\n img = img_transform(Image.open(input_img)).unsqueeze(0).cuda()\n z = encoder(img)\n z = ToBinary.apply(z)\n zz = torch.sum(z.squeeze().long() * zpow, dim=0)\n print(f'Code shape = {zz.shape}\\n{zz.cpu().numpy()}\\n')\n out = decoder(z)\n vision.utils.save_image(out, f\"{input_img.split('.')[0]}-out-13bit.jpg\")", + "type": "code", + "location": "/RWKV-v4neo/img_demoAE.py:140-165" + }, + "681": { + "file_id": 39, + "content": "The code is loading a pre-trained encoder and decoder model, applying image transformations, and generating an output image using the decoder. It then saves the output image in a specific format. The code also prints the shape and values of a tensor after performing some operations on it.", + "type": "comment" + }, + "682": { + "file_id": 40, + "content": "/RWKV-v4neo/math_demo/run.py", + "type": "filepath" + }, + "683": { + "file_id": 40, + "content": "The code introduces RWKV, a deep learning NLP model with tokenization classes and RWKV_RNN functions. 
It initializes the model, loads checkpoints, optimizes performance using layer norm, time-dependent mixing, and sigmoid activation functions, generates context, predicts next character based on model output, and uses layer norm and time-mixing operations for processing.", + "type": "summary" + }, + "684": { + "file_id": 40, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nimport types, torch\nfrom torch.nn import functional as F\n# only + - *\nequation = \"4.2379*564.778-1209.01\" # 1184.4626862\n# equation = \"32731423*2189286\" # 71658446133978\n# equation = \"18239.715*9.728263\" # 177440.744565045\n# equation = \"2067*9832*4549\" # 92448162456\n########################################################################################################\nargs = types.SimpleNamespace()\nargs.MODEL_NAME = 'rwkv-200'\nargs.n_layer = 6\nargs.n_embd = 192\nclass TOKENIZER():\n def __init__(self):\n self.word_table = {\"0\": \"\\n\", \"1\": \" \", \"2\": \"(\", \"3\": \")\", \"4\": \"*\", \"5\": \"+\", \"6\": \"-\", \"7\": \".\", \"8\": \"0\", \"9\": \"1\", \"10\": \"2\", \"11\": \"3\", \"12\": \"4\", \"13\": \"5\", \"14\": \"6\", \"15\": \"7\", \"16\": \"8\", \"17\": \"9\", \"18\": \"=\", \"19\": \"e\", \"20\": \"f\"}", + "type": "code", + "location": "/RWKV-v4neo/math_demo/run.py:1-25" + }, + "685": { + "file_id": 40, + "content": "This code is a part of the RWKV Language Model, which is a deep learning model for natural language processing. The code snippet defines an equation and tokenizes it using a word table to map numbers and symbols to corresponding strings. 
It also initializes an object with a word_table dictionary and sets up arguments such as MODEL_NAME, n_layer, and n_embd.", + "type": "comment" + }, + "686": { + "file_id": 40, + "content": " self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n def encode(self, x):\n return [self.stoi[t] for t in x]\n def decode(self, x):\n return ''.join([self.itos[t] for t in x])\ntokenizer = TOKENIZER()\n########################################################################################################\nclass RWKV_RNN(torch.jit.ScriptModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n self.eval() # set torch to inference mode\n w = torch.load(args.MODEL_NAME + '.pth', map_location='cpu')\n for k in w.keys():\n if '.time_' in k: w[k] = w[k].squeeze()\n if '.time_decay' in k: w[k] = -torch.exp(w[k].float()) # the real time decay is like e^{-e^x}\n else: w[k] = w[k].float() # convert to f32 type\n self.w = types.SimpleNamespace() # set self.w from w\n self.w.blocks = {}", + "type": "code", + "location": "/RWKV-v4neo/math_demo/run.py:26-53" + }, + "687": { + "file_id": 40, + "content": "This code defines a class for RWKV tokenization and another class for RWKV_RNN. The RWKV_RNN class initializes with arguments, loads the model from a checkpoint file, and adjusts some parameters as needed. It sets the model to inference mode using eval() function and converts certain parameters to floating point type. 
The code also includes functions for tokenization: encode and decode which convert tokens to integers and integers back to tokens respectively.", + "type": "comment" + }, + "688": { + "file_id": 40, + "content": " for k in w.keys(): # example: \"blocks.0.att.time_first\" => self.w.blocks[0].att.time_first\n parts = k.split('.')\n last = parts.pop()\n here = self.w\n for p in parts:\n if p.isdigit():\n p = int(p)\n if p not in here: here[p] = types.SimpleNamespace()\n here = here[p]\n else:\n if not hasattr(here, p): setattr(here, p, types.SimpleNamespace())\n here = getattr(here, p)\n setattr(here, last, w[k])\n def layer_norm(self, x, w):\n return F.layer_norm(x, (self.args.n_embd,), weight=w.weight, bias=w.bias)\n @torch.jit.script_method\n def channel_mixing(self, x, state, i:int, time_mix_k, time_mix_r, kw, vw, rw):\n xk = x * time_mix_k + state[5*i+0] * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0] * (1 - time_mix_r)\n state[5*i+0] = x\n r = torch.sigmoid(rw @ xr)\n k = torch.square(torch.relu(kw @ xk)) # square relu, primer paper", + "type": "code", + "location": "/RWKV-v4neo/math_demo/run.py:54-77" + }, + "689": { + "file_id": 40, + "content": "This code is defining a function called \"channel_mixing\" which performs channel-wise mixing and normalization on input tensor \"x\". It uses layer norm, time-wise mixing, and sigmoid activation functions. The function also updates the state variable for future iterations. 
The code organizes the weights in a specific way and utilizes torch script method to optimize performance.", + "type": "comment" + }, + "690": { + "file_id": 40, + "content": " return r * (vw @ k)\n @torch.jit.script_method\n def time_mixing(self, x, state, i:int, time_mix_k, time_mix_v, time_mix_r, time_first, time_decay, kw, vw, rw, ow):\n xk = x * time_mix_k + state[5*i+1] * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1] * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1] * (1 - time_mix_r)\n state[5*i+1] = x\n r = torch.sigmoid(rw @ xr)\n k = kw @ xk\n v = vw @ xv\n aa = state[5*i+2]\n bb = state[5*i+3]\n pp = state[5*i+4]\n ww = time_first + k\n qq = torch.maximum(pp, ww)\n e1 = torch.exp(pp - qq)\n e2 = torch.exp(ww - qq)\n a = e1 * aa + e2 * v\n b = e1 * bb + e2\n wkv = a / b\n ww = pp + time_decay\n qq = torch.maximum(ww, k)\n e1 = torch.exp(ww - qq)\n e2 = torch.exp(k - qq)\n state[5*i+2] = e1 * aa + e2 * v\n state[5*i+3] = e1 * bb + e2\n state[5*i+4] = qq\n return ow @ (r * wkv)\n def forward(self, token, state):", + "type": "code", + "location": "/RWKV-v4neo/math_demo/run.py:78-109" + }, + "691": { + "file_id": 40, + "content": "This code performs time-dependent mixing and computes the output for each step of a recurrent neural network. The `time_mixing` method updates hidden states, applies transformations based on time constants, and calculates the weighted sum of inputs using dot products. 
The `forward` method takes input tokens and current state as arguments to perform forward pass calculations.", + "type": "comment" + }, + "692": { + "file_id": 40, + "content": " with torch.no_grad():\n if state == None:\n state = torch.zeros(self.args.n_layer * 5, self.args.n_embd)\n for i in range(self.args.n_layer): state[5*i+4] = -1e30 # -infinity\n x = self.w.emb.weight[token]\n x = self.layer_norm(x, self.w.blocks[0].ln0)\n for i in range(self.args.n_layer):\n att = self.w.blocks[i].att\n x = x + self.time_mixing(self.layer_norm(x, self.w.blocks[i].ln1), state, i, \n att.time_mix_k, att.time_mix_v, att.time_mix_r, att.time_first, att.time_decay, \n att.key.weight, att.value.weight, att.receptance.weight, att.output.weight)\n ffn = self.w.blocks[i].ffn\n x = x + self.channel_mixing(self.layer_norm(x, self.w.blocks[i].ln2), state, i, \n ffn.time_mix_k, ffn.time_mix_r, \n ffn.key.weight, ffn.value.weight, ffn.receptance.weight)\n x = self.w.head.weight @ self.layer_norm(x, self.w.ln_out)", + "type": "code", + "location": "/RWKV-v4neo/math_demo/run.py:110-127" + }, + "693": { + "file_id": 40, + "content": "Iterates over layers, applies time-mixing and channel-mixing operations, layer norm, and final weighted operation.", + "type": "comment" + }, + "694": { + "file_id": 40, + "content": " return x.float(), state\n##########################################################################################################\nprint(f'\\nUsing CPU. 
Loading {args.MODEL_NAME} ...')\nmodel = RWKV_RNN(args)\ncontext = \"\\n\" + equation.strip().replace(' ','') + \"=\"\nprint(context, f'(python answer {eval(equation)})')\nstate = None\nfor token in tokenizer.encode(context):\n out, state = model.forward(token, state)\nfor i in range(4096):\n token = int(torch.argmax(out))\n tmp = tokenizer.decode([token])\n print(tmp, end=\"\", flush=True)\n if tmp == '\\n':\n break\n out, state = model.forward(token, state) \nprint()", + "type": "code", + "location": "/RWKV-v4neo/math_demo/run.py:128-150" + }, + "695": { + "file_id": 40, + "content": "This code loads an RWKV_RNN model, generates context using provided equation and tokenizes it. It then iterates through tokens, predicting the next character based on the model's output, until a newline is predicted.", + "type": "comment" + }, + "696": { + "file_id": 41, + "content": "/RWKV-v4neo/run.py", + "type": "filepath" + }, + "697": { + "file_id": 41, + "content": "This code initializes RWKV Language Model, configures parameters, handles potential issues, and utilizes the RWKV-v4neo model for text generation, context refinement, tokenizer error checking, and prompt processing. 
It iterates through tokens, predicts next tokens, handles special cases, and prints debug output while tracking time for preprocessing, generation, and flushing buffer after each character.", + "type": "summary" + }, + "698": { + "file_id": 41, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math, os, sys, types, time, gc\nimport torch\nfrom src.utils import TOKENIZER\ntry:\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = sys.argv[1]\nexcept:\n pass\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nargs = types.SimpleNamespace()\n########################################################################################################\n# Step 1: set model & config (use v4 to run your trained-from-scratch models. v4 and v4neo are compatible)\n########################################################################################################\nargs.RUN_DEVICE = \"cuda\" # 'cuda' // 'cpu' (already fast)", + "type": "code", + "location": "/RWKV-v4neo/run.py:1-23" + }, + "699": { + "file_id": 41, + "content": "This code is initializing the RWKV Language Model. It imports necessary libraries, sets the CUDA device for GPU or CPU usage, and ensures compatibility between v4 and v4neo models. 
The arguments for model configuration are set to 'cuda' for GPU acceleration or 'cpu' for CPU-only processing.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/7.json b/docs/data/7.json new file mode 100644 index 00000000..1bb9eaa3 --- /dev/null +++ b/docs/data/7.json @@ -0,0 +1,549 @@ +{ + "700": { + "file_id": 41, + "content": "args.FLOAT_MODE = \"fp16\" # fp16 (good for GPU, does not work for CPU) // fp32 (good for CPU) // bf16 (less accurate, but works for CPU)\n# if args.RUN_DEVICE == \"cuda\":\n# os.environ[\"RWKV_RUN_BACKEND\"] = 'nvfuser' # !!!BUGGY!!! wrong output\nos.environ[\"RWKV_JIT_ON\"] = '1' # '1' or '0'. very useful for GPU/CPU fp32, but might be harmful for GPU fp16. please benchmark !!!\nTOKEN_MODE = \"pile\"\nWORD_NAME = [\n \"20B_tokenizer.json\",\n \"20B_tokenizer.json\",\n] # [vocab, vocab] for Pile model\nUNKNOWN_CHAR = None\nvocab_size = 50277\n# Download Pile models: https://huggingface.co/BlinkDL\n# or, set MODEL_NAME to your fine-tuned model\n# MODEL_NAME = \"/fsx/BlinkDL/rwkv-release/RWKV-4-Pile-169M-20220807-8023\"\n# n_layer = 12\n# n_embd = 768\n# ctx_len = 1024\n# MODEL_NAME = '/fsx/BlinkDL/rwkv-release/RWKV-4-Pile-430M-20220808-8066'\n# n_layer = 24\n# n_embd = 1024\n# ctx_len = 1024\n# MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040'\n# n_layer = 24\n# n_embd = 2048\n# ctx_len = 1024\n# MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221008-8023'", + "type": "code", + "location": "/RWKV-v4neo/run.py:24-56" + }, + "701": { + "file_id": 41, + "content": "This code sets various parameters for an RWKV model, including float mode (fp16, fp32 or bf16), JIT environment, tokenizer files, and Pile model options. It also specifies the MODEL_NAME based on downloaded models or a user-defined fine-tuned model. 
The code is designed for GPU and CPU usage, but some elements may require benchmarking due to potential issues or reduced accuracy.", + "type": "comment" + }, + "702": { + "file_id": 41, + "content": "# n_layer = 32\n# n_embd = 2560\n# ctx_len = 1024\nMODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-7b/RWKV-4-Pile-7B-20221115-8047'\nn_layer = 32\nn_embd = 4096\nctx_len = 1024\nargs.MODEL_NAME = MODEL_NAME\nargs.n_layer = n_layer\nargs.n_embd = n_embd\nargs.ctx_len = ctx_len\nargs.vocab_size = vocab_size\nargs.head_qk = 0\nargs.pre_ffn = 0\nargs.grad_cp = 0\nargs.my_pos_emb = 0\nos.environ[\"RWKV_RUN_DEVICE\"] = args.RUN_DEVICE\n########################################################################################################\n# Step 2: set prompt & sampling stuffs\n########################################################################################################\n# context = 'A'\n# context = \"\\nIn the\"\n# context = '\\nSugar:'\ncontext = \"\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.\"\n# context = \"\\n深圳是\" # test Chinese\n# context = \"\\n東京は\" # test Japanese\n# ###### A good prompt for Q&A ######", + "type": "code", + "location": "/RWKV-v4neo/run.py:57-89" + }, + "703": { + "file_id": 41, + "content": "This code sets the model parameters (n_layer, n_embd, ctx_len) and environment variables for RWKV-v4neo's run.py. 
The context variable holds a text prompt for question and answer tasks in various languages.", + "type": "comment" + }, + "704": { + "file_id": 41, + "content": "# context = '''\n# Questions & Helpful Answers\n# Ask Research Experts\n# Question:\n# Can penguins fly?\n# Full Answer:\n# '''\n# ###### A good prompt for chatbot ######\n# context = '''\n# The following is a conversation between a highly knowledgeable and intelligent AI assistant called Bot, and a human user called User. In the following interactions, User and Bot converse in natural language, and Bot always answer User's questions. Bot is very smart, polite and humorous. Bot knows a lot, and always tells the truth. The conversation begins.\n# User: who is president of usa?\n# Bot: It’s Joe Biden; he was sworn in earlier this year.\n# User: french revolution what year\n# Bot: It started in 1789, but it lasted 10 years until 1799.\n# User: guess i marry who ?\n# Bot: Only if you tell me more about yourself - what are your interests?\n# User: wat is lhc\n# Bot: It’s a large and very expensive piece of science equipment. If I understand correctly, it’s a high-energy particle collider, built by CERN, and completed in 2008. They used it to confirm the existence of the Higgs boson in 2012.", + "type": "code", + "location": "/RWKV-v4neo/run.py:90-117" + }, + "705": { + "file_id": 41, + "content": "This code is a chatbot prompt featuring a conversation between a user and an intelligent AI assistant. 
The user asks various questions about politics, history, and personal preferences, and the AI provides accurate and informative responses.", + "type": "comment" + }, + "706": { + "file_id": 41, + "content": "# User:''' # type your question here\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 333\nTEMPERATURE = 1.0\ntop_p = 0.8\ntop_p_newline = 0.9 # only used in TOKEN_MODE = char\nDEBUG_DEBUG = False # True False --> show softmax output\n########################################################################################################\nprint(f'\\nUsing {args.RUN_DEVICE.upper()}. Loading {MODEL_NAME}...')\nfrom src.model_run import RWKV_RNN\nmodel = RWKV_RNN(args)\nprint(f'\\nOptimizing speed...')\nout, _ = model.forward([187], None)\n# print(out)\ngc.collect()\ntorch.cuda.empty_cache()\n# input(0)\nprint(f'\\nLoading tokenizer {WORD_NAME}...')\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nif TOKEN_MODE == \"pile\":\n assert tokenizer.tokenizer.decode([187]) == '\\n'\n########################################################################################################\nif tokenizer.charMode:\n context = tokenizer.refine_context(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nelse:\n ctx = tokenizer.tokenizer.encode(context)", + "type": "code", + "location": "/RWKV-v4neo/run.py:119-156" + }, + "707": { + "file_id": 41, + "content": "This code snippet is part of a larger program that uses the RWKV-v4neo model for text generation. It sets up necessary parameters and initializes the model, optimizer, and tokenizer. The context provided to the model is refined if character mode is enabled. 
The code also includes error checking for specific conditions related to the tokenizer being used.", + "type": "comment" + }, + "708": { + "file_id": 41, + "content": "src_len = len(ctx)\nsrc_ctx = ctx.copy()\nprint(\"\\nYour prompt has \" + str(src_len) + \" tokens.\")\nprint(\n \"Note: currently the first run takes a while if your prompt is long, as we are using RNN to preprocess the prompt. Use GPT to build the hidden state for better speed.\\n\"\n)\ntime_slot = {}\ntime_ref = time.time_ns()\ndef record_time(name):\n if name not in time_slot:\n time_slot[name] = 1e20\n tt = (time.time_ns() - time_ref) / 1e9\n if tt < time_slot[name]:\n time_slot[name] = tt\ninit_state = None\ninit_out = None\nstate = None\nout = None\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n print((\"-\" * 50) + '\\n' + context, end=\"\")\n time_ref = time.time_ns()\n ctx = src_ctx.copy()\n if TRIAL == 0:\n for i in range(src_len):\n x = ctx[: i + 1]\n if i == src_len - 1:\n init_out, init_state = model.forward(x, init_state)\n else:\n init_state = model.forward(x, init_state, preprocess_only=True)\n gc.collect()\n torch.cuda.empty_cache()", + "type": "code", + "location": "/RWKV-v4neo/run.py:157-194" + }, + "709": { + "file_id": 41, + "content": "This code is running a neural language model for a given prompt. It first processes the prompt to generate an initial state and output, which are then used in subsequent trials. The processing involves using a recurrent neural network (RNN) for the first run if the prompt is long, or GPT otherwise. 
The code also keeps track of time taken during various operations for potential performance improvements.", + "type": "comment" + }, + "710": { + "file_id": 41, + "content": " record_time('preprocess')\n out_last = src_len\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[: i + 1]\n x = x[-ctx_len:]\n if i == src_len:\n out = init_out.clone()\n state = init_state.clone()\n else:\n out, state = model.forward(x, state)\n if DEBUG_DEBUG:\n print(\"model\", np.array(x), \"==>\", np.array(out), np.max(out.cpu().numpy()), np.min(out.cpu().numpy()))\n if TOKEN_MODE == \"pile\":\n out[0] = -999999999 # disable <|endoftext|>\n ttt = tokenizer.sample_logits(\n out,\n x,\n ctx_len,\n temperature=TEMPERATURE,\n top_p_usual=top_p,\n top_p_newline=top_p_newline,\n )\n ctx += [ttt]\n if tokenizer.charMode:\n char = tokenizer.itos[ttt]\n print(char, end=\"\", flush=True)\n else:\n char = tokenizer.tokenizer.decode(ctx[out_last:])\n if '\\ufffd' not in char: # is valid utf8 string?", + "type": "code", + "location": "/RWKV-v4neo/run.py:196-227" + }, + "711": { + "file_id": 41, + "content": "This code is iterating through a sequence of tokens, using a model to predict the next token based on the previous ones. If in debug mode, it prints out the output of the model for each step. It also has special handling for the \"<|endoftext|>\" token, disabling it if the tokenizer mode is set to \"pile\". 
The code then adds the predicted token to the context and either prints out each character if in character mode or combines the tokens into a string if not.", + "type": "comment" + }, + "712": { + "file_id": 41, + "content": " print(char, end=\"\", flush=True)\n out_last = i+1\n record_time('total')\n # print(f'\\n\\n{time_slot}\\n\\n')\n print(\n f\"\\n\\n--- preprocess {round(time_slot['preprocess'], 2)}s, generation {round(time_slot['total']-time_slot['preprocess'], 2)}s \", end = ''\n )\nprint((\"-\" * 50) + '\\n')", + "type": "code", + "location": "/RWKV-v4neo/run.py:228-237" + }, + "713": { + "file_id": 41, + "content": "This code block prints the time taken for preprocessing and generation, separates with a line of dashes, and then proceeds to print information about the time slots. It also flushes the buffer immediately after each character is printed to update the output instantly. The comments are for record-keeping and provide an organized summary of the code's actions.", + "type": "comment" + }, + "714": { + "file_id": 42, + "content": "/RWKV-v4neo/src/binidx.py", + "type": "filepath" + }, + "715": { + "file_id": 42, + "content": "The code defines 'MMapIndexedDataset' and 'Index' for handling indexed datasets, reading binary data, creating memory-mapped objects, and allows retrieval/manipulation of data. 
It doesn't support prefetch and checks if files exist for given path.", + "type": "summary" + }, + "716": { + "file_id": 42, + "content": "from lib2to3.pgen2 import token\nimport os\nimport torch\nimport numpy as np\nimport shutil\nimport struct\nfrom functools import lru_cache\nfrom itertools import accumulate\ndef print_rank_0(*message):\n pass\n # \"\"\"If distributed is initialized print only on rank 0.\"\"\"\n # if torch.distributed.is_initialized():\n # if torch.distributed.get_rank() == 0:\n # print(*message, flush=True)\n # else:\n # print(*message, flush=True)\ndef _warmup_mmap_file(path):\n pass\n # with open(path, \"rb\") as stream:\n # while stream.read(100 * 1024 * 1024):\n # pass\ndtypes = {\n 1: np.uint8,\n 2: np.int8,\n 3: np.int16,\n 4: np.int32,\n 5: np.int64,\n 6: float,\n 7: np.double,\n 8: np.uint16,\n}\ndef code(dtype):\n for k in dtypes.keys():\n if dtypes[k] == dtype:\n return k\n raise ValueError(dtype)\ndef index_file_path(prefix_path):\n return prefix_path + \".idx\"\ndef data_file_path(prefix_path):\n return prefix_path + \".bin\"\nclass MMapIndexedDataset(torch.utils.data.Dataset):", + "type": "code", + "location": "/RWKV-v4neo/src/binidx.py:1-48" + }, + "717": { + "file_id": 42, + "content": "This code imports necessary libraries and defines functions for handling indexed datasets. The 'print_rank_0' function prints messages only on rank 0 if distributed is initialized, while '_warmup_mmap_file' warms up an mmap file by reading it in chunks. The 'dtypes' dictionary maps data types to their respective codes. 'index_file_path' and 'data_file_path' functions return the paths for index and data files respectively. 
The class 'MMapIndexedDataset' inherits from torch.utils.data.Dataset, suggesting it handles indexed datasets in a specific format.", + "type": "comment" + }, + "718": { + "file_id": 42, + "content": " class Index(object):\n _HDR_MAGIC = b\"MMIDIDX\\x00\\x00\"\n @classmethod\n def writer(cls, path, dtype):\n class _Writer(object):\n def __enter__(self):\n self._file = open(path, \"wb\")\n # Write Magic string so we can check the file format then opening it again.\n self._file.write(cls._HDR_MAGIC)\n # Write version number\n # Little endian unsigned 64 Bit integer\n self._file.write(struct.pack(\" 0:\n # self.data_pile = MMapIndexedDataset('/fsx/pile/pile_20B_tokenizer_text_document')\n self.data_pile = MMapIndexedDataset('/fsx/pile_deduped/pile_0.87_deduped_text_document')\n self.data_pile_size = len(self.data_pile._bin_buffer) // self.data._index._dtype_size", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:26-42" + }, + "739": { + "file_id": 43, + "content": "This code block is checking if the `my_pile_version` argument is equal to 2. If so, it reads in the data file and prepares it for use. It splits the data into chunks and asserts that the size of each chunk matches the expected size. 
Finally, if the `my_qa_mask` argument is greater than zero, it assigns a specific dataset to `data_pile`.", + "type": "comment" + }, + "740": { + "file_id": 43, + "content": " else:\n self.data_pile = None\n self.data_pile_size = 0\n if args.my_pile_stage > 0:\n # assert self.data_size == 332115325534 and self.vocab_size == 50277\n self.samples_per_epoch = args.epoch_steps * args.real_bsz\n assert self.samples_per_epoch == 40320\n rank_zero_info(f\"########## Pile 20b-tokenized stage {args.my_pile_stage} ##########\")\n dataset_slot = self.data_size // args.ctx_len\n if args.my_pile_stage != 4:\n assert MaybeIsPrime(args.magic_prime)\n assert args.magic_prime % 3 == 2\n assert args.magic_prime / dataset_slot > 0.99 and args.magic_prime / dataset_slot <= 1\n elif args.data_type == \"numpy\":\n self.data = np.load(args.data_file).astype(\"int\")\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:43-60" + }, + "741": { + "file_id": 43, + "content": "If the data is not tokenized, self.data_pile is set to None and self.data_pile_size is set to 0. If args.my_pile_stage is greater than 0, it checks if the dataset size is as expected (332115325534 tokens) and vocab size (50277). It calculates samples_per_epoch based on epoch_steps and real_bsz. Asserts that samples_per_epoch is 40320. Prints rank-zero info with stage number if args.my_pile_stage != 4. 
If args.data_type is \"numpy\", loads data from args.data_file, converts it to int, sets self.vocab_size, and prints current vocab size to ensure correctness.", + "type": "comment" + }, + "742": { + "file_id": 43, + "content": " self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens.\")\n elif args.data_type == \"uint16\":\n self.data = np.fromfile(args.data_file, dtype=np.uint16).astype(\"int32\").reshape(-1, args.my_sample_len)\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")\n self.data_size = self.data.shape[0]\n rank_zero_info(f\"Data has {self.data_size} samples.\")\n elif args.data_type == \"wds_img\":\n self.vocab_size = -1\n self.data_size = -1\n self.data = None\n self.error_count = 0\n else:\n if args.data_type == \"dummy\":\n rank_zero_info(\"Building dummy data...\")\n self.data = \"\"\n for i in range(100000):\n aa = (i) % 10000\n bb = (i * i) % 10000\n cc = aa + bb\n self.data += f\".{aa}+{bb}={cc}.\"", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:61-82" + }, + "743": { + "file_id": 43, + "content": "The code handles loading and resizing data based on the specified data type. If no data type is specified, it loads uint16 data from the file, resizes vocab size, calculates number of samples, and notifies rank 0. If data type is \"wds_img\", it sets vocab size and data size to -1, sets data to None, and error count to 0. 
If data type is \"dummy\", it creates dummy data by concatenating numbers and notifies rank 0.", + "type": "comment" + }, + "744": { + "file_id": 43, + "content": " else:\n self.data = open(args.data_file, \"r\", encoding=args.data_type).read()\n rank_zero_info(\"Building token list...\")\n unique = sorted(list(set(self.data)))\n self.vocab_size = len(unique)\n # rank_zero_info()\n # for u in unique:\n # print(u, end=' ')\n # rank_zero_info('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open(f\"{args.proj_dir}/vocab.json\", \"w\", encoding=\"utf-8\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens, {self.vocab_size} vocab size.\")\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n def __len__(self):\n return self.args.epoch_steps * self.args.micro_bsz\n def __getitem__(self, idx):", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:83-107" + }, + "745": { + "file_id": 43, + "content": "The code reads data from a file and builds a token list, storing it in a JSON file. It then creates dictionaries for mapping tokens to indices and indices to tokens. 
Finally, it provides methods for the length of the dataset and accessing specific items within the dataset.", + "type": "comment" + }, + "746": { + "file_id": 43, + "content": " args = self.args\n rank = self.global_rank\n epoch = self.real_epoch\n world_size = self.world_size\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size}\")\n if args.data_type == \"wds_img\":\n def init_wds(self, bias=0):\n def identity(x):\n return x \n import webdataset as wds\n import torchvision.transforms as transforms\n # img_transform = transforms.Compose(\n # [transforms.CenterCrop(256)]\n # )\n img_transform = transforms.Compose([\n transforms.CenterCrop(512),\n transforms.Resize((args.my_img_size))\n ])\n self.data_raw = wds.WebDataset(args.data_file, resampled=True).shuffle(10000, initial=1000, rng=random.Random(epoch*100000+rank+bias*1e9)).decode(\"torchrgb\").to_tuple(\"jpg\", \"json\", \"txt\").map_tuple(img_transform, identity, identity)\n for pp in self.data_raw.pipeline:", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:108-128" + }, + "747": { + "file_id": 43, + "content": "This code initializes a WebDataset for image data with specified transformation. 
It shuffles the dataset and decodes it into torchrgb format, then maps the tuple of jpg, json, and txt files to image transformations, identity mappings for other file types, and returns the initialized dataset.", + "type": "comment" + }, + "748": { + "file_id": 43, + "content": " if 'Resampled' in str(pp):\n pp.deterministic = True\n def worker_seed():\n return rank*100000+epoch+bias*1e9\n pp.worker_seed = worker_seed\n self.data = iter(self.data_raw)\n # print(f\"WebDataset loaded for rank {rank} epoch {epoch}\")\n if self.data == None:\n init_wds(self)\n trial = 0\n while trial < 10:\n try:\n dd = next(self.data) # jpg, json, txt\n break\n except:\n print(f'[dataloader error - epoch {epoch} rank {rank} - trying a new shuffle]')\n self.error_count += 1\n init_wds(self, self.error_count)\n trial += 1\n pass\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} {dd[2]}\")\n # with open(f\"sample_{rank}.txt\", \"a\", encoding=\"utf-8\") as tmp:", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:129-150" + }, + "749": { + "file_id": 43, + "content": "This code initializes a WebDataset for distributed training, handling potential errors in data loading and maintaining worker seeds for determinism. If the dataset is not initialized, it calls init_wds() to do so. It attempts to load data from the dataset 10 times, printing an error message if there's a failure, then re-initializes the WebDataset before retrying. The code also prints information about the current epoch, rank, and progress when loading data successfully. 
Additionally, it has the potential to append data samples to a file named \"sample_{rank}.txt\".", + "type": "comment" + }, + "750": { + "file_id": 43, + "content": " # tmp.write(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} {int(dd[1]['key'])}\\n\")\n return dd[0], dd[2]\n else:\n if args.data_type == \"uint16\":\n i = np.random.randint(0, self.data_size-1)\n dix = self.data[i]\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n else:\n ctx_len = args.ctx_len\n req_len = ctx_len + 1\n magic_prime = args.magic_prime\n data = self.data\n if args.my_pile_stage > 0:\n ii = 1 + epoch * self.samples_per_epoch + (idx * world_size) + rank\n if args.my_qa_mask > 0:\n ii_orig = ii\n if ii % 2 == 0:\n ii = -1\n data = self.data_pile\n else:\n ii = ii // 2\n if data == self.data_pile:", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:151-175" + }, + "751": { + "file_id": 43, + "content": "The code initializes a random index 'i' within the data range, creates two tensors 'x' and 'y' with torch.tensor() from the data slice. If the data type is uint16, it selects a random index 'i' from 0 to data_size-1, extracts x from the data at index i, y from the next element. Else, it calculates the context length, required length, and uses magic prime for data selection based on current epoch, idx, rank, world_size. If my_pile_stage is greater than 0, it computes the index 'ii', checks if my_qa_mask > 0 to determine if ii should be -1 or half of itself. 
Depending on data type and certain conditions, it selects data_pile for data tensor.", + "type": "comment" + }, + "752": { + "file_id": 43, + "content": " i = np.random.randint(0, self.data_pile_size - req_len)\n else:\n if args.my_pile_stage == 4 or ii < args.my_random_steps:\n # cheat: pick a random spot in dataset\n if args.my_pile_version == 1:\n i = np.random.randint(0, self.data_size - req_len)\n else:\n i = np.random.randint(0, self.data_size)\n else:\n ii = ii - args.my_random_steps\n factor = (math.sqrt(5) - 1) / 2\n factor = int(magic_prime * factor)\n i = ((factor * ii * ii * ii) % magic_prime) * ctx_len\n i = i + args.my_pile_shift\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}\")\n else:\n # cheat: pick a random spot in dataset", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:176-192" + }, + "753": { + "file_id": 43, + "content": "This code randomly selects a chunk of data for model training. If the stage is 4 or random steps are involved, it either picks a random spot in the dataset based on the current version, or uses a formula involving magic prime and pile shift to determine the position. It then adds the context length. Finally, if the input type is \"cheat\", it picks a random spot in the dataset. 
The code also prints some information about epoch, index, rank, world size, iterations, and selected position.", + "type": "comment" + }, + "754": { + "file_id": 43, + "content": " i = np.random.randint(0, self.data_size - req_len)\n if args.data_type == \"binidx\":\n if args.my_pile_version == 1:\n dix = data.get(idx=0, offset=i, length=req_len).astype(int)\n else:\n # self.data : cutoff, chunk_count, data\n for j in range(len(data)):\n if i < data[j][0]:\n ii = i\n i = (i - (data[j-1][0] if j > 0 else 0)) % data[j][1]\n dix = data[j][2].get(idx=0, offset=i, length=req_len).astype(int)\n # print(ii, j, i)\n break\n elif args.data_type == \"numpy\":\n dix = data[i : i + req_len]\n else:\n dix = [self.stoi[s] for s in data[i : i + req_len]]\n if args.my_qa_mask == 1:\n if data == self.data_pile:", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:193-213" + }, + "755": { + "file_id": 43, + "content": "The code retrieves a random index (i) within the data size and based on the data type, it selects the appropriate indices (dix) from the provided dataset. If data_type is \"binidx\", it checks if my_pile_version is 1 or not; then it gets the indices using different methods. If data_type is \"numpy\", it directly selects the indices using numpy's slicing. 
Finally, it converts string tokens to indices using self.stoi for non-\"binidx\" and \"numpy\" data types.", + "type": "comment" + }, + "756": { + "file_id": 43, + "content": " z = [1] * ctx_len\n else:\n z = [0] * ctx_len\n z_sum = 0\n isGood = False\n for i in range(3, ctx_len):\n if dix[i] == 27 and dix[i-1] == 34 and dix[i-2] == 187 and dix[i-3] == 187:\n isGood = True\n if dix[i] == 0:\n isGood = False\n if isGood:\n z[i] = 1\n z_sum += 1\n if z_sum == 0:\n z = [1] * ctx_len\n i = np.random.randint(0, self.data_pile_size - req_len)\n dix = self.data_pile.get(idx=0, offset=i, length=req_len).astype(int)\n z = torch.tensor(z, dtype=torch.bfloat16)\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:214-234" + }, + "757": { + "file_id": 43, + "content": "This code checks if the input data has a specific pattern, and sets corresponding values in the 'z' list. If no such pattern is found, it generates a random index and retrieves the data from the dataset. It then converts the data into tensors for further processing.", + "type": "comment" + }, + "758": { + "file_id": 43, + "content": " # if ii_orig < 50:\n # # if rank == 1:\n # print('rank', rank, 'i', ii_orig, ii, i, 'x', x[:5], '...', x[-5:])\n # else:\n # exit(0)\n if args.my_qa_mask == 1:\n return x, y, z\n return x, y", + "type": "code", + "location": "/RWKV-v4neo/src/dataset.py:236-245" + }, + "759": { + "file_id": 43, + "content": "This code snippet appears to be part of a larger function. It checks if `ii_orig` is less than 50, and if it is, it prints some information related to rank, indices, and a portion of the array `x`. If `args.my_qa_mask` is equal to 1, it returns `x`, `y`, and `z`, otherwise it simply returns `x` and `y`. 
The purpose and functionality of this code may vary depending on the larger context in which it exists.", + "type": "comment" + }, + "760": { + "file_id": 44, + "content": "/RWKV-v4neo/src/model.py", + "type": "filepath" + }, + "761": { + "file_id": 44, + "content": "This code imports libraries, sets up profiling, defines modules based on environment variables, utilizes CUDA for efficient computation, and initializes a neural network model with backpropagation support. It applies layer normalization using CUDA function calculations and introduces the RWKV_TimeMix class for the RWKV5 model. The code initializes optimizer groups and models with layer-specific learning rates, performs all-gather for losses, handles various settings, and initializes model weights using orthogonal initialization.", + "type": "summary" + }, + "762": { + "file_id": 44, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os, math, gc, importlib\nimport torch\n# torch._C._jit_set_profiling_executor(True)\n# torch._C._jit_set_profiling_mode(True)\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\nfrom pytorch_lightning.strategies import DeepSpeedStrategy\nif importlib.util.find_spec('deepspeed'):\n import deepspeed\n from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n# from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam\ntry:\n print('RWKV_MY_TESTING', os.environ[\"RWKV_MY_TESTING\"])\nexcept:\n os.environ[\"RWKV_MY_TESTING\"] = ''\ndef __nop(ob):\n return ob\nMyModule = nn.Module\nMyFunction = __nop\nif os.environ[\"RWKV_JIT_ON\"] == \"1\":", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:1-31" + }, + "763": 
{ + "file_id": 44, + "content": "This code imports necessary libraries, sets up profiling executor and mode for torch.nn, defines MyModule and MyFunction based on JIT environment variable, and imports DeepSpeed if available.", + "type": "comment" + }, + "764": { + "file_id": 44, + "content": " MyModule = torch.jit.ScriptModule\n MyFunction = torch.jit.script_method\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = int(os.environ[\"RWKV_T_MAX\"]) # TAKES LOTS OF VRAM!\n# it's possible to go beyond CUDA limitations if you slice the ctx and pass the hidden state in each slice\nfrom torch.utils.cpp_extension import load\nif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n wkv_cuda = load(name=f\"wkv_{T_MAX}_bf16\", sources=[\"cuda/wkv_op_bf16.cpp\", \"cuda/wkv_cuda_bf16.cu\"], verbose=True, extra_cuda_cflags=[\"-t 4\", \"-std=c++17\", \"-res-usage\", \"--maxrregcount 60\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-DTmax={T_MAX}\"])\n class WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:32-52" + }, + "765": { + "file_id": 44, + "content": "This code defines a class 'WKV' that utilizes CUDA for efficient computation, and loads a corresponding CUDA kernel module depending on the environment variable \"RWKV_FLOAT_MODE\". 
This process involves setting T_MAX and loading the appropriate C++ modules with specific compiler flags.", + "type": "comment" + }, + "766": { + "file_id": 44, + "content": " assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n w = -torch.exp(w.float().contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()\n y = torch.empty((B, T, C), device=w.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n wkv_cuda.forward(B, T, C, w, u, k, v, y)\n ctx.save_for_backward(w, u, k, v, y)\n return y\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n w, u, k, v, y = ctx.saved_tensors\n gw = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n gu = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n gk = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:53-73" + }, + "767": { + "file_id": 44, + "content": "This code defines a model function with forward and backward operations. 
It checks some conditions, initializes tensors, performs model computations, and saves the intermediate states for later backpropagation.", + "type": "comment" + }, + "768": { + "file_id": 44, + "content": " gv = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.contiguous(), gw, gu, gk, gv)\n gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n return (None, None, None, gw, gu, gk, gv)\nelse:\n wkv_cuda = load(name=f\"wkv_{T_MAX}\", sources=[\"cuda/wkv_op.cpp\", \"cuda/wkv_cuda.cu\"], verbose=True, extra_cuda_cflags=[\"-res-usage\", \"--maxrregcount 60\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-DTmax={T_MAX}\"])\n class WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C\n assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n w = -torch.exp(w.contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:74-93" + }, + "769": { + "file_id": 44, + "content": "This code initializes a tensor and calls a function. It then calculates the sum along dimension 0 for two other tensors, and returns them alongside others. It seems to be part of a neural network model with CUDA support. 
The forward method creates an instance variable for each argument, checks if T is within a limit, and performs some transformations on certain arguments.", + "type": "comment" + }, + "770": { + "file_id": 44, + "content": " else:\n w = -torch.exp(w.float().contiguous())\n u = u.float().contiguous()\n k = k.float().contiguous()\n v = v.float().contiguous()\n y = torch.empty((B, T, C), device=w.device, memory_format=torch.contiguous_format)\n wkv_cuda.forward(B, T, C, w, u, k, v, y)\n ctx.save_for_backward(w, u, k, v, y)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n return y\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n return y.half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n return y.bfloat16()\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n w, u, k, v, y = ctx.saved_tensors\n gw = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format)\n gu = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:94-117" + }, + "771": { + "file_id": 44, + "content": "The code is handling the forward and backward passes of a model. It first checks the environment variable 'RWKV_FLOAT_MODE' to determine the data type for output 'y'. If '32' is present in this environment variable, it directly returns 'y'. If 'fp16' is set, it converts 'y' to half precision and returns it. If 'bf16' is set, it converts 'y' to BFloat16 and returns it. 
The backward method applies constraints on the dimensions of tensors and retrieves saved tensors from context for gradients calculation.", + "type": "comment" + }, + "772": { + "file_id": 44, + "content": " gk = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format)\n gv = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.contiguous(), gw, gu, gk, gv)\n else:\n wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.float().contiguous(), gw, gu, gk, gv)\n gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n return (None, None, None, gw, gu, gk, gv)\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())\ndef RUN_CUDA(B, T, C, w, u, k, v):\n return WKV.apply(B, T, C, w, u, k, v)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:118-135" + }, + "773": { + "file_id": 44, + "content": "The code defines a function for running CUDA on BERT-like transformer models. It initializes empty tensors for gradients, and then calls the backward operation of WKV to compute gradients. Depending on the float mode environment variable, it returns the gradients in different precisions: None, half (fp16), or bfloat16. 
If no CUDA is used, the code calls a RUN_CUDA function, which applies the transformer model with CUDA.", + "type": "comment" + }, + "774": { + "file_id": 44, + "content": "########################################################################################################\nclass RWKV_TimeMix_RWKV5_Preview(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.head_size = 64\n self.n_head = args.dim_att // self.head_size\n assert args.dim_att % self.n_head == 0\n self.head_size_divisor = 8\n self.chunk_len = 512\n assert args.ctx_len % self.chunk_len == 0\n with torch.no_grad():\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:137-162" + }, + "775": { + "file_id": 44, + "content": "This code defines a class for the RWKV_TimeMix_RWKV5_Preview module, which is an extension of the MyModule class. It initializes instance variables related to the model's parameters and performs some sanity checks on the input arguments. 
The time_mix_k and time_mix_v parameters are calculated based on a combination of the layer index and other factors.", + "type": "comment" + }, + "776": { + "file_id": 44, + "content": " self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n self.time_mix_g = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n self.gate = nn.Linear(args.n_embd, args.dim_att, bias=False)\n # fancy time_decay\n decay_speed = torch.ones(self.n_head)\n for h in range(self.n_head):\n decay_speed[h] = -6 + 5 * (h / (self.n_head - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n if 'r2' in os.environ[\"RWKV_MY_TESTING\"]:\n tmp = torch.zeros(self.n_head)\n for h in range(self.n_head):\n tmp[h] = ratio_0_to_1 * (1 - (h / (self.n_head - 1)))\n self.time_faaaa = nn.Parameter(tmp)\n else:\n self.time_first = nn.Parameter(torch.ones(self.n_head) * (-3.0))", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:163-182" + }, + "777": { + "file_id": 44, + "content": "This code initializes parameters for a time-related model component, including time_mix_r, time_mix_g (conditionally), time_decay, and time_faaaa (conditionally). The values are determined by ratios and layer index. 
Printing the parameter values is optional based on environment variables.", + "type": "comment" + }, + "778": { + "file_id": 44, + "content": " self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n self.ln_x = nn.GroupNorm(self.n_head, args.dim_att)\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n @MyFunction\n def jit_func(self, x):\n B, TT, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xg = x * self.time_mix_g + xx * (1 - self.time_mix_g)\n r = self.receptance(xr).view(B, TT, self.n_head, self.head_size).transpose(1, 2) # BTC -> BHTS\n k ", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:184-204" + }, + "779": { + "file_id": 44, + "content": "The code defines a model with time shift, receptance, key, value, and output layers. It also includes a group normalization layer (ln_x). If 'r3' is present in the environment, it creates a jit_func that processes input x by mixing it with the previous timestep to produce k, v, r, and g. 
The receptance layer then transforms the r values into BHTS format.", + "type": "comment" + }, + "780": { + "file_id": 44, + "content": "= self.key(xk).view(B, TT, self.n_head, self.head_size).transpose(1, 2).transpose(-2, -1) # BTC -> BHTS -> BHST\n v = self.value(xv).view(B, TT, self.n_head, -1).transpose(1, 2) # BTC -> BHTS\n g = F.silu(self.gate(xg))\n return r, k, v, g\n @MyFunction\n def jit_func_2(self, r, k, v, g, w, wk, wb, ws):\n B, H, TT, S = r.size()\n T = self.chunk_len\n s = torch.zeros(B, H, S, S, device=r.device, dtype=r.dtype) # state\n x = torch.zeros(B, H, TT, S, device=r.device, dtype=r.dtype) # output\n for i in range(TT // T):\n rr = r[:, :, i*T:i*T+T, :]\n kk = k[:, :, :, i*T:i*T+T]\n vv = v[:, :, i*T:i*T+T, :]\n x[:, :, i*T:i*T+T, :] = ((rr @ kk) * w) @ vv + (rr @ s) * wb\n s = ws * s + (kk * wk) @ vv\n x = x.transpose(1, 2).contiguous().view(B * TT, H*S) # BHTS -> BTHS -> BTC\n x = self.ln_x(x / self.head_size_divisor).view(B, TT, H*S) * g", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:204-228" + }, + "781": { + "file_id": 44, + "content": "This code is defining and implementing a function for an attention mechanism in a transformer model. 
It calculates the attention scores, performs weighted sum of values, updates the states, and applies non-linearity before returning the final output.", + "type": "comment" + }, + "782": { + "file_id": 44, + "content": " return self.output(x)\n else:\n @MyFunction\n def jit_func(self, x):\n B, TT, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n r = self.receptance(xr).view(B, TT, self.n_head, self.head_size).transpose(1, 2) # BTC -> BHTS\n k = self.key(xk).view(B, TT, self.n_head, self.head_size).transpose(1, 2).transpose(-2, -1) # BTC -> BHTS -> BHST\n v = self.value(xv).view(B, TT, self.n_head, self.head_size).transpose(1, 2) # BTC -> BHTS\n return r, k, v\n @MyFunction\n def jit_func_2(self, r, k, v, w, wk, wb, ws):\n B, H, TT, S = r.size()\n T = self.chunk_len\n s = torch.zeros(B, H, S, S, device=r.device, dtype=r.dtype) # state", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:229-251" + }, + "783": { + "file_id": 44, + "content": "This code defines a class with two JIT functions. The first function takes an input tensor x and performs time shifting, then separates it into three components (k, v, r). The second function takes the output of the first function and creates another tensor s using torch.zeros. 
Both functions use jitted methods for faster execution.", + "type": "comment" + }, + "784": { + "file_id": 44, + "content": " x = torch.zeros(B, H, TT, S, device=r.device, dtype=r.dtype) # output\n for i in range(TT // T):\n rr = r[:, :, i*T:i*T+T, :]\n kk = k[:, :, :, i*T:i*T+T]\n vv = v[:, :, i*T:i*T+T, :]\n x[:, :, i*T:i*T+T, :] = ((rr @ kk) * w) @ vv + (rr @ s) * wb\n s = ws * s + (kk * wk) @ vv\n x = x.transpose(1, 2).contiguous().view(B * TT, H*S) # BHTS -> BTHS -> BTC\n x = self.ln_x(x / self.head_size_divisor).view(B, TT, H*S)\n return self.output(x)\n def forward(self, x):\n H = self.n_head\n T = self.chunk_len\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n r, k, v, g = self.jit_func(x)\n else:\n r, k, v = self.jit_func(x)\n w = torch.exp(-torch.exp(self.time_decay.float())).unsqueeze(-1)\n if 'r2' in os.environ[\"RWKV_MY_TESTING\"]:\n u = self.time_faaaa.float().unsqueeze(-1)\n else:\n u = torch.exp(self.time_first.float()).unsqueeze(-1)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:252-281" + }, + "785": { + "file_id": 44, + "content": "Initializes a tensor with zeros, performs matrix operations to compute the output tensor, and applies layer normalization before returning the final output. 
The code also includes variable assignments for time-based decay and initial values for other computations.", + "type": "comment" + }, + "786": { + "file_id": 44, + "content": "################################################################################\n########\n ws = w.pow(T).reshape(1, H, 1, 1)\n ind = torch.arange(T-1, -1, -1, device=r.device).unsqueeze(0).repeat(H, 1)\n w = w.repeat(1, T).pow(ind)\n wk = w.reshape(1, H, 1, T)\n wb = wk.transpose(-2, -1).flip(2)\n w = torch.cat([w[:, 1:], u], dim=1)\n w = F.pad(w, (0, T))\n w = torch.tile(w, [T])\n w = w[:, :-T].reshape(-1, T, 2 * T - 1)\n w = w[:, :, T-1:].reshape(1, H, T, T)\n########\n################################################################################\n w = w.to(dtype=r.dtype)\n wk = wk.to(dtype=r.dtype)\n wb = wb.to(dtype=r.dtype)\n ws = ws.to(dtype=r.dtype)\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n return self.jit_func_2(r, k, v, g, w, wk, wb, ws)\n else:\n return self.jit_func_2(r, k, v, w, wk, wb, ws) \n########################################################################################################", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:283-311" + }, + "787": { + "file_id": 44, + "content": "This code segment is manipulating and reshaping a tensor 'w' by repeating, powering, transposing, and padding it. 
It then reshapes the result to specific dimensions and performs type conversions before returning from either jit_func_2 function depending on the environment variable \"RWKV_MY_TESTING\".", + "type": "comment" + }, + "788": { + "file_id": 44, + "content": "# CUDA RWKV5 Kernel\n########################################################################################################\nif 'r4' in os.environ[\"RWKV_MY_TESTING\"]:\n HEAD_SIZE = int(os.environ[\"RWKV_HEAD_SIZE_A\"])\n wkv5_cuda = load(name=\"wkv5\", sources=[\"cuda/wkv5_op.cpp\", f\"cuda/wkv5_cuda.cu\"],\n verbose=True, extra_cuda_cflags=[\"-res-usage\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-D_N_={HEAD_SIZE}\"])\n class WKV_5(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, H, r, k, v, w, u):\n with torch.no_grad():\n assert r.dtype == torch.bfloat16\n assert k.dtype == torch.bfloat16\n assert v.dtype == torch.bfloat16\n assert w.dtype == torch.bfloat16\n assert u.dtype == torch.bfloat16\n assert HEAD_SIZE == C // H\n ctx.B = B\n ctx.T = T\n ctx.C = C\n ctx.H = H\n assert r.is_contiguous()", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:312-334" + }, + "789": { + "file_id": 44, + "content": "This code defines a WKV_5 function as a wrapper for CUDA RWKV5 kernel, which performs tensor operations on B, T, C, H, r, k, v, w, and u tensors. The kernel is loaded from the specified sources, with optional environment variables controlling its size. 
It asserts that certain conditions are met, such as matching data types and sizes for each input tensor, before proceeding with further computations.", + "type": "comment" + }, + "790": { + "file_id": 44, + "content": " assert k.is_contiguous()\n assert v.is_contiguous()\n assert w.is_contiguous()\n assert u.is_contiguous()\n ew = (-torch.exp(w.float())).contiguous()\n eew = (torch.exp(ew)).contiguous()\n ctx.save_for_backward(r, k, v, eew, ew, u)\n y = torch.empty((B, T, C), device=r.device, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.forward(B, T, C, H, r, k, v, eew, u, y)\n return y\n @staticmethod\n def backward(ctx, gy):\n with torch.no_grad():\n assert gy.dtype == torch.bfloat16\n B = ctx.B\n T = ctx.T\n C = ctx.C\n H = ctx.H\n assert gy.is_contiguous()\n r, k, v, eew, ew, u = ctx.saved_tensors\n gr = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:335-356" + }, + "791": { + "file_id": 44, + "content": "This code snippet is checking the contiguity of tensors k, v, w, and u. It then initializes eew and ew as exponentiated versions of w, saves them along with r, k, v, and u for backpropagation. Next, it creates an empty tensor y for forward pass with specific device, dtype, and memory format. 
Finally, it defines a backward method to compute gradients for backpropagation.", + "type": "comment" + }, + "792": { + "file_id": 44, + "content": " gk = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gv = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gw = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gu = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.backward(B, T, C, H, r, k, v, eew, ew, u, gy, gr, gk, gv, gw, gu)\n gw = torch.sum(gw, 0).view(H, C//H)\n gu = torch.sum(gu, 0).view(H, C//H)\n return (None, None, None, None, gr, gk, gv, gw, gu)\n def RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w, u):\n return WKV_5.apply(B, T, C, H, r, k, v, w, u)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:357-367" + }, + "793": { + "file_id": 44, + "content": "This code initializes four tensors with uniform random values for model parameters, then calls a CUDA function to perform calculations on the input data. 
The resulting gradients are stored in gk and gu and returned along with other results.", + "type": "comment" + }, + "794": { + "file_id": 44, + "content": "########################################################################################################\nclass RWKV_TimeMix_RWKV5(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.head_size = args.head_size_a\n assert HEAD_SIZE == self.head_size # change HEAD_SIZE to match args.head_size_a\n self.n_head = args.dim_att // self.head_size\n assert args.dim_att % self.n_head == 0\n self.head_size_divisor = args.head_size_divisor\n with torch.no_grad():\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:369-392" + }, + "795": { + "file_id": 44, + "content": "The code initializes the RWKV_TimeMix_RWKV5 class, sets up layer parameters based on input arguments, and creates time_mix_k and time_mix_v parameters using a combination of ratio-based calculations and input arguments.", + "type": "comment" + }, + "796": { + "file_id": 44, + "content": " self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n self.time_mix_g = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n # fancy time_decay\n decay_speed = torch.ones(args.dim_att)\n for n in range(args.dim_att):\n decay_speed[n] = -6 + 5 * (n / (args.dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed.reshape(self.n_head, self.head_size))\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', 
self.time_decay.flatten()[-3:].cpu().numpy())\n tmp = torch.zeros(args.dim_att)\n for n in range(args.dim_att):\n zigzag = ((n + 1) % 3 - 1) * 0.1\n tmp[n] = ratio_0_to_1 * (1 - (n / (args.dim_att - 1))) + zigzag\n self.time_faaaa = nn.Parameter(tmp.reshape(self.n_head, self.head_size))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:393-411" + }, + "797": { + "file_id": 44, + "content": "This code initializes three learnable parameters: time_mix_r, time_mix_g, and time_faaaa. It also includes a decay speed parameter (time_decay) and a zero-padding operation (time_shift). These parameters are used for time-related operations in the model. The receptance layer is initialized as a linear layer without bias, taking input embeddings of size args.n_embd to args.dim_att. This code likely contributes to the model's ability to process temporal information effectively.", + "type": "comment" + }, + "798": { + "file_id": 44, + "content": " self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n self.gate = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.ln_x = nn.GroupNorm(self.n_head, args.dim_att)\n @MyFunction\n def jit_func(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xg = x * self.time_mix_g + xx * (1 - self.time_mix_g)\n r = self.receptance(xr)\n k = self.key(xk)\n v = self.value(xv)\n g = F.silu(self.gate(xg))\n return r, k, v, g\n @MyFunction\n def jit_func_2(self, x, g):\n B, T, C = x.size()\n x = x.view(B * T, C)\n x = self.ln_x(x / self.head_size_divisor).view(B, 
T, C)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:412-441" + }, + "799": { + "file_id": 44, + "content": "The code defines a model with four linear layers and one group normalization layer. The `jit_func` method performs time-shifted mixing of input `x` to produce key, value, residual, and gate tensors, which are then passed through their respective layers and normalized. The `jit_func_2` method applies group normalization and divides by the head size divisor before reshaping the tensor.", + "type": "comment" + } +} \ No newline at end of file diff --git a/docs/data/8.json b/docs/data/8.json new file mode 100644 index 00000000..0aaf1842 --- /dev/null +++ b/docs/data/8.json @@ -0,0 +1,549 @@ +{ + "800": { + "file_id": 44, + "content": " x = self.output(x * g)\n return x\n def forward(self, x):\n B, T, C = x.size()\n H = self.n_head\n r, k, v, g = self.jit_func(x)\n x = RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w=self.time_decay, u=self.time_faaaa)\n return self.jit_func_2(x, g)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\nclass RWKV_TimeMix(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.ctx_len = args.ctx_len\n self.n_embd = args.n_embd\n with torch.no_grad(): # fancy init\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:442-472" + }, + "801": { + "file_id": 44, + "content": "This code defines a class `RWKV_TimeMix` that inherits from `MyModule`. It initializes attributes such as `args`, `layer_id`, `ctx_len`, and `n_embd`. 
It also performs a fancy init by calculating ratios based on the layer ID and number of layers.", + "type": "comment" + }, + "802": { + "file_id": 44, + "content": " ddd[0, 0, i] = i / args.n_embd\n # fancy time_decay\n decay_speed = torch.ones(args.dim_att)\n for h in range(args.dim_att):\n decay_speed[h] = -5 + 8 * (h / (args.dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n # fancy time_first\n zigzag = torch.tensor([(i + 1) % 3 - 1 for i in range(args.dim_att)]) * 0.5\n self.time_first = nn.Parameter(torch.ones(args.dim_att) * math.log(0.3) + zigzag)\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:473-492" + }, + "803": { + "file_id": 44, + "content": "This code is initializing parameters for a time-based transformer model layer. It sets the decay rate, first value in time series, and mixing coefficients using fancy techniques to achieve more complexity in the attention mechanism. 
The time shift operation pads the input, and a linear transformation is applied to create the key values.", + "type": "comment" + }, + "804": { + "file_id": 44, + "content": " self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n if 'a' in os.environ[\"RWKV_MY_TESTING\"]:\n self.register_buffer(\"att_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n d_qkv = args.n_embd // 16\n self.qq = nn.Linear(args.n_embd, d_qkv, bias=False)\n self.kk = nn.Linear(args.n_embd, d_qkv, bias=False)\n self.vv = nn.Linear(args.n_embd, d_qkv, bias=False)\n self.oo = nn.Linear(d_qkv, args.n_embd, bias=False)\n with torch.no_grad():\n self.time_mix_qq = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_kk = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_vv = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n if 'a' not in os.environ[\"RWKV_MY_TESTING\"]:\n @MyFunction", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:493-510" + }, + "805": { + "file_id": 44, + "content": "This code initializes layers for a transformer model and includes optional testing-specific buffers and parameters. 
The 'MyFunction' decorator suggests additional functionality is defined elsewhere.", + "type": "comment" + }, + "806": { + "file_id": 44, + "content": " def jit_func(self, x):\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n sr = torch.sigmoid(r)\n return sr, k, v\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n sr, k, v = self.jit_func(x)\n rwkv = sr * RUN_CUDA(B, T, self.args.dim_att, self.time_decay, self.time_first, k, v)\n return self.output(rwkv)\n if 'a' in os.environ[\"RWKV_MY_TESTING\"]:\n @MyFunction\n def QKV(self, q, k, v):\n att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))\n att = att.masked_fill(self.att_mask == 0, float('-inf'))\n att = F.softmax(att, dim = -1)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:511-533" + }, + "807": { + "file_id": 44, + "content": "This function defines jit_func, which mixes the current timestep with the previous one for x, k, and v. Then forward function calculates RWKV using these variables and applies softmax to attentions. 
QKV function computes attention scores between query (q) and key (k), masking zero-valued elements and applying softmax along dim=-1.", + "type": "comment" + }, + "808": { + "file_id": 44, + "content": " x = att @ v\n return x\n @MyFunction\n def jit_funcQKV(self, x):\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xqq = x * self.time_mix_qq + xx * (1 - self.time_mix_qq)\n xkk = x * self.time_mix_kk + xx * (1 - self.time_mix_kk)\n xvv = x * self.time_mix_vv + xx * (1 - self.time_mix_vv)\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n sr = torch.sigmoid(r)\n qq = self.qq(xqq)\n kk = self.kk(xkk)\n vv = self.vv(xvv)\n return sr, k, v, qq, kk, vv\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n sr, k, v, qq, kk, vv = self.jit_funcQKV(x)\n rwkv = sr * RUN_CUDA(B, T, self.args.dim_att, self.time_decay, self.time_first, k, v)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:534-558" + }, + "809": { + "file_id": 44, + "content": "The code defines a function `jit_funcQKV` that takes an input tensor `x`, applies time shifting and mixing, then extracts key, value, receptance, qq, kk, vv from the result. The forward function reshapes input tensor `x` to (Batch, Time, Channel) and calls `jit_funcQKV` to compute sr, k, v, qq, kk, vv. 
It then uses these outputs with another function `RUN_CUDA` to calculate the final output rwkv.", + "type": "comment" + }, + "810": { + "file_id": 44, + "content": " rwkv = self.output(rwkv) + self.oo(self.QKV(qq, kk, vv))\n return rwkv\n########################################################################################################\nclass RWKV_ChannelMix(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.key = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.receptance = nn.Linear(args.n_embd, args.n_embd, bias=False)\n self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:559-581" + }, + "811": { + "file_id": 44, + "content": "The code defines a class `RWKV_ChannelMix` that extends the `MyModule` class. 
It initializes the layer with parameters based on the input arguments and layer ID, and includes time mixing and linear layers for key, receptance, and value operations.", + "type": "comment" + }, + "812": { + "file_id": 44, + "content": " @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.relu(k) ** 2\n kv = self.value(k)\n return torch.sigmoid(self.receptance(xr)) * kv\nclass MishGLU(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad():\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer)\n x = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n x[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.aa = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.bb = nn.Linear(args.n_embd, args.dim_ffn, bias=False)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:583-610" + }, + "813": { + "file_id": 44, + "content": "The code defines a class \"MishGLU\" that inherits from \"MyModule\". It initializes the object with parameters and layers. The class contains a \"forward\" method which performs time shifting, mixing, and calculations using various linear layers and functions. It uses parameters that depend on the layer ID and number of layers for time-based mixing. 
The output is obtained through a combination of sigmoid and value calculation.", + "type": "comment" + }, + "814": { + "file_id": 44, + "content": " self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)\n @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xa = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xb = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n a = self.aa(xa)\n b = self.bb(xb)\n return self.value(a * F.mish(b))\n########################################################################################################\n# The RWKV Model with our blocks\n########################################################################################################\nclass Block(nn.Module):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(args.n_embd)\n self.ln2 = nn.LayerNorm(args.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(args.n_embd)\n if args.my_pos_emb > 0:\n self.pos_emb_x = nn.Parameter(torch.zeros((1,args.my_pos_emb,args.n_embd)))", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:611-639" + }, + "815": { + "file_id": 44, + "content": "The code defines a model with a block class. It contains linear layers, layer normalization, and Mish activation function. The block has different layer norms for each stage, with the first layer having an additional normalization layer. 
There is also a parameter for positional embedding if enabled in the arguments.", + "type": "comment" + }, + "816": { + "file_id": 44, + "content": " self.pos_emb_y = nn.Parameter(torch.zeros((args.my_pos_emb,1,args.n_embd)))\n if self.layer_id == 0 and self.args.pre_ffn > 0:\n self.ffnPre = RWKV_ChannelMix(args, 0)\n else:\n if 'r4' in os.environ[\"RWKV_MY_TESTING\"]:\n self.att = RWKV_TimeMix_RWKV5(args, layer_id)\n elif 'r' in os.environ[\"RWKV_MY_TESTING\"]:\n self.att = RWKV_TimeMix_RWKV5_Preview(args, layer_id)\n else:\n self.att = RWKV_TimeMix(args, layer_id)\n if 'g' in os.environ[\"RWKV_MY_TESTING\"]:\n self.ffn = MishGLU(args, layer_id)\n else:\n self.ffn = RWKV_ChannelMix(args, layer_id)\n if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n self.tiny_ln = nn.LayerNorm(args.n_embd)\n self.tiny_q = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_k = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_v = nn.Linear(args.n_embd, args.n_embd, bias=False)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:640-661" + }, + "817": { + "file_id": 44, + "content": "The code initializes the model's parameters, creates layers for position embedding and various attention mechanisms based on environment variables, and includes a MishGLU or RWKV_ChannelMix layer depending on the environment variable. If 'g' is in the environment variable, it uses a different ffn (fully connected feedforward) layer. 
Additionally, if args.tiny\\_att\\_dim is greater than 0 and the current layer matches args.tiny\\_att\\_layer, it adds specific layers for tiny attention with LayerNorm and linear layers for query, key, and value.", + "type": "comment" + }, + "818": { + "file_id": 44, + "content": " self.register_buffer(\"tiny_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)\n self.drop1 = nn.Dropout(p = args.dropout)\n def forward(self, x, x_emb=None):\n args = self.args\n B, T, C = x.size()\n if self.layer_id == 0:\n x = self.ln0(x)\n if args.my_pos_emb > 0:\n pos_emb = (self.pos_emb_x + self.pos_emb_y).reshape(T+1, -1)[:-1,:]\n x = x + pos_emb\n if self.args.dropout == 0:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n else:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = self.drop0(x + self.ffnPre(self.ln1(x)))\n else:\n x = self.drop0(x + self.att(self.ln1(x)))\n x = self.drop1(x + self.ffn(self.ln2(x)))", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:662-688" + }, + "819": { + "file_id": 44, + "content": "The code defines a forward function for a model and includes dropout layers, LayerNorm (lnX), attention layer (att), feed-forward network (ffnPre and ffn), position embedding (pos_emb_x and pos_emb_y), and a triangular mask (tiny_mask). 
It applies the layers based on input parameters like dropout rate, pre-ffn flag, and layer index.", + "type": "comment" + }, + "820": { + "file_id": 44, + "content": " if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n xx = self.tiny_ln(x)\n q = self.tiny_q(xx)[:, :T, :]\n k = self.tiny_k(xx)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (args.tiny_att_dim ** (-0.5))\n c = c.masked_fill(self.tiny_mask[:T, :T] == 0, 0)\n x = x + c @ self.tiny_v(x_emb)\n return x\nclass L2Wrap(torch.autograd.Function):\n @staticmethod\n def forward(ctx, loss, y):\n ctx.save_for_backward(y)\n return loss\n @staticmethod\n def backward(ctx, grad_output):\n y = ctx.saved_tensors[0]\n # to encourage the logits to be close to 0\n factor = 1e-4 / (y.shape[0] * y.shape[1])\n maxx, ids = torch.max(y, -1, keepdim=True)\n gy = torch.zeros_like(y)\n gy.scatter_(-1, ids, maxx * factor)\n return (grad_output, gy)\nclass RWKV(pl.LightningModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n if not hasattr(args, 'dim_att'):", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:690-721" + }, + "821": { + "file_id": 44, + "content": "The code is implementing an attention mechanism in the RWKV model. It checks if the layer id matches a specific layer and if a smaller attention dimension is desired. If both conditions are met, it applies the scaled dot product attention, masks out padding tokens, and adds the result to the input. 
The L2Wrap function is used to implement an L2 regularization term in the loss computation.", + "type": "comment" + }, + "822": { + "file_id": 44, + "content": " args.dim_att = args.n_embd\n if not hasattr(args, 'dim_ffn'):\n args.dim_ffn = args.n_embd * 4\n if not hasattr(args, 'tiny_att_layer'):\n args.tiny_att_layer = -1\n if not hasattr(args, 'tiny_att_dim'):\n args.tiny_att_dim = -1\n assert args.n_embd % 32 == 0\n assert args.dim_att % 32 == 0\n assert args.dim_ffn % 32 == 0\n self.emb = nn.Embedding(args.vocab_size, args.n_embd)\n self.blocks = nn.ModuleList([Block(args, i) for i in range(args.n_layer)])\n self.ln_out = nn.LayerNorm(args.n_embd)\n self.head = nn.Linear(args.n_embd, args.vocab_size, bias=False)\n if args.head_qk > 0:\n self.head_q = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.head_k = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.register_buffer(\"copy_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:722-745" + }, + "823": { + "file_id": 44, + "content": "This code initializes a transformer model by setting attributes based on argument values, creating embedding and linear layers, and organizing blocks within the model. Assertions ensure that certain dimensions are multiples of 32 for optimization purposes. If certain attributes aren't set, the code assigns default values. 
Additionally, if dropout or key/query projection is enabled, corresponding layers are created.", + "type": "comment" + }, + "824": { + "file_id": 44, + "content": " def configure_optimizers(self):\n args = self.args\n lr_decay = set()\n lr_1x = set()\n lr_2x = set()\n lr_3x = set()\n for n, p in self.named_parameters():\n if (\"time_mix\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_decay\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_3x.add(n)\n else:\n lr_2x.add(n)\n elif (\"time_faaaa\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_first\" in n) and (args.layerwise_lr > 0):\n lr_3x.add(n)\n elif (len(p.squeeze().shape) >= 2) and (args.weight_decay > 0):\n lr_decay.add(n)\n else:\n lr_1x.add(n)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:747-775" + }, + "825": { + "file_id": 44, + "content": "This function sets up optimizers based on parameter names and provided arguments. 
It categorizes parameters into different learning rate groups: 1x, 2x, or 3x multipliers for time-related layers; decay rate for weight decay; and all others with a single learning rate.", + "type": "comment" + }, + "826": { + "file_id": 44, + "content": " lr_decay = sorted(list(lr_decay))\n lr_1x = sorted(list(lr_1x))\n lr_2x = sorted(list(lr_2x))\n lr_3x = sorted(list(lr_3x))\n # print('decay', lr_decay)\n # print('1x', lr_1x)\n # print('2x', lr_2x)\n # print('3x', lr_3x)\n param_dict = {n: p for n, p in self.named_parameters()}\n if args.layerwise_lr > 0:\n if args.my_pile_stage == 2:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 2e-3 / args.lr_init},\n {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 3e-3 / args.lr_init},\n ]\n else:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 2.0},", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:777-797" + }, + "827": { + "file_id": 44, + "content": "This code is initializing optimizer groups for layerwise learning rate (LR) in a neural network model. It sorts the LR values and creates separate optimizer groups for each level of scaling, assigning different layers to each group based on their corresponding scaling factor. The layer-specific scaling is applied when args.my_pile_stage == 2, otherwise, it uses a different set of scales. 
This allows for more fine-grained control over the learning rates for different layers during training.", + "type": "comment" + }, + "828": { + "file_id": 44, + "content": " {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 3.0},\n ]\n else:\n optim_groups = [{\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0}]\n if args.weight_decay > 0:\n optim_groups += [{\"params\": [param_dict[n] for n in lr_decay], \"weight_decay\": args.weight_decay, \"my_lr_scale\": 1.0}]\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=True, amsgrad=False)\n return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=True, amsgrad=False)\n else:\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=False, weight_decay=0, amsgrad=False)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:798-810" + }, + "829": { + "file_id": 44, + "content": "This code initializes optimizer groups for model parameters based on learning rates and weight decay. If weight decay is provided, it adds separate optimization group with the specified weight decay. 
Depending on Deepspeed offload flag, it returns either DeepSpeedCPUAdam or FusedAdam optimizer instances.", + "type": "comment" + }, + "830": { + "file_id": 44, + "content": " return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)\n # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)\n @property\n def deepspeed_offload(self) -> bool:\n strategy = self.trainer.strategy\n if isinstance(strategy, DeepSpeedStrategy):\n cfg = strategy.config[\"zero_optimization\"]\n return cfg.get(\"offload_optimizer\") or cfg.get(\"offload_param\")\n return False\n def forward(self, idx):\n args = self.args\n B, T = idx.size()\n assert T <= args.ctx_len, \"Cannot forward, model ctx_len is exhausted.\"\n x = self.emb(idx)\n x_emb = x\n if args.dropout > 0:\n x = self.drop0(x)\n if args.tiny_att_dim > 0:\n for block in self.blocks:\n if args.grad_cp == 1:", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:811-834" + }, + "831": { + "file_id": 44, + "content": "This code snippet defines a model that uses either FusedAdam or ZeroOneAdam optimizer based on the deepspeed_offload property. The function forward takes an index and processes it through embedding, potentially dropout, and then passes it to multiple blocks if grad_cp is 1. 
The deepspeed_offload property checks a DeepSpeedStrategy configuration for offloading settings.", + "type": "comment" + }, + "832": { + "file_id": 44, + "content": " x = deepspeed.checkpointing.checkpoint(block, x, x_emb)\n else:\n x = block(x, x_emb)\n else:\n for block in self.blocks:\n if args.grad_cp == 1:\n x = deepspeed.checkpointing.checkpoint(block, x)\n else:\n x = block(x)\n x = self.ln_out(x)\n if args.head_qk > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / args.head_qk)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n c = c @ F.one_hot(idx, num_classes=args.vocab_size)\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).bfloat16()\n x = self.head(x) + c", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:835-860" + }, + "833": { + "file_id": 44, + "content": "This code is responsible for handling the checkpointing and layer execution within a specific block in the RWKV model. 
It also handles head attention calculations and applies one-hot encoding based on the floating-point mode environment variable.", + "type": "comment" + }, + "834": { + "file_id": 44, + "content": " else:\n x = self.head(x)\n return x\n def training_step(self, batch, batch_idx):\n args = self.args\n if args.my_qa_mask != 1:\n idx, targets = batch\n logits = self(idx)\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # if '0' in os.environ[\"RWKV_MY_TESTING\"]:\n # print('logits', logits)\n # torch.set_printoptions(threshold=10000)\n # print('idx', idx)\n # exit(0)\n else:\n idx, targets, mask = batch\n mask = mask.view(-1)\n sum_mask = torch.sum(mask).item()\n # if sum_mask == 0:\n # return torch.tensor([0.0], requires_grad=True)\n logits = self(idx)\n if sum_mask == mask.shape[0]:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # print('rank', self.global_rank, 'loss', loss.item())\n else:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none')", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:861-889" + }, + "835": { + "file_id": 44, + "content": "This code contains two methods: '__call__' and 'training_step'. The '__call__' method performs a forward pass through the model by calling the 'head' method if not in training mode, otherwise it returns the input 'x' as-is. In the 'training_step' method, it checks the argument 'args.my_qa_mask' to determine whether to perform training or not. If 'args.my_qa_mask != 1', it calculates and returns a loss using cross entropy loss function. 
If 'args.my_qa_mask == 1', it applies a mask to the targets during training.", + "type": "comment" + }, + "836": { + "file_id": 44, + "content": " # loss_raw = loss\n loss = torch.sum(loss * mask) / sum_mask\n # torch.set_printoptions(threshold=10000)\n # if True: #self.global_rank == 1:\n # tmp = ''\n # sss = 0\n # ccc = 0\n # for i in range(mask.shape[0]):\n # if mask[i] > 0:\n # tmp += str(idx.view(-1)[i].item()) + ','\n # sss += loss_raw.view(-1)[i].float().item()\n # ccc += 1\n # print('rank', self.global_rank, 'loss', loss.item(), 'lavg', sss / ccc)#, 'tmp', tmp, 'input', idx)\n return L2Wrap.apply(loss, logits)\n def training_step_end(self, batch_parts):\n if pl.__version__[0]!='2':\n all = self.all_gather(batch_parts)\n if self.trainer.is_global_zero:\n self.trainer.my_loss_all = all\n def generate_init_weight(self):\n print(\n f\"\"\"\n############################################################################", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:890-916" + }, + "837": { + "file_id": 44, + "content": "This code is defining the model, loss calculation, and training step functions for a neural network. It calculates the loss based on a masked sum of the loss_raw values, and then performs an all-gather operation to collect the losses from all ranks before proceeding to the next steps. The output is wrapped with L2 norm using the apply function. The generate_init_weight function prints some information at the start of training.", + "type": "comment" + }, + "838": { + "file_id": 44, + "content": "#\n# Init model weight (slow for large models)...\n#\n############################################################################\n\"\"\"\n )\n m = {}\n for n in self.state_dict():\n p = self.state_dict()[n]\n shape = p.shape\n gain = 1.0\n scale = 1.0\n if \"ln_\" in n or \".ln\" in n or \"time_\" in n or \"_mask\" in n or \"pos_emb\" in n or '.mask.' 
in n:\n if 'ln_x.weight' in n:\n layer_scale = (1+int(n.split('.')[1])) / self.args.n_layer\n m[n] = (p * 0.0) + (layer_scale ** 0.7)\n else:\n m[n] = p\n else:\n if n == \"emb.weight\":\n scale = -1 * self.args.lr_init\n else:\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if 'r' in os.environ[\"RWKV_MY_TESTING\"]:\n zero = [\".att.output.\", \".ffn.value.\", \".ffn.receptance.\", \".ffnPre.value.\", \".ffnPre.receptance.\", \"head_q.\", '.oo.', '.rr.']", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:917-943" + }, + "839": { + "file_id": 44, + "content": "Initializing model weights for large models is slow, so the code handles this process by iterating over each item in self.state_dict(). For certain specific named parameters (like 'ln_x.weight', '.att.output.', etc.), it sets values accordingly. Otherwise, it simply copies the original values. The lr_init is used to set scale and gain based on the shape of the parameter. If the environment variable RWKV_MY_TESTING contains 'r', certain parameters are excluded from initialization.", + "type": "comment" + }, + "840": { + "file_id": 44, + "content": " else:\n zero = [\".att.key.\", \".att.receptance.\", \".att.output.\", \".ffn.value.\", \".ffn.receptance.\", \".ffnPre.value.\", \".ffnPre.receptance.\", \"head_q.\", '.oo.', '.rr.']\n for kk in zero:\n if kk in n:\n scale = 0\n if n == \"head.weight\":\n scale = 0.5\n if \"head_k.\" in n:\n scale = 0.1\n if \"head_q.\" in n:\n scale = 0\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {str(scale).ljust(4)} {n}\")\n if self.args.accelerator.upper() == \"GPU\":\n m[n] = torch.empty((shape[0], shape[1]), device=\"cuda\")\n else:\n m[n] = torch.empty((shape[0], shape[1]))\n if scale == 0:\n nn.init.zeros_(m[n])\n elif scale < 0:\n nn.init.uniform_(m[n], a=scale, b=-scale)", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:944-966" + }, + "841": { + "file_id": 44, + "content": "This code checks the variable 'n' to 
determine its scale factor. If 'n' matches a specific set of keys, it sets the scale to 0 (zero initialization), 0.5, or 0.1. It then creates an empty tensor based on GPU accelerator settings and initializes it with zero or uniform values depending on the scale.", + "type": "comment" + }, + "842": { + "file_id": 44, + "content": " else:\n nn.init.orthogonal_(m[n], gain=gain * scale)\n m[n] = m[n].cpu()\n if os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n m[n] = m[n].half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n m[n] = m[n].bfloat16()\n # if n == \"emb.weight\":\n # print(m[n])\n gc.collect()\n torch.cuda.empty_cache()\n return m", + "type": "code", + "location": "/RWKV-v4neo/src/model.py:967-981" + }, + "843": { + "file_id": 44, + "content": "This code initializes the model's weights using orthogonal initialization with a specified gain and scale. It then moves the weights to CPU and converts them based on the RWKV_FLOAT_MODE environment variable. Finally, it collects garbage and empties the CUDA cache before returning the updated model parameters.", + "type": "comment" + }, + "844": { + "file_id": 45, + "content": "/RWKV-v4neo/src/model_img.py", + "type": "filepath" + }, + "845": { + "file_id": 45, + "content": "The code utilizes DeepSpeed, VGG16 and RWKV-LM layers, VGG-19 pretrained features, L2 pooling layers, calculates distances for score calculation, and defines the RWKV-v4neo language model with Conv2d layers, BatchNorm2d, Mish activation function, LightningModule, optimizer, and encoder-decoder architecture for multi-device training support.", + "type": "summary" + }, + "846": { + "file_id": 45, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport os, math, gc\nimport torch\nimport 
torch.nn as nn\nimport torch.nn.functional as F\nimport torchvision as vision\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\nfrom pytorch_lightning.strategies import DeepSpeedStrategy\nimport deepspeed\nfrom deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n# from pytorch_msssim import MS_SSIM\ndef __nop(ob):\n return ob\nMyModule = torch.jit.ScriptModule\n# MyFunction = __nop\nMyFunction = torch.jit.script_method\nimport clip\nfrom transformers import CLIPModel\nclass L2pooling(nn.Module):\n def __init__(self, filter_size=5, stride=2, channels=None, pad_off=0):\n super(L2pooling, self).__init__()\n self.padding = (filter_size - 2) // 2", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:1-30" + }, + "847": { + "file_id": 45, + "content": "This code imports necessary libraries and defines functions and modules for a machine learning model. It uses DeepSpeed for efficient training and includes functions like L2pooling for data processing.", + "type": "comment" + }, + "848": { + "file_id": 45, + "content": " self.stride = stride\n self.channels = channels\n a = np.hanning(filter_size)[1:-1]\n g = torch.Tensor(a[:, None] * a[None, :])\n g = g / torch.sum(g)\n self.register_buffer(\n \"filter\", g[None, None, :, :].repeat((self.channels, 1, 1, 1))\n )\n def forward(self, input):\n input = input**2\n out = F.conv2d(\n input,\n self.filter,\n stride=self.stride,\n padding=self.padding,\n groups=input.shape[1],\n )\n return (out + 1e-12).sqrt()\nclass DISTS(torch.nn.Module):\n def __init__(self, load_weights=True):\n super(DISTS, self).__init__()\n vgg_pretrained_features = vision.models.vgg16(\n weights=\"VGG16_Weights.IMAGENET1K_V1\"\n ).features\n self.stage1 = torch.nn.Sequential()\n self.stage2 = torch.nn.Sequential()\n self.stage3 = torch.nn.Sequential()\n self.stage4 = torch.nn.Sequential()\n self.stage5 = torch.nn.Sequential()\n for x in range(0, 4):", + "type": "code", + "location": 
"/RWKV-v4neo/src/model_img.py:31-63" + }, + "849": { + "file_id": 45, + "content": "This code defines a class for a neural network model. The model has stages, and each stage contains convolutional layers from the VGG16 model followed by a custom layer called RWKV-LM/RWKV-v4neo/src/model_img.py:30-62. The custom layer applies a Hanning window function to the filter, then normalizes it, and repeats it for each channel. Finally, it performs convolution with the input image using specified stride, padding, and groups.", + "type": "comment" + }, + "850": { + "file_id": 45, + "content": " self.stage1.add_module(str(x), vgg_pretrained_features[x])\n self.stage2.add_module(str(4), L2pooling(channels=64))\n for x in range(5, 9):\n self.stage2.add_module(str(x), vgg_pretrained_features[x])\n self.stage3.add_module(str(9), L2pooling(channels=128))\n for x in range(10, 16):\n self.stage3.add_module(str(x), vgg_pretrained_features[x])\n self.stage4.add_module(str(16), L2pooling(channels=256))\n for x in range(17, 23):\n self.stage4.add_module(str(x), vgg_pretrained_features[x])\n self.stage5.add_module(str(23), L2pooling(channels=512))\n for x in range(24, 30):\n self.stage5.add_module(str(x), vgg_pretrained_features[x])\n self.register_buffer(\n \"mean\", torch.tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1)\n )\n self.register_buffer(\n \"std\", torch.tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1)\n )\n self.chns = [3, 64, 128, 256, 512, 512]\n self.register_buffer(", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:64-86" + }, + "851": { + "file_id": 45, + "content": "The code creates a model architecture by adding modules to the stages of the network. It uses VGG-19 pretrained features for each stage and adds L2 pooling layers in between. 
The mean and std values are registered as buffers, and a list of channel numbers is created.", + "type": "comment" + }, + "852": { + "file_id": 45, + "content": " \"alpha\", nn.Parameter(torch.randn(1, sum(self.chns), 1, 1))\n )\n self.register_buffer(\"beta\", nn.Parameter(torch.randn(1, sum(self.chns), 1, 1)))\n self.alpha.data.normal_(0.1, 0.01)\n self.beta.data.normal_(0.1, 0.01)\n weights = torch.load(\"test/DISTS_weights.pt\")\n self.alpha.data = weights[\"alpha\"]\n self.beta.data = weights[\"beta\"]\n for param in self.parameters():\n param.requires_grad = False\n def forward_once(self, x):\n h = (x - self.mean) / self.std\n h = self.stage1(h)\n h_relu1_2 = h\n h = self.stage2(h)\n h_relu2_2 = h\n h = self.stage3(h)\n h_relu3_3 = h\n h = self.stage4(h)\n h_relu4_3 = h\n h = self.stage5(h)\n h_relu5_3 = h\n return [x, h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3]\n def forward(self, x, y, require_grad=False, batch_average=False):\n if require_grad:\n feats0 = self.forward_once(x)\n feats1 = self.forward_once(y)", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:87-116" + }, + "853": { + "file_id": 45, + "content": "The code initializes two parameters, \"alpha\" and \"beta\", with random values and normalizes their data. It then loads weights from a file and assigns them to the respective parameters. 
Finally, it sets the gradient flag to False for all parameters and defines forward functions to perform calculations on input features.", + "type": "comment" + }, + "854": { + "file_id": 45, + "content": " else:\n with torch.no_grad():\n feats0 = self.forward_once(x)\n feats1 = self.forward_once(y)\n dist1 = 0\n dist2 = 0\n c1 = 1e-6\n c2 = 1e-6\n w_sum = self.alpha.sum() + self.beta.sum()\n alpha = torch.split(self.alpha / w_sum, self.chns, dim=1)\n beta = torch.split(self.beta / w_sum, self.chns, dim=1)\n for k in range(len(self.chns)):\n x_mean = feats0[k].mean([2, 3], keepdim=True)\n y_mean = feats1[k].mean([2, 3], keepdim=True)\n S1 = (2 * x_mean * y_mean + c1) / (x_mean**2 + y_mean**2 + c1)\n dist1 = dist1 + (alpha[k] * S1).sum(1, keepdim=True)\n x_var = ((feats0[k] - x_mean) ** 2).mean([2, 3], keepdim=True)\n y_var = ((feats1[k] - y_mean) ** 2).mean([2, 3], keepdim=True)\n xy_cov = (feats0[k] * feats1[k]).mean(\n [2, 3], keepdim=True\n ) - x_mean * y_mean\n S2 = (2 * xy_cov + c2) / (x_var + y_var + c2)\n dist2 = dist2 + (beta[k] * S2).sum(1, keepdim=True)", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:117-141" + }, + "855": { + "file_id": 45, + "content": "This code calculates the distances between two feature embeddings and assigns weights to them based on alpha and beta. It first performs a forward pass for x and y, then normalizes alpha and beta by dividing their sums with the total number of channels. For each channel, it computes the mean and variance of x and y, and also calculates the covariance between x and y. 
Finally, it applies weights to the distances and sums them up for both embeddings.", + "type": "comment" + }, + "856": { + "file_id": 45, + "content": " score = 1 - (dist1 + dist2).squeeze()\n if batch_average:\n return score.mean()\n else:\n return score\n class ToBinary(torch.autograd.Function):\n @staticmethod\n def forward(ctx, x):#, noise_scale):\n # if noise_scale > 0:\n # noise_min = 0.5 - noise_scale / 2\n # noise_max = 0.5 + noise_scale / 2\n # return torch.floor(x + torch.empty_like(x).uniform_(noise_min, noise_max))\n # else:\n return torch.floor(x + 0.5) # no need for noise when we have plenty of data\n @staticmethod\n def backward(ctx, grad_output):\n return grad_output.clone()#, None\n########################################################################################################\nclass R_ENCODER(MyModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.Bxx = nn.BatchNorm2d(dd*64)\n self.CIN = nn.Conv2d(3, dd, kernel_size=3, padding=1)", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:143-173" + }, + "857": { + "file_id": 45, + "content": "This code contains a function that calculates a score based on distances and returns it. If batch_average is True, the score is averaged across all elements in the batch. The ToBinary class performs binary rounding of input values. 
The R_ENCODER class initializes a model with BatchNorm2d and Conv2d layers for image processing.", + "type": "comment" + }, + "858": { + "file_id": 45, + "content": " self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*4)\n self.C00 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.B20 = nn.BatchNorm2d(dd*64)\n self.C20 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C23 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:174-193" + }, + "859": { + "file_id": 45, + "content": "This code defines multiple convolutional layers (Conv2d) and batch normalization layers (BatchNorm2d) for a neural network model. 
The layers have different input/output dimensions, kernel sizes, and padding values to perform feature extraction and normalization in the model's architecture.", + "type": "comment" + }, + "860": { + "file_id": 45, + "content": " # self.B21 = nn.BatchNorm2d(dd*64)\n # self.C24 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C25 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n # self.C26 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C27 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd*64, args.my_img_bit, kernel_size=3, padding=1)\n @MyFunction\n def forward(self, img):\n ACT = F.mish\n x = self.CIN(img)\n xx = self.Bxx(F.pixel_unshuffle(x, 8))\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n # x = x + self.C25(ACT(self.C24(ACT(self.B21(x)))))", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:194-221" + }, + "861": { + "file_id": 45, + "content": "This code defines a forward function for a neural network. It uses Mish activation functions and applies convolutional layers with batch normalization for feature extraction and image processing. 
The output is generated by combining the outputs of multiple convolutional layers, and pixel unshuffling is used to change the channel dimension.", + "type": "comment" + }, + "862": { + "file_id": 45, + "content": " # x = x + self.C27(ACT(self.C26(x)))\n x = self.COUT(x + xx)\n return torch.sigmoid(x)\n########################################################################################################\nclass R_DECODER(MyModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.CIN = nn.Conv2d(args.my_img_bit, dd*64, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*64)\n self.C00 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n # self.B01 = nn.BatchNorm2d(dd*64)\n # self.C04 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C05 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n # self.C06 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C07 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:222-245" + }, + "863": { + "file_id": 45, + "content": "The given code is a part of the RWKV model implementation. It defines the Decoder class which takes in arguments and initializes multiple Conv2d layers for processing. The Conv2d layers are responsible for feature extraction and down-sampling. The batch normalization layer (BatchNorm2d) helps with speeding up the training and improving model performance by reducing internal covariate shift. However, there is a comment suggesting that the BatchNorm2d could be removed or reduced to improve performance and speed. 
Additionally, some of the Conv2d layers are commented out, indicating they might be unused or under development.", + "type": "comment" + }, + "864": { + "file_id": 45, + "content": " self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.B20 = nn.BatchNorm2d(dd*4)\n self.C20 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C23 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd, 3, kernel_size=3, padding=1)\n @MyFunction\n def forward(self, code):\n ACT = F.mish\n x = self.CIN(code)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n # x = x + self.C05(ACT(self.C04(ACT(self.B01(x)))))", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:247-270" + }, + "865": { + "file_id": 45, + "content": "This code defines a model for the RWKV-v4neo language model. It includes multiple convolutional and batch normalization layers, as well as the Mish activation function (F.mish). 
The forward function applies these layers to an input code and performs addition operations between different layer outputs.", + "type": "comment" + }, + "866": { + "file_id": 45, + "content": " # x = x + self.C07(ACT(self.C06(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = self.COUT(x)\n return torch.sigmoid(x)\n########################################################################################################`\ndef cosine_loss(x, y):\n x = F.normalize(x, dim=-1)\n y = F.normalize(y, dim=-1)\n return 1 - torch.einsum('ij,ij->i',[x,y])\nclass RWKV_IMG(pl.LightningModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n self.encoder = R_ENCODER(args)\n self.decoder = R_DECODER(args)\n self.clip_model = None\n clip_name = args.my_img_clip\n if clip_name == 'B32':\n clip_name = 'ViT-B/32'\n elif clip_name == 'B16':", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:271-306" + }, + "867": { + "file_id": 45, + "content": "This code snippet belongs to a LightningModule class in the RWKV-v4neo package. It includes a cosine_loss function and an RWKV_IMG class which has an encoder, decoder, and clip_model as its components. The encoder and decoder are instances of R_ENCODER and R_DECODER classes respectively. 
The code snippet defines operations to be performed on the input x using various transformations and normalizations before returning a sigmoid transformed output.", + "type": "comment" + }, + "868": { + "file_id": 45, + "content": " clip_name = 'ViT-B/16'\n elif clip_name == 'L14':\n clip_name = 'ViT-L/14'\n elif clip_name == 'OB32':\n clip_name = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n self.clip_model = CLIPModel.from_pretrained(clip_name)\n self.clip_model.encode_image = self.clip_model.get_image_features\n if self.clip_model == None:\n self.clip_model, _ = clip.load(clip_name, jit = True)\n self.register_buffer(\n \"clip_mean\", torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1)\n )\n self.register_buffer(\n \"clip_std\", torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1)\n )\n for n, p in self.named_parameters():\n if 'clip_model' in n:\n p.requires_grad = False\n self.loss_dists = DISTS()\n # self.loss_ssim = MS_SSIM(data_range=1, size_average=True, channel=3)\n def configure_optimizers(self):\n args = self.args\n optim_groups = [", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:307-332" + }, + "869": { + "file_id": 45, + "content": "In the provided code snippet, the `clip_name` is assigned based on certain conditions. If it's 'ViT-B/16', no change. If 'L14', it's changed to 'ViT-L/14'. And if 'OB32', it becomes \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\" and a pretrained CLIP model is loaded for this case. 
The code also initializes `clip_mean` and `clip_std` buffers with specific values, sets parameters of 'clip_model' as non-trainable, and configures the optimizers based on 'args'.", + "type": "comment" + }, + "870": { + "file_id": 45, + "content": " {\"params\": [p for n, p in self.named_parameters()], \"weight_decay\": 0.0},\n ]\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(\n optim_groups,\n lr=self.args.lr_init,\n betas=self.args.betas,\n eps=self.args.adam_eps,\n bias_correction=True,\n adamw_mode=False,\n weight_decay=0,\n amsgrad=False,\n )\n return FusedAdam(\n optim_groups,\n lr=self.args.lr_init,\n betas=self.args.betas,\n eps=self.args.adam_eps,\n bias_correction=True,\n adam_w_mode=False,\n weight_decay=0,\n amsgrad=False,\n )\n # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)\n @property\n def deepspeed_offload(self) -> bool:\n strategy = self.trainer.strategy", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:333-360" + }, + "871": { + "file_id": 45, + "content": "This code defines an optimizer function that chooses between DeepSpeedCPUAdam and FusedAdam based on the deepspeed_offload flag. The optimizer takes in optim_groups, lr (learning rate), betas, eps (epsilon), bias_correction, adam_w_mode, weight_decay, and amsgrad as parameters. It returns an instance of either DeepSpeedCPUAdam or FusedAdam depending on whether deepspeed_offload is True or False. 
The @property method deepspeed_offload retrieves the strategy from the trainer.", + "type": "comment" + }, + "872": { + "file_id": 45, + "content": " if isinstance(strategy, DeepSpeedStrategy):\n config = strategy.config[\"zero_optimization\"]\n return config.get(\"offload_optimizer\") or config.get(\"offload_param\")\n return False\n def forward(self, img):\n z = self.encoder(img)\n z = ToBinary.apply(z)#, self.args.my_img_noise_scale)\n out = self.decoder(z)\n return out\n def training_step(self, batch, batch_idx):\n args = self.args\n img, txt = batch\n out = self(img)\n if self.trainer.is_global_zero:\n if (self.trainer.global_step + 1) % (100 * int(args.devices)) == 0:\n img_dir = f\"test/image_model/{args.run_name}\"\n if not os.path.exists(img_dir):\n os.makedirs(img_dir)\n vision.utils.save_image(\n img[:4], f\"{img_dir}/{self.trainer.global_step}-src.jpg\"#, padding=0\n )\n vision.utils.save_image(\n out[:4], f\"{img_dir}/{self.trainer.global_step}-out.jpg\"#, padding=0", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:361-385" + }, + "873": { + "file_id": 45, + "content": "This code defines a model class that takes an image as input and outputs an image. It uses an encoder and decoder for processing the input. 
The model also has a training step where it saves images at specific global steps during training if running on multiple devices.", + "type": "comment" + }, + "874": { + "file_id": 45, + "content": " )\n # loss_ssim = 1 - self.loss_ssim(out, img)\n loss_dists = self.loss_dists(out, img, require_grad=True, batch_average=True)\n iii = self.clip_model.encode_image((img - self.clip_mean) / self.clip_std)\n ooo = self.clip_model.encode_image((out - self.clip_mean) / self.clip_std)\n loss_clip = torch.mean(cosine_loss(iii, ooo))\n if args.my_img_l1_scale > 0:\n loss_l1 = F.l1_loss(out, img)\n return loss_dists + loss_clip * args.my_img_clip_scale + loss_l1 * args.my_img_l1_scale\n else:\n return loss_dists + loss_clip * args.my_img_clip_scale\n def training_step_end(self, batch_parts):\n all = self.all_gather(batch_parts)\n if self.trainer.is_global_zero:\n self.trainer.my_loss_all = all\n def generate_init_weight(self):\n print(\n f\"\"\"\n############################################################################\n#\n# Init model weight (slow for large models)...\n#\n############################################################################", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:386-413" + }, + "875": { + "file_id": 45, + "content": "This code snippet is for a deep learning model that takes input image and produces output, then calculates loss for each step during training. It uses different types of losses such as SSIM (line 396) and cosine similarity (lines 390-395). 
The code also handles global average pooling and all-gathering for distributed training.\n\nQuestion: What is the purpose of the 'args' argument used in this code?", + "type": "comment" + }, + "876": { + "file_id": 45, + "content": "\"\"\"\n )\n m = {}\n for n in self.state_dict():\n scale = 1\n p = self.state_dict()[n]\n shape = p.shape\n ss = n.split('.')\n # if ss[0] in ['encoder', 'decoder']:\n # if ss[2] == 'bias':\n # scale = 0\n # # elif n == 'encoder.CIN.weight':\n # # nn.init.dirac_(p)\n # else:\n # try:\n # if ss[1][0] == 'C' and (int(ss[1][2]) % 2 == 1):\n # scale = 0\n # except:\n # pass\n # m[n] = p * scale\n m[n] = p\n m[n] = m[n].cpu()\n if os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n m[n] = m[n].half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n m[n] = m[n].bfloat16()\n gc.collect()\n torch.cuda.empty_cache()\n return m", + "type": "code", + "location": "/RWKV-v4neo/src/model_img.py:414-446" + }, + "877": { + "file_id": 45, + "content": "This code snippet is used to create a dictionary of model parameters, where it loads each parameter from the model's state_dict and applies necessary scaling or conversion based on its name. It also handles different float modes such as fp16 and bf16 as specified in the environment variable RWKV_FLOAT_MODE. 
Finally, it collects garbage, empties CUDA cache, and returns the dictionary of parameters.", + "type": "comment" + }, + "878": { + "file_id": 46, + "content": "/RWKV-v4neo/src/model_run.py", + "type": "filepath" + }, + "879": { + "file_id": 46, + "content": "The code initializes and optimizes an RWKV Language Model in PyTorch, creates a model class with feed-forward network, applies deep learning processing techniques, checks if current layer is rescaling, adjusts input, and performs layer normalization/feed-forward operations before returning modified input and state.", + "type": "summary" + }, + "880": { + "file_id": 46, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport types\nimport torch\nimport math, os, gc\nfrom torch.nn import functional as F\nimport torch.nn as nn\nfrom typing import List, Dict\nMyModule = nn.Module\ndef __nop(ob):\n return ob\nMyFunction = __nop\n# # try torchdynamo\n# import torchdynamo\n# MyFunction = torchdynamo.optimize(os.environ[\"RWKV_RUN_BACKEND\"]) # !!!BUGGY!!! wrong output\n# try torch jit --> faster for fp32, slower for fp16 (why?)\nif os.environ[\"RWKV_JIT_ON\"] == \"1\":\n MyModule = torch.jit.ScriptModule\n MyFunction = torch.jit.script_method\nRWKV_HEAD_QK_DIM = 0\nprint(f'\\nRWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM} RWKV_JIT_ON {os.environ[\"RWKV_JIT_ON\"]}\\n')\nDEBUG_TIME = False # True False - show trained time-coeffs\nRWKV_RESCALE_LAYER = 6 # set x=x/2 every X layer\n############################################################################################################", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:1-33" + }, + "881": { + "file_id": 46, + "content": "This code is initializing the RWKV Language Model, which is implemented in PyTorch. 
It defines a module and function for optimizing the code using torchdynamo or torch jit depending on the environment variable RWKV_JIT_ON. The code also sets some variables like RWKV_HEAD_QK_DIM and DEBUG_TIME, which control certain aspects of the model's behavior.", + "type": "comment" + }, + "882": { + "file_id": 46, + "content": "class RWKV_RNN(MyModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n self.FLOAT_MODE = args.FLOAT_MODE\n self.RUN_DEVICE = args.RUN_DEVICE\n with torch.no_grad():\n w = torch.load(args.MODEL_NAME + '.pth', map_location='cpu')\n # refine weights and send to correct device\n keys = list(w.keys())\n if 'pos_emb_x' in keys:\n w['pos_emb'] = (w['pos_emb_x'] + w['pos_emb_y']).reshape(args.ctx_len+1, -1)[:-1,:]\n keys = list(w.keys())\n print_need_newline = False\n for x in keys:\n block_id = 0\n if 'blocks.' in x:\n block_id = int(x.split('.')[1])\n if 'att.output.weight' in x:\n w[x] = w[x] / (2 ** int(block_id // RWKV_RESCALE_LAYER))\n if 'ffn.value.weight' in x:\n w[x] = w[x] / (2 ** int(block_id // RWKV_RESCALE_LAYER))\n if '.time_' in x:\n w[x] = w[x].squeeze()", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:35-61" + }, + "883": { + "file_id": 46, + "content": "This code defines a class for the RWKV_RNN model, initializes its attributes based on provided arguments, loads and refines weights from a pre-trained model, adjusts certain layers' weights according to block ID, and handles loading and reshaping position embedding.", + "type": "comment" + }, + "884": { + "file_id": 46, + "content": " if DEBUG_TIME:\n print(x, w[x].numpy())\n if '.time_decay' in x:\n w[x] = w[x].float()\n w[x] = -torch.exp(w[x])\n elif '.time_first' in x:\n w[x] = w[x].float()\n else:\n if self.FLOAT_MODE == \"fp32\":\n w[x] = w[x].float()\n elif self.FLOAT_MODE == \"bf16\":\n w[x] = w[x].bfloat16()\n elif self.FLOAT_MODE == \"fp16\":\n w[x] = w[x].half()\n w[x].requires_grad = False\n if args.RUN_DEVICE == 'cuda' and x 
!= 'emb.weight':\n w[x] = w[x].cuda()\n if ('blocks.' not in x) or ('blocks.0.' in x):\n if print_need_newline:\n print('\\n', end = '')\n print_need_newline = False\n print(x.ljust(40), str(w[x].dtype).replace('torch.', '').ljust(10), w[x].device)", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:62-85" + }, + "885": { + "file_id": 46, + "content": "This code snippet is responsible for managing the data type and device of model parameters based on specified settings. It prints information about each parameter, including its name, data type, and device it's stored in. The DEBUG_TIME flag controls whether to print activation-time weights, while FLOAT_MODE determines the desired floating-point precision (fp32, fp16, or bf16). Parameters from specific groups are not modified unless they are in a specified group ('blocks.'). If RUN_DEVICE is 'cuda', parameters are moved to GPU if they're not the embedding layer weight.", + "type": "comment" + }, + "886": { + "file_id": 46, + "content": " else:\n print_need_newline = True\n print('.', end = '', flush = True)\n # store weights in self.w\n keys = list(w.keys())\n self.w = types.SimpleNamespace()\n for x in keys:\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):\n if xx[i].isdigit():\n ii = int(xx[i])\n if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):\n if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.eval()\n gc.collect()\n torch.cuda.empty_cache()", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:86-114" + }, + "887": { + "file_id": 46, + "content": "This code is organizing and storing weights from a dictionary 'w' into an object 'self.w'. 
It uses string manipulation to split keys into components, then iteratively creates nested namespaces or dictionaries within self.w according to the key structure. Finally, it sets attributes in each namespace/dictionary using setattr(). The code also ensures garbage collection and empties CUDA cache for optimization.", + "type": "comment" + }, + "888": { + "file_id": 46, + "content": " def LN(self, x, w):\n return F.layer_norm(x, (self.args.n_embd,), weight=w.weight, bias=w.bias)\n # state[] 0=ffn_xx 1=att_xx 2=att_aa 3=att_bb 4=att_pp\n @MyFunction\n def FF(self, x, state, i:int, time_mix_k, time_mix_r, kw, vw, rw):\n if self.FLOAT_MODE == \"bf16\":\n xk = x * time_mix_k + state[5*i+0].type(torch.bfloat16) * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0].type(torch.bfloat16) * (1 - time_mix_r)\n state[5*i+0] = x.float()\n elif self.FLOAT_MODE == \"fp16\":\n xk = x * time_mix_k + state[5*i+0].half() * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0].half() * (1 - time_mix_r)\n state[5*i+0] = x.float() \n else:\n xk = x * time_mix_k + state[5*i+0] * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0] * (1 - time_mix_r)\n state[5*i+0] = x\n r = torch.sigmoid(rw @ xr)\n k = torch.square(torch.relu(kw @ xk))\n kv = vw @ k\n return r * kv", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:116-140" + }, + "889": { + "file_id": 46, + "content": "This function, \"FF\", applies a feed-forward network (FFN) to the input tensor 'x' using state information from a previous iteration. It also accounts for different floating point types ('bf16', 'fp16') and performs element-wise operations with learnable weights. 
The resulting output is a product of the input, kernel, and weight matrices, with elements multiplied by sigmoid and squared ReLU activation functions respectively.", + "type": "comment" + }, + "890": { + "file_id": 46, + "content": " @MyFunction\n def SA(self, x, state, i:int, time_mix_k, time_mix_v, time_mix_r, time_first, time_decay, kw, vw, rw, ow):\n if self.FLOAT_MODE == \"bf16\":\n xk = x * time_mix_k + state[5*i+1].type(torch.bfloat16) * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1].type(torch.bfloat16) * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1].type(torch.bfloat16) * (1 - time_mix_r)\n state[5*i+1] = x.float()\n elif self.FLOAT_MODE == \"fp16\":\n xk = x * time_mix_k + state[5*i+1].half() * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1].half() * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1].half() * (1 - time_mix_r)\n state[5*i+1] = x.float() \n else:\n xk = x * time_mix_k + state[5*i+1] * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1] * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1] * (1 - time_mix_r)\n state[5*i+1] = x\n r = torch.sigmoid(rw @ xr)", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:142-160" + }, + "891": { + "file_id": 46, + "content": "This function calculates a weighted average of three inputs (x, xv, and xr) based on mixing factors and applies them to state[5*i+1]. Depending on FLOAT_MODE, it performs the calculation with different precision (bf16, fp16, or float32). 
The result is passed through a sigmoid function to obtain the final output r.", + "type": "comment" + }, + "892": { + "file_id": 46, + "content": " k = kw @ xk\n v = vw @ xv\n if '16' in self.FLOAT_MODE:\n kk = k.float()\n vv = v.float()\n else:\n kk = k\n vv = v\n aa = state[5*i+2]\n bb = state[5*i+3]\n pp = state[5*i+4]\n ww = time_first + kk\n p = torch.maximum(pp, ww)\n e1 = torch.exp(pp - p)\n e2 = torch.exp(ww - p)\n a = e1 * aa + e2 * vv\n b = e1 * bb + e2\n ww = pp + time_decay\n p = torch.maximum(ww, kk)\n e1 = torch.exp(ww - p)\n e2 = torch.exp(kk - p)\n state[5*i+2] = e1 * aa + e2 * vv\n state[5*i+3] = e1 * bb + e2\n state[5*i+4] = p\n if self.FLOAT_MODE == \"bf16\":\n wkv = (a / b).type(torch.bfloat16)\n elif self.FLOAT_MODE == \"fp16\":\n wkv = (a / b).half()\n else:\n wkv = a / b\n return ow @ (r * wkv)\n def forward(self, ctx, state, preprocess_only = False):\n with torch.no_grad():\n w = self.w\n args = self.args\n x = w.emb.weight[ctx[-1]]", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:161-200" + }, + "893": { + "file_id": 46, + "content": "This code is performing a matrix multiplication operation and then applying exponential functions and maximum operations on the result. 
It also checks the FLOAT_MODE to handle different data types and returns the output of the operation.", + "type": "comment" + }, + "894": { + "file_id": 46, + "content": " if self.RUN_DEVICE == 'cuda':\n x = x.cuda()\n try:\n pos_emb = w.pos_emb[len(ctx)-1]\n x = x + pos_emb\n except:\n pass \n if state == None:\n state = torch.zeros(args.n_layer * 5, args.n_embd, device=self.RUN_DEVICE)\n for i in range(args.n_layer):\n state[5*i+4] -= 1e30\n for i in range(args.n_layer):\n if i == 0:\n x = self.LN(x, w.blocks[i].ln0)\n ww = w.blocks[i].att\n x = x + self.SA(self.LN(x, w.blocks[i].ln1), state, i, \n ww.time_mix_k, ww.time_mix_v, ww.time_mix_r, ww.time_first, ww.time_decay, \n ww.key.weight, ww.value.weight, ww.receptance.weight, ww.output.weight)\n ww = w.blocks[i].ffn\n x = x + self.FF(self.LN(x, w.blocks[i].ln2), state, i, \n ww.time_mix_k, ww.time_mix_r, \n ww.key.weight, ww.value.weight, ww.receptance.weight)", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:201-226" + }, + "895": { + "file_id": 46, + "content": "This code segment is responsible for handling the input data and processing it through a sequence of layers in a deep learning model. The RUN_DEVICE variable determines whether the computation should be done on CPU or GPU. Positional embedding is added to the input, and initial state values are set if necessary. Finally, the input goes through multiple layers, including attention and feed-forward networks, with appropriate time mixing and normalization.", + "type": "comment" + }, + "896": { + "file_id": 46, + "content": " if (i+1) % RWKV_RESCALE_LAYER == 0:\n x = x / 2\n if preprocess_only:\n return state\n x = self.LN(x, w.ln_out)\n x = w.head.weight @ x\n return x.float(), state", + "type": "code", + "location": "/RWKV-v4neo/src/model_run.py:228-237" + }, + "897": { + "file_id": 46, + "content": "This code segment checks if the current layer is a rescaling layer and adjusts the input accordingly. 
If preprocessing only is enabled, it returns the state; otherwise, it applies layer normalization and feed-forward operations before returning the modified input and state.", + "type": "comment" + }, + "898": { + "file_id": 47, + "content": "/RWKV-v4neo/src/trainer.py", + "type": "filepath" + }, + "899": { + "file_id": 47, + "content": "The code defines a `my_save()` function for saving PyTorch Lightning model data using AWS S3, handles learning rate scheduling and logs progress. It also loads, reshapes and converts a model dictionary, performs interpolation, saves epoch information, and generates initial weights for model training.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/9.json b/docs/data/9.json new file mode 100644 index 00000000..a1d6fe21 --- /dev/null +++ b/docs/data/9.json @@ -0,0 +1,547 @@ +{ + "900": { + "file_id": 47, + "content": "import os, math, time, datetime, subprocess\nimport torch\nfrom torch.utils.data import DataLoader\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\ndef my_save(args, trainer, dd, ff):\n if '14b-run1' in ff:\n fn = ff.split('/')[-1]\n fff = '/dev/shm/' + fn\n torch.save(dd, fff)\n subprocess.Popen(f\" aws s3 mv {fff} s3://rwkv-14b-4k/{fn} --quiet\", shell=True)\n elif ('world/14b' in ff) or ('world/7b' in ff):\n aa = ff.split('/')[1]\n fn = ff.split('/')[-1]\n fff = f'/dev/shm/{aa}-{fn}'\n torch.save(dd, fff)\n subprocess.Popen(f\" aws s3 mv {fff} s3://rwkv-world/{aa}-{fn} --quiet\", shell=True)\n else:\n if 'deepspeed_stage_3' in args.strategy:\n trainer.save_checkpoint(ff, weights_only=True)\n else:\n torch.save(dd, ff)\nclass train_callback(pl.Callback):\n def __init__(self, args):\n super().__init__()\n self.args = args\n def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:1-30" + }, + "901": { + "file_id": 47, + "content": "This code defines a function 
`my_save()` that saves the model's data depending on the file path (`ff`) and calls another function `train_callback()`, which is a PyTorch Lightning callback class. The code also uses subprocess to move saved files to AWS S3 storage and incorporates the usage of Deepspeed for distributed training.", + "type": "comment" + }, + "902": { + "file_id": 47, + "content": " args = self.args\n # if args.cuda_cleanup > 0:\n # torch.cuda.empty_cache()\n real_step = trainer.global_step + args.epoch_begin * args.epoch_steps\n # LR schedule\n w_step = args.warmup_steps\n if args.lr_final == args.lr_init or args.epoch_count == 0:\n lr = args.lr_init\n else:\n decay_step = real_step - args.my_pile_edecay * args.epoch_steps\n decay_total = (args.epoch_count - args.my_pile_edecay) * args.epoch_steps\n progress = (decay_step - w_step + 1) / (decay_total - w_step)\n progress = min(1, max(0, progress))\n if args.lr_final == 0 or args.lr_init == 0: # linear decay\n lr = args.lr_init + (args.lr_final - args.lr_init) * progress\n else: # exp decay\n lr = args.lr_init * math.exp(math.log(args.lr_final / args.lr_init) * pow(progress, 1))\n # if trainer.is_global_zero:\n # print(trainer.global_step, decay_step, decay_total, w_step, progress, lr)", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:31-51" + }, + "903": { + "file_id": 47, + "content": "Code snippet handles learning rate (LR) scheduling and potentially clears GPU cache based on provided arguments. It calculates the real training step, determines if LR should be adjusted based on epoch count and warmup steps, and applies linear or exponential decay to adjust the learning rate. 
It also prints some info if it's the global zero trainer.", + "type": "comment" + }, + "904": { + "file_id": 47, + "content": " if args.my_exit_tokens != 0: # cosine decay\n real_tokens = real_step * args.ctx_len * args.real_bsz\n warmup_tokens = w_step * args.ctx_len * args.real_bsz\n progress = (real_tokens - warmup_tokens) / (abs(args.my_exit_tokens) - warmup_tokens)\n progress = max(0, min(1, progress))\n lr_final_factor = args.lr_final / args.lr_init \n lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor / 2) * math.cos(math.pi * progress)\n if args.my_exit_tokens > 0:\n lr = args.lr_init * lr_mult\n else:\n lr = (lr + args.lr_init * lr_mult) / 2\n if progress >= 1:\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy):\n my_save(\n args, trainer,\n pl_module.state_dict(),\n f\"{args.proj_dir}/rwkv-final.pth\",\n )\n exit(0)\n if trainer.global_step < w_step:", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:53-72" + }, + "905": { + "file_id": 47, + "content": "This code is setting the learning rate (lr) based on the exit tokens. If my_exit_tokens > 0, lr = lr_init * lr_mult. If my_exit_tokens < 0, lr = (lr + lr_init * lr_mult) / 2. The code also saves and exits if progress >= 1 or if global step is less than w_step. 
This appears to be part of a training process where the learning rate dynamically adjusts during training based on exit tokens.", + "type": "comment" + }, + "906": { + "file_id": 47, + "content": " lr = lr * (0.2 + 0.8 * trainer.global_step / w_step)\n if args.weight_decay_final > 0:\n wd_now = args.weight_decay * math.exp(math.log(args.weight_decay_final / args.weight_decay) * progress)\n else:\n wd_now = args.weight_decay\n for param_group in trainer.optimizers[0].param_groups:\n if param_group[\"weight_decay\"] > 0:\n param_group[\"weight_decay\"] = wd_now\n if args.layerwise_lr > 0:\n param_group[\"lr\"] = lr * param_group[\"my_lr_scale\"]\n # print(param_group[\"lr\"], param_group[\"my_lr_scale\"])\n else:\n param_group[\"lr\"] = lr\n trainer.my_lr = lr\n trainer.my_wd = wd_now\n # rank_zero_info(f\"{real_step} {lr}\")\n if trainer.global_step == 0:\n if trainer.is_global_zero: # logging\n trainer.my_loss_sum = 0\n trainer.my_loss_count = 0\n trainer.my_log = open(args.proj_dir + \"/train_log.txt\", \"a\")", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:73-97" + }, + "907": { + "file_id": 47, + "content": "The code sets the learning rate (lr) based on a decay formula and updates the weight decay (wd_now). It iterates through each param group, setting the lr and wd accordingly. If layerwise learning rate is enabled, it adjusts the lr further based on my_lr_scale. 
The trainer's current lr and wd are stored for future reference, and logging is initialized if this is the first global step.", + "type": "comment" + }, + "908": { + "file_id": 47, + "content": " trainer.my_log.write(f\"NEW RUN {args.my_timestamp}\\n{vars(self.args)}\\n\")\n try:\n print(f\"\\n{trainer.strategy.config}\\n\")\n trainer.my_log.write(f\"{trainer.strategy.config}\\n\")\n except:\n pass\n trainer.my_log.flush()\n if len(args.wandb) > 0:\n print(\"Login to wandb...\")\n import wandb\n wandb.init(\n project=args.wandb,\n name=args.run_name + \" \" + args.my_timestamp,\n config=args,\n save_code=False,\n )\n trainer.my_wandb = wandb\n def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):\n args = self.args\n token_per_step = args.ctx_len * args.real_bsz\n real_step = trainer.global_step + args.epoch_begin * args.epoch_steps\n if trainer.is_global_zero: # logging", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:98-120" + }, + "909": { + "file_id": 47, + "content": "Writes log information to file, tries printing strategy configuration but handles exceptions, flushes the log, initializes W&B if enabled. 
In on_train_batch_end, calculates token per step, determines real step, logs only on global_step 0 (zero-based indexing).", + "type": "comment" + }, + "910": { + "file_id": 47, + "content": " t_now = time.time_ns()\n kt_s = 0\n try:\n t_cost = (t_now - trainer.my_time_ns) / 1e9\n kt_s = token_per_step / t_cost / 1000\n self.log(\"REAL it/s\", 1.0 / t_cost, prog_bar=True, on_step=True)\n self.log(\"Kt/s\", kt_s, prog_bar=True, on_step=True)\n except:\n pass\n trainer.my_time_ns = t_now\n if pl.__version__[0]=='2':\n trainer.my_loss = outputs[\"loss\"]\n else:\n trainer.my_loss = trainer.my_loss_all.float().mean().item()\n trainer.my_loss_sum += trainer.my_loss\n trainer.my_loss_count += 1\n trainer.my_epoch_loss = trainer.my_loss_sum / trainer.my_loss_count\n self.log(\"lr\", trainer.my_lr, prog_bar=True, on_step=True)\n self.log(\"loss\", trainer.my_epoch_loss, prog_bar=True, on_step=True)\n # self.log(\"s\", real_step, prog_bar=True, on_step=True)\n if len(args.wandb) > 0:", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:121-142" + }, + "911": { + "file_id": 47, + "content": "Code block calculates the time taken for training step, real iterations per second (it/s), and kilo-tokens per second (Kt/s). It also logs the learning rate (lr) and current loss for progress tracking. If using PyTorch 2 version, it retrieves loss value differently. 
It also logs the epoch loss and optionally sends information to W&B if specified in arguments.", + "type": "comment" + }, + "912": { + "file_id": 47, + "content": " lll = {\"loss\": trainer.my_loss, \"lr\": trainer.my_lr, \"wd\": trainer.my_wd, \"Gtokens\": real_step * token_per_step / 1e9}\n if kt_s > 0:\n lll[\"kt/s\"] = kt_s\n trainer.my_wandb.log(lll, step=int(real_step))\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy): # save pth\n if args.magic_prime > 0:\n expand_factor = 2 if args.my_qa_mask > 0 else 1\n if int(real_step) == int(args.magic_prime * expand_factor // args.real_bsz) - 1 + int(args.my_random_steps):\n to_save_dict = pl_module.state_dict()\n my_save(\n args, trainer,\n to_save_dict,\n f\"{args.proj_dir}/rwkv-final.pth\",\n )\n def on_train_epoch_start(self, trainer, pl_module):\n args = self.args\n if pl.__version__[0]=='2':\n dataset = trainer.train_dataloader.dataset\n else:\n dataset = trainer.train_dataloader.dataset.datasets", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:143-164" + }, + "913": { + "file_id": 47, + "content": "This code is responsible for logging training metrics and saving the model checkpoint. It checks if it's the global zero or using DeepSpeed, and saves the model state dictionary as \"rwkv-final.pth\" when the current step meets certain conditions related to batch size and random steps. 
The logging includes loss, learning rate, weight decay, and token count per second (if applicable).", + "type": "comment" + }, + "914": { + "file_id": 47, + "content": " assert \"MyDataset\" in str(dataset)\n dataset.global_rank = trainer.global_rank\n dataset.real_epoch = int(args.epoch_begin + trainer.current_epoch)\n dataset.world_size = trainer.world_size\n # print(f'########## world_size {dataset.world_size} global_rank {dataset.global_rank} real_epoch {dataset.real_epoch} ##########')\n def on_train_epoch_end(self, trainer, pl_module):\n args = self.args\n to_save_dict = {}\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy): # save pth\n if (args.epoch_save > 0 and trainer.current_epoch % args.epoch_save == 0) or (trainer.current_epoch == args.epoch_count - 1):\n if args.data_type == 'wds_img':\n raw_dict = pl_module.state_dict()\n for k in raw_dict:\n if k.startswith('encoder.') or k.startswith('decoder.'):\n to_save_dict[k] = raw_dict[k]\n else:\n to_save_dict = pl_module.state_dict()", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:165-182" + }, + "915": { + "file_id": 47, + "content": "This code snippet is part of the trainer class and defines a method 'on_train_epoch_end'. It asserts that the dataset has the name 'MyDataset' and assigns values to the dataset object properties. 
If global_zero or 'deepspeed_stage_3' strategy is used, it saves the model state at specified epoch intervals.", + "type": "comment" + }, + "916": { + "file_id": 47, + "content": " try:\n my_save(\n args, trainer,\n to_save_dict,\n f\"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}.pth\",\n )\n except Exception as e:\n print('Error\\n\\n', e, '\\n\\n')\n if trainer.is_global_zero: # logging\n trainer.my_log.write(f\"{args.epoch_begin + trainer.current_epoch} {trainer.my_epoch_loss:.6f} {math.exp(trainer.my_epoch_loss):.4f} {trainer.my_lr:.8f} {datetime.datetime.now()} {trainer.current_epoch}\\n\")\n trainer.my_log.flush()\n trainer.my_loss_sum = 0\n trainer.my_loss_count = 0\n if (args.epoch_begin + trainer.current_epoch) >= args.my_exit:\n exit(0)\n@rank_zero_only\ndef generate_init_weight(model, init_weight_name):\n mm = model.generate_init_weight()\n if model.args.my_pile_stage == 1:\n if len(model.args.load_model) > 0:\n print(f\"Combine weights from {model.args.load_model}...\")", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:183-208" + }, + "917": { + "file_id": 47, + "content": "Trying to save the model, log epoch information, and optionally exit if the current epoch exceeds a specified limit. 
Additionally, there's a function for generating initial weights, combining with pre-existing ones if available.", + "type": "comment" + }, + "918": { + "file_id": 47, + "content": " load_dict = torch.load(model.args.load_model, map_location=\"cpu\")\n for k in load_dict:\n try:\n assert k in mm\n except:\n print('missing', k)\n exit(0)\n src = load_dict[k]\n try:\n mm[k] = src.reshape(mm[k].shape)\n except:\n tmp = mm[k].squeeze().clone()\n print(k, src.shape, '-->', mm[k].shape)\n ss = src.shape[0]\n dd = tmp.shape[0]\n for i in range(dd):\n pos = i / dd * ss\n if pos >= ss - 1:\n tmp[i] = src[ss-1]\n else:\n p0 = int(math.floor(pos))\n ii = pos - p0\n tmp[i] = src[p0] * (1-ii) + src[p0+1] * (ii)\n mm[k] = tmp.reshape(mm[k].shape)\n sss = src.squeeze().float().cpu().numpy()", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:209-233" + }, + "919": { + "file_id": 47, + "content": "The code loads a dictionary from the specified model file, checks if all keys are present in 'mm' dictionary, and reshapes the loaded source to match the shape of existing data in 'mm'. If source shape doesn't match, it performs a linear interpolation to fit the new data. Finally, converts the source to float and cpu numpy array.", + "type": "comment" + }, + "920": { + "file_id": 47, + "content": " print(sss[:10], '...', sss[-10:])\n mmm = mm[k].squeeze().float().cpu().numpy()\n print(mmm[:10], '...', mmm[-10:])\n print(f\"Save to {init_weight_name}...\")\n torch.save(mm, init_weight_name)\n if model.args.my_pile_stage == 1:\n print(\"Done. Now go for stage 2.\")\n exit(0)", + "type": "code", + "location": "/RWKV-v4neo/src/trainer.py:234-243" + }, + "921": { + "file_id": 47, + "content": "This code segment prints parts of 'sss' and 'mmm', saves 'mm' to a file, and if in stage 1, suggests moving on to stage 2. 
It seems to be part of a model training process where it displays data, saves an intermediate model checkpoint, and moves to the next phase.", + "type": "comment" + }, + "922": { + "file_id": 48, + "content": "/RWKV-v4neo/src/utils.py", + "type": "filepath" + }, + "923": { + "file_id": 48, + "content": "The code imports libraries, defines a tokenizing class, utilizes Fermat's Little Theorem and Miller-Rabin primality test for prime number calculation, performs verification process to determine if a given number is prime, and returns True or False accordingly.", + "type": "summary" + }, + "924": { + "file_id": 48, + "content": "import json, time, random, os\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\ntime_slot = {}\ntime_ref = time.time_ns()\ndef record_time(name):\n if name not in time_slot:\n time_slot[name] = 1e20\n tt = (time.time_ns() - time_ref) / 1e9\n if tt < time_slot[name]:\n time_slot[name] = tt\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n if 'list' in str(type(WORD_NAME)):\n self.charMode = False\n if WORD_NAME[0] == WORD_NAME[1]:\n from transformers import PreTrainedTokenizerFast\n self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])\n else:\n from transformers import GPT2TokenizerFast\n self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])\n self.vocab_size = len(self.tokenizer)\n else:\n self.charMode = True\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:1-30" + }, + "925": { + "file_id": 48, + "content": "This code imports necessary libraries and defines a class for tokenizing text using either pre-trained tokenizers or custom word tables. 
It also records time usage for optimization purposes.", + "type": "comment" + }, + "926": { + "file_id": 48, + "content": " self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(out, dim=-1)\n if self.charMode:\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n else:", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:32-60" + }, + "927": { + "file_id": 48, + "content": "This code snippet is part of a model for text generation. 
The 'utils' class contains methods to refine the context by removing extra whitespace and unwanted characters, set vocabulary size based on the word table, map words to integers and vice versa, and sample logits to generate text using softmax function with option to specify temperature and top probabilities for specific characters.", + "type": "comment" + }, + "928": { + "file_id": 48, + "content": " top_p = top_p_usual\n if os.environ[\"RWKV_RUN_DEVICE\"] == \"cpu\":\n probs = probs.numpy()\n sorted_probs = np.sort(probs)[::-1]\n cumulative_probs = np.cumsum(sorted_probs)\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n probs = probs / np.sum(probs)\n out = np.random.choice(a=len(probs), p=probs)\n return out\n else:\n sorted_probs = torch.sort(probs, descending=True)[0]\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).cpu().numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n out = torch.multinomial(probs, num_samples=1)[0]\n return out\ndef MaybeIsPrime(number):\n if FermatPrimalityTest(number) and MillerRabinPrimalityTest(number):", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:61-85" + }, + "929": { + "file_id": 48, + "content": "This function calculates prime numbers using Fermat's Little Theorem and Miller-Rabin primality test. The prime number check is done only if the environment variable RWKV_RUN_DEVICE is set to \"cpu\". 
If not, it uses torch.multinomial() for selection.", + "type": "comment" + }, + "930": { + "file_id": 48, + "content": " return True\n else:\n return False\ndef FermatPrimalityTest(number):\n if number > 1:\n for time in range(3):\n randomNumber = random.randint(2, number) - 1\n if pow(randomNumber, number - 1, number) != 1:\n return False\n return True\n else:\n return False\ndef MillerRabinPrimalityTest(number):\n if number == 2:\n return True\n elif number == 1 or number % 2 == 0:\n return False\n oddPartOfNumber = number - 1\n timesTwoDividNumber = 0\n while oddPartOfNumber % 2 == 0:\n oddPartOfNumber = oddPartOfNumber // 2\n timesTwoDividNumber = timesTwoDividNumber + 1\n for time in range(3):\n while True:\n randomNumber = random.randint(2, number) - 1\n if randomNumber != 0 and randomNumber != 1:\n break\n randomNumberWithPower = pow(randomNumber, oddPartOfNumber, number)\n if (randomNumberWithPower != 1) and (randomNumberWithPower != number - 1):\n iterationNumber = 1", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:86-122" + }, + "931": { + "file_id": 48, + "content": "The code contains three functions: 1) FermatPrimalityTest checks if a number is prime using the Fermat Little Theorem; it returns True if number is greater than 1, and False otherwise. 2) MillerRabinPrimalityTest checks if a number is prime using the Miller-Rabin algorithm; it returns True for number=2, False for number=1 or even numbers, and starts by simplifying the odd part of the number. 3) The code uses a while loop to calculate the odd part of the number and determines the maximum iterations needed. 
The function then uses another loop to randomly select randomNumbers and test them for primality using the calculated parameters.", + "type": "comment" + }, + "932": { + "file_id": 48, + "content": " while (iterationNumber <= timesTwoDividNumber - 1) and (randomNumberWithPower != number - 1):\n randomNumberWithPower = pow(randomNumberWithPower, 2, number)\n iterationNumber = iterationNumber + 1\n if randomNumberWithPower != (number - 1):\n return False\n return True", + "type": "code", + "location": "/RWKV-v4neo/src/utils.py:124-130" + }, + "933": { + "file_id": 48, + "content": "This code performs a verification process where it checks if the given number can be determined as prime. It does this by iterating through a range of values, squaring a random number and calculating its modulo with the input number until either all iterations are completed or the value no longer matches the expected result. If the latter occurs, it returns False indicating that the number is not prime. Otherwise, it returns True.", + "type": "comment" + }, + "934": { + "file_id": 49, + "content": "/RWKV-v4neo/train.py", + "type": "filepath" + }, + "935": { + "file_id": 49, + "content": "This code trains an RWKV language model with PyTorch Lightning, supports customizable training parameters, fine-tunes on enwik8 data, and saves models every 5 epochs using 'argparse' for command line arguments. 
It includes essential setup tasks like version assertions and learning rate schedule configuration.", + "type": "summary" + }, + "936": { + "file_id": 49, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport logging\nlogging.basicConfig(level=logging.INFO)\nif __name__ == \"__main__\":\n from argparse import ArgumentParser\n from pytorch_lightning import Trainer\n from pytorch_lightning.utilities import rank_zero_info, rank_zero_only\n import pytorch_lightning as pl\n rank_zero_info(\"########## work in progress ##########\")\n ########################################################################################################\n #\n # example: train a simple L12-D768 RWKV on dummy data\n #\n # python train.py --load_model \"\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"\" --data_type \"dummy\" --vocab_size 0 \\\n # --ctx_len 128 --epoch_steps 1000 --epoch_count 20 --epoch_begin 0 --epoch_save 10 \\\n # --micro_bsz 16 --n_layer 12 --n_embd 768 --pre_ffn 0 --head_qk 0 \\", + "type": "code", + "location": "/RWKV-v4neo/train.py:1-23" + }, + "937": { + "file_id": 49, + "content": "This code is for training an RWKV language model using PyTorch Lightning framework. It includes basic configuration, argument parsing, and example usage for training a simple L12-D768 RWKV model on dummy data. 
The user can specify various parameters like load_model, wandb, proj_dir, data_file, data_type, vocab_size, ctx_len, epoch_steps, epoch_count, epoch_begin, epoch_save, micro_bsz, n_layer, n_embd, pre_ffn, and head_qk.", + "type": "comment" + }, + "938": { + "file_id": 49, + "content": " # --lr_init 6e-4 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.99 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 1 --precision bf16 --strategy ddp_find_unused_parameters_false --grad_cp 0\n # example: train a simple L6-D512 RWKV from scratch on enwik8\n #\n # python train.py --load_model \"\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"../data/enwik8\" --data_type \"utf-8\" --vocab_size 0 \\\n # --ctx_len 512 --epoch_steps 5000 --epoch_count 500 --epoch_begin 0 --epoch_save 5 \\\n # --micro_bsz 12 --n_layer 6 --n_embd 512 --pre_ffn 0 --head_qk 0 \\\n # --lr_init 8e-4 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.99 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 1 --precision bf16 --strategy ddp_find_unused_parameters_false --grad_cp 0\n # example: fine-tune RWKV 1.5B using 8xA100 40G = 1.76it/s = 115k token/s, VRAM 37477M\n #\n # python train.py --load_model \"/fsx/BlinkDL/CODE/FP16/out_1b2/all-8040.pth\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"../data/train.npy\" --data_type \"numpy\" --vocab_size 50277 \\", + "type": "code", + "location": "/RWKV-v4neo/train.py:24-39" + }, + "939": { + "file_id": 49, + "content": "Train a simple L6-D512 RWKV from scratch on enwik8, fine-tune RWKV 1.5B using 8xA100 40G = 1.76it/s = 115k token/s, VRAM 37477M", + "type": "comment" + }, + "940": { + "file_id": 49, + "content": " # --ctx_len 1024 --epoch_steps 1000 --epoch_count 1000 --epoch_begin 0 --epoch_save 5 \\\n # --micro_bsz 8 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \\\n # --lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 8 --precision bf16 --strategy 
deepspeed_stage_2 --grad_cp 0\n # example: fine-tune RWKV 1.5B using 1 GPU fp16 (VRAM 16G) NOTE: fp16 might overflow\n #\n # python train.py --load_model \"/fsx/BlinkDL/CODE/FP16/out_1b2/all-8040.pth\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"../data/train.npy\" --data_type \"numpy\" --vocab_size 50277 \\\n # --ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 1 \\\n # --micro_bsz 11 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \\\n # --lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 1 --precision fp16 --strategy deepspeed_stage_2_offload --grad_cp 1\n parser = ArgumentParser()", + "type": "code", + "location": "/RWKV-v4neo/train.py:40-54" + }, + "941": { + "file_id": 49, + "content": "This code configures a fine-tuning process for RWKV using 8 GPUs with BF16 precision, and saves models every 5 epochs. The command line arguments specify the model path, W&B project directory, training data file, vocabulary size, and various hyperparameters like batch size, layers, embedding dimensions, learning rate, etc. The parser is used to parse these command line arguments.", + "type": "comment" + }, + "942": { + "file_id": 49, + "content": " parser.add_argument(\"--load_model\", default=\"\", type=str) # full path, with .pth\n parser.add_argument(\"--wandb\", default=\"\", type=str) # wandb project name. 
if \"\" then don't use wandb\n parser.add_argument(\"--proj_dir\", default=\"out\", type=str)\n parser.add_argument(\"--random_seed\", default=\"-1\", type=int)\n parser.add_argument(\"--data_file\", default=\"\", type=str)\n parser.add_argument(\"--data_type\", default=\"utf-8\", type=str)\n parser.add_argument(\"--vocab_size\", default=0, type=int) # vocab_size = 0 means auto (for char-level LM and .txt data)\n parser.add_argument(\"--ctx_len\", default=1024, type=int)\n parser.add_argument(\"--epoch_steps\", default=1000, type=int) # a mini \"epoch\" has [epoch_steps] steps\n parser.add_argument(\"--epoch_count\", default=500, type=int) # train for this many \"epochs\". will continue afterwards with lr = lr_final\n parser.add_argument(\"--epoch_begin\", default=0, type=int) # if you load a model trained for x \"epochs\", set epoch_begin = x\n ", + "type": "code", + "location": "/RWKV-v4neo/train.py:56-69" + }, + "943": { + "file_id": 49, + "content": "This code is using the 'argparse' module to add command line arguments for specifying a model load path, Wandb project name, project directory, random seed, data file, data type, vocabulary size, context length, epoch steps, training epochs count, and epoch begin point. 
These arguments control how the program behaves during execution.", + "type": "comment" + }, + "944": { + "file_id": 49, + "content": " parser.add_argument(\"--epoch_save\", default=5, type=int) # save the model every [epoch_save] \"epochs\"\n parser.add_argument(\"--micro_bsz\", default=12, type=int) # micro batch size (batch size per GPU)\n parser.add_argument(\"--n_layer\", default=6, type=int)\n parser.add_argument(\"--n_embd\", default=512, type=int)\n parser.add_argument(\"--dim_att\", default=0, type=int)\n parser.add_argument(\"--dim_ffn\", default=0, type=int)\n parser.add_argument(\"--pre_ffn\", default=0, type=int) # replace first att layer by ffn (sometimes better)\n parser.add_argument(\"--head_qk\", default=0, type=int) # my headQK trick\n parser.add_argument(\"--tiny_att_dim\", default=0, type=int) # tiny attention dim\n parser.add_argument(\"--tiny_att_layer\", default=-999, type=int) # tiny attention @ which layer\n parser.add_argument(\"--lr_init\", default=6e-4, type=float) # 6e-4 for L12-D768, 4e-4 for L24-D1024, 3e-4 for L24-D2048\n parser.add_argument(\"--lr_final\", default=1e-5, type=float)\n parser.add_argument(\"--warmup_steps\", default=-1, type=int) # try 50 if you load a model", + "type": "code", + "location": "/RWKV-v4neo/train.py:69-83" + }, + "945": { + "file_id": 49, + "content": "This code snippet from \"RWKV-LM/RWKV-v4neo/train.py\" provides default values and types for various command line arguments used in model training. These parameters control aspects like epoch save frequency, batch size per GPU, model layers, embedding dimension, activation function settings, and learning rate configurations. 
The code also includes optional features like the \"headQK trick\", tiny attention dimensions, and layer placement.", + "type": "comment" + }, + "946": { + "file_id": 49, + "content": " parser.add_argument(\"--beta1\", default=0.9, type=float)\n parser.add_argument(\"--beta2\", default=0.99, type=float) # use 0.999 when your model is close to convergence\n parser.add_argument(\"--adam_eps\", default=1e-8, type=float)\n parser.add_argument(\"--grad_cp\", default=0, type=int) # gradient checkpt: saves VRAM, but slower\n parser.add_argument(\"--dropout\", default=0, type=float) # try 0.01 / 0.02 / 0.05 / 0.1\n parser.add_argument(\"--weight_decay\", default=0, type=float) # try 0.1 / 0.01 / 0.001\n parser.add_argument(\"--weight_decay_final\", default=-1, type=float)\n parser.add_argument(\"--my_pile_version\", default=1, type=int) # my special pile version\n parser.add_argument(\"--my_pile_stage\", default=0, type=int) # my special pile mode\n parser.add_argument(\"--my_pile_shift\", default=-1, type=int) # my special pile mode - text shift\n parser.add_argument(\"--my_pile_edecay\", default=0, type=int)\n parser.add_argument(\"--layerwise_lr\", default=1, type=int) # layerwise lr for faster convergence (but slower it/s)", + "type": "code", + "location": "/RWKV-v4neo/train.py:84-96" + }, + "947": { + "file_id": 49, + "content": "The code is using the 'argparse' module to add arguments for hyperparameters, such as beta1 and beta2 values for Adam optimizer, Adam epsilon, gradient checkpoint frequency, dropout rate, weight decay, special pile version and stage, text shift for special pile, and layerwise learning rate.", + "type": "comment" + }, + "948": { + "file_id": 49, + "content": " parser.add_argument(\"--ds_bucket_mb\", default=200, type=int) # deepspeed bucket size in MB. 
200 seems enough\n # parser.add_argument(\"--cuda_cleanup\", default=0, type=int) # extra cuda cleanup (sometimes helpful)\n parser.add_argument(\"--my_img_version\", default=0, type=str)\n parser.add_argument(\"--my_img_size\", default=0, type=int)\n parser.add_argument(\"--my_img_bit\", default=0, type=int)\n parser.add_argument(\"--my_img_clip\", default='x', type=str)\n parser.add_argument(\"--my_img_clip_scale\", default=1, type=float)\n parser.add_argument(\"--my_img_l1_scale\", default=0, type=float)\n parser.add_argument(\"--my_img_encoder\", default='x', type=str)\n # parser.add_argument(\"--my_img_noise_scale\", default=0, type=float)\n parser.add_argument(\"--my_sample_len\", default=0, type=int)\n parser.add_argument(\"--my_ffn_shift\", default=1, type=int)\n parser.add_argument(\"--my_att_shift\", default=1, type=int)\n parser.add_argument(\"--head_size_a\", default=64, type=int) # can try larger values for larger models", + "type": "code", + "location": "/RWKV-v4neo/train.py:97-111" + }, + "949": { + "file_id": 49, + "content": "This code snippet contains various command line arguments used in a training process. It defines the default values and types for these arguments, such as --ds_bucket_mb, --cuda_cleanup, --my_img_version, etc. These options control different aspects of the model's behavior or performance during training. For instance, --my_sample_len specifies the length of samples to use while training, and --head_size_a sets the size of attention heads for the model. 
The code provides default values that should be sufficient for most cases but can be modified if needed.", + "type": "comment" + }, + "950": { + "file_id": 49, + "content": " parser.add_argument(\"--head_size_divisor\", default=8, type=int)\n parser.add_argument(\"--my_pos_emb\", default=0, type=int)\n parser.add_argument(\"--load_partial\", default=0, type=int)\n parser.add_argument(\"--magic_prime\", default=0, type=int)\n parser.add_argument(\"--my_qa_mask\", default=0, type=int)\n parser.add_argument(\"--my_random_steps\", default=0, type=int)\n parser.add_argument(\"--my_testing\", default='', type=str)\n parser.add_argument(\"--my_exit\", default=99999999, type=int)\n parser.add_argument(\"--my_exit_tokens\", default=0, type=int)\n if pl.__version__[0]=='2':\n parser.add_argument(\"--accelerator\", default=\"gpu\", type=str)\n parser.add_argument(\"--strategy\", default=\"auto\", type=str)\n parser.add_argument(\"--devices\", default=1, type=int)\n parser.add_argument(\"--num_nodes\", default=1, type=int)\n parser.add_argument(\"--precision\", default=\"fp16\", type=str)\n parser.add_argument(\"--accumulate_grad_batches\", default=1, type=int)\n else:", + "type": "code", + "location": "/RWKV-v4neo/train.py:112-129" + }, + "951": { + "file_id": 49, + "content": "This code defines command-line arguments for a program using the 'argparse' module. The options include settings for model training (like head size, loading partial data, magic prime), as well as accelerator configuration in case of running on PyTorch Lightning (PL) with Python 2. 
No comments are needed as this is just defining command-line arguments.", + "type": "comment" + }, + "952": { + "file_id": 49, + "content": " parser = Trainer.add_argparse_args(parser)\n args = parser.parse_args()\n ########################################################################################################\n import os, warnings, math, datetime, sys, time\n import numpy as np\n import torch\n from torch.utils.data import DataLoader\n if \"deepspeed\" in args.strategy:\n import deepspeed\n from pytorch_lightning import seed_everything\n if args.random_seed >= 0:\n print(f\"########## WARNING: GLOBAL SEED {args.random_seed} THIS WILL AFFECT MULTIGPU SAMPLING ##########\\n\" * 3)\n seed_everything(args.random_seed)\n np.set_printoptions(precision=4, suppress=True, linewidth=200)\n warnings.filterwarnings(\"ignore\", \".*Consider increasing the value of the `num_workers` argument*\")\n warnings.filterwarnings(\"ignore\", \".*The progress bar already tracks a metric with the*\")\n # os.environ[\"WDS_SHOW_SEED\"] = \"1\"\n args.my_timestamp = datetime.datetime.today().strftime(\"%Y-%m-%d-%H-%M-%S\")\n args.enable_checkpointing = False", + "type": "code", + "location": "/RWKV-v4neo/train.py:130-153" + }, + "953": { + "file_id": 49, + "content": "This code snippet is importing necessary libraries and setting up global seed for multi-GPU sampling. 
It also configures print options, ignores certain warnings, and sets the timestamp for experiment name.", + "type": "comment" + }, + "954": { + "file_id": 49, + "content": " args.replace_sampler_ddp = False\n args.logger = False\n args.gradient_clip_val = 1.0\n args.num_sanity_val_steps = 0\n args.check_val_every_n_epoch = int(1e20)\n args.log_every_n_steps = int(1e20)\n args.max_epochs = -1 # continue forever\n args.betas = (args.beta1, args.beta2)\n args.real_bsz = int(args.num_nodes) * int(args.devices) * args.micro_bsz\n os.environ[\"RWKV_T_MAX\"] = str(args.ctx_len)\n os.environ[\"RWKV_MY_TESTING\"] = args.my_testing\n os.environ[\"RWKV_HEAD_SIZE_A\"] = str(args.head_size_a)\n if args.dim_att <= 0:\n args.dim_att = args.n_embd\n if args.dim_ffn <= 0:\n if 'r3' in args.my_testing:\n args.dim_ffn = int((args.n_embd * 3.5) // 32 * 32)\n else:\n args.dim_ffn = args.n_embd * 4\n if args.data_type == \"wds_img\":\n args.run_name = f\"v{args.my_img_version}-{args.my_img_size}-{args.my_img_bit}bit-{args.my_img_clip}x{args.my_img_clip_scale}\"\n args.proj_dir = f\"{args.proj_dir}-{args.run_name}\"\n else:\n args.run_name = f\"{args.vocab_size} ctx{args.ctx_len} L{args.n_layer} D{args.n_embd}\"", + "type": "code", + "location": "/RWKV-v4neo/train.py:154-178" + }, + "955": { + "file_id": 49, + "content": "The code sets various arguments for model training, including disabling DDP sampler and logger, setting gradient clip value, and modifying batch size based on the number of nodes and devices. 
It also adjusts the dimensions of attention and feedforward layers if necessary, and customizes run name based on data type or vocabulary size, context length, layer count, and embedding dimension.", + "type": "comment" + }, + "956": { + "file_id": 49, + "content": " if not os.path.exists(args.proj_dir):\n os.makedirs(args.proj_dir)\n if args.my_pile_stage > 0:\n magic_prime_bak = args.magic_prime\n if args.my_pile_version == 1:\n if args.ctx_len == 1024:\n args.magic_prime = 324331313\n elif args.ctx_len == 2048:\n args.magic_prime = 162165671\n elif args.ctx_len == 4096:\n args.magic_prime = 81082817\n elif args.ctx_len == 8192:\n args.magic_prime = 40541399\n else:\n if args.ctx_len == 1024:\n args.magic_prime = 1670239709\n elif args.ctx_len == 2048:\n args.magic_prime = 835119767\n elif args.ctx_len == 4096:\n args.magic_prime = 417559889\n elif args.ctx_len == 6144:\n args.magic_prime = 278373239\n elif args.ctx_len == 8192:\n args.magic_prime = 208779911\n if args.my_pile_shift < 0:\n args.my_pile_shift = 0\n if magic_prime_bak > 0:", + "type": "code", + "location": "/RWKV-v4neo/train.py:179-208" + }, + "957": { + "file_id": 49, + "content": "This code checks if a directory exists, creates it if not, adjusts magic_prime and my_pile_shift values based on ctx_len, and sets my_pile_shift to 0 if it's negative.", + "type": "comment" + }, + "958": { + "file_id": 49, + "content": " args.magic_prime = magic_prime_bak\n if args.my_qa_mask == 2:\n args.epoch_count = 2 * args.magic_prime // 40320\n else:\n args.epoch_count = args.magic_prime // 40320\n args.epoch_steps = 40320 // args.real_bsz\n assert args.epoch_steps * args.real_bsz == 40320\n # if args.my_pile_stage == 2:\n # assert args.lr_final == args.lr_init\n if args.my_pile_stage >= 2: # find latest saved model\n list_p = []\n for p in os.listdir(args.proj_dir):\n if p.startswith(\"rwkv\") and p.endswith(\".pth\"):\n p = ((p.split(\"-\"))[1].split(\".\"))[0]\n if p != \"final\":\n if p == \"init\":\n p = -1\n 
else:\n p = int(p)\n list_p += [p]\n list_p.sort()\n max_p = list_p[-1]\n if len(list_p) > 1:\n args.my_pile_prev_p = list_p[-2] # in case max_p is corrupted", + "type": "code", + "location": "/RWKV-v4neo/train.py:209-233" + }, + "959": { + "file_id": 49, + "content": "This code sets the epoch count based on magic_prime and my_qa_mask, determines epoch steps for batch size, asserts that their product equals 40320, and checks if my_pile_stage is 2 to find the latest saved model. If my_pile_stage >= 2, it lists all models in proj_dir, sorts them, sets max_p as last one, and my_pile_prev_p if there are more than one.", + "type": "comment" + }, + "960": { + "file_id": 49, + "content": " if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n if args.warmup_steps < 0:\n if args.my_pile_stage == 2:\n args.warmup_steps = 10\n else:\n args.warmup_steps = 30\n args.epoch_begin = max_p + 1\n samples_per_epoch = args.epoch_steps * args.real_bsz\n tokens_per_epoch = samples_per_epoch * args.ctx_len\n try:\n deepspeed_version = deepspeed.__version__\n except:\n deepspeed_version = None\n pass\n rank_zero_info(\n f\"\"\"\n############################################################################\n#\n# RWKV-4 {args.precision.upper()} on {args.num_nodes}x{args.devices} {args.accelerator.upper()}, bsz {args.num_nodes}x{args.devices}x{args.micro_bsz}={args.real_bsz}, {args.strategy} {'with grad_cp' if args.grad_cp > 0 else ''}\n#\n# Data = {args.data_file} ({args.data_type}), ProjDir = {args.proj_dir}", + "type": "code", + "location": "/RWKV-v4neo/train.py:234-258" + }, + "961": { + "file_id": 49, + "content": "If max_p is -1, the model will be loaded from rwkv-init.pth in args.proj_dir. Else, it will load from rwkv-{max_p}.pth in args.proj_dir. If warmup_steps is less than 0, set warmup_steps depending on my_pile_stage. 
Calculate samples_per_epoch and tokens_per_epoch based on epoch_steps and ctx_len respectively. Retrieve deepspeed version. Display rank_zero_info message with relevant information about the model, data, and project directory.", + "type": "comment" + }, + "962": { + "file_id": 49, + "content": "#\n# Epoch = {args.epoch_begin} to {args.epoch_begin + args.epoch_count - 1} (will continue afterwards), save every {args.epoch_save} epoch\n#\n# Each \"epoch\" = {args.epoch_steps} steps, {samples_per_epoch} samples, {tokens_per_epoch} tokens\n#\n# Model = {args.n_layer} n_layer, {args.n_embd} n_embd, {args.ctx_len} ctx_len\n#\n# Adam = lr {args.lr_init} to {args.lr_final}, warmup {args.warmup_steps} steps, beta {args.betas}, eps {args.adam_eps}\n#\n# Found torch {torch.__version__}, recommend 1.13.1+cu117 or newer\n# Found deepspeed {deepspeed_version}, recommend 0.7.0 (faster than newer versions)\n# Found pytorch_lightning {pl.__version__}, recommend 1.9.5\n#\n############################################################################\n\"\"\"\n )\n rank_zero_info(str(vars(args)) + \"\\n\")\n assert args.data_type in [\"utf-8\", \"utf-16le\", \"numpy\", \"binidx\", \"dummy\", \"wds_img\", \"uint16\"]\n if args.lr_final == 0 or args.lr_init == 0:\n rank_zero_info(\"\\n\\nNote: lr_final = 0 or lr_init = 0. Using linear LR schedule instead.\\n\\n\")", + "type": "code", + "location": "/RWKV-v4neo/train.py:259-280" + }, + "963": { + "file_id": 49, + "content": "Code snippet defines variables for epochs, steps, samples, tokens, model layers, embedding size, context length, learning rate schedule, warmup steps, beta, and epsilon. It also mentions required Python libraries versions and their recommendations. 
The code asserts the data type and provides a note if either final or initial learning rate is zero, suggesting that it will use a linear learning rate schedule instead.", + "type": "comment" + }, + "964": { + "file_id": 49, + "content": " assert args.precision in [\"fp32\", \"tf32\", \"fp16\", \"bf16\"]\n os.environ[\"RWKV_FLOAT_MODE\"] = args.precision\n if args.precision == \"fp32\":\n for i in range(10):\n rank_zero_info(\"\\n\\nNote: you are using fp32 (very slow). Try bf16 / tf32 for faster training.\\n\\n\")\n if args.precision == \"fp16\":\n rank_zero_info(\"\\n\\nNote: you are using fp16 (might overflow). Try bf16 / tf32 for stable training.\\n\\n\")\n os.environ[\"RWKV_JIT_ON\"] = \"1\"\n if \"deepspeed_stage_3\" in args.strategy:\n os.environ[\"RWKV_JIT_ON\"] = \"0\"\n torch.backends.cudnn.benchmark = True\n torch.backends.cudnn.enabled = True\n if args.precision == \"fp32\":\n torch.backends.cudnn.allow_tf32 = False\n torch.backends.cuda.matmul.allow_tf32 = False\n else:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\n if \"32\" in args.precision:\n args.precision = 32\n elif args.precision == \"fp16\":\n args.precision = 16\n else:\n args.precision = \"bf16\"", + "type": "code", + "location": "/RWKV-v4neo/train.py:282-308" + }, + "965": { + "file_id": 49, + "content": "This code sets the precision argument, adjusts relevant environment variables and configurations for faster training with different precision types. If using fp32, it provides a note suggesting to use bf16 or tf32 for better performance. 
It also sets up cudnn settings and allows tf32 in certain precision cases.", + "type": "comment" + }, + "966": { + "file_id": 49, + "content": " ########################################################################################################\n from src.trainer import train_callback, generate_init_weight\n from src.dataset import MyDataset\n train_data = MyDataset(args)\n args.vocab_size = train_data.vocab_size\n if args.data_type == 'wds_img':\n from src.model_img import RWKV_IMG\n model = RWKV_IMG(args)\n else:\n from src.model import RWKV\n model = RWKV(args)\n if len(args.load_model) == 0 or args.my_pile_stage == 1: # shall we build the initial weights?\n init_weight_name = f\"{args.proj_dir}/rwkv-init.pth\"\n generate_init_weight(model, init_weight_name) # save initial weights\n args.load_model = init_weight_name\n rank_zero_info(f\"########## Loading {args.load_model}... ##########\")\n try:\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n load_keys = list(load_dict.keys())\n for k in load_keys:\n if k.startswith('_forward_module.'):", + "type": "code", + "location": "/RWKV-v4neo/train.py:310-335" + }, + "967": { + "file_id": 49, + "content": "Initializing and loading the RWKV model with specified arguments, generating initial weights if none are loaded or if at MyPile stage 1. 
Saving initial weights in specified directory and then attempting to load pre-trained model from given path.", + "type": "comment" + }, + "968": { + "file_id": 49, + "content": " load_dict[k.replace('_forward_module.','')] = load_dict[k]\n del load_dict[k]\n except:\n rank_zero_info(f\"Bad checkpoint {args.load_model}\")\n if args.my_pile_stage >= 2: # try again using another checkpoint\n max_p = args.my_pile_prev_p\n if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n args.epoch_begin = max_p + 1\n rank_zero_info(f\"Trying {args.load_model}\")\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n if args.load_partial == 1:\n load_keys = load_dict.keys()\n for k in model.state_dict():\n if k not in load_keys:\n load_dict[k] = model.state_dict()[k]\n model.load_state_dict(load_dict)\n if pl.__version__[0]=='2':\n trainer = Trainer(accelerator=args.accelerator,strategy=args.strategy,devices=args.devices,num_nodes=args.num_nodes,precision=args.precision,", + "type": "code", + "location": "/RWKV-v4neo/train.py:336-358" + }, + "969": { + "file_id": 49, + "content": "This code attempts to load a model checkpoint. It first checks if the provided checkpoint file exists, and if not, it tries another one. Then, it loads the dictionary of state parameters from the checkpoint into memory. If loading only part of the model, it also adds missing keys from the original model's state dictionary to the loaded dictionary. 
Finally, it loads the state dictionary into the model for training or inference.", + "type": "comment" + }, + "970": { + "file_id": 49, + "content": " logger=args.logger,callbacks=[train_callback(args)],max_epochs=args.max_epochs,check_val_every_n_epoch=args.check_val_every_n_epoch,num_sanity_val_steps=args.num_sanity_val_steps,\n log_every_n_steps=args.log_every_n_steps,enable_checkpointing=args.enable_checkpointing,accumulate_grad_batches=args.accumulate_grad_batches,gradient_clip_val=args.gradient_clip_val)\n else:\n trainer = Trainer.from_argparse_args(\n args,\n callbacks=[train_callback(args)],\n )\n if trainer.global_rank == 0:\n for n in model.state_dict():\n shape = model.state_dict()[n].shape\n shape = [i for i in shape if i != 1]\n if len(shape) > 1:\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {n}\")\n else:\n print(f\"{str(shape[0]).ljust(5)} {n}\")\n if \"deepspeed\" in args.strategy:\n trainer.strategy.config[\"zero_optimization\"][\"allgather_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n ", + "type": "code", + "location": "/RWKV-v4neo/train.py:359-378" + }, + "971": { + "file_id": 49, + "content": "The code creates a trainer object with specified arguments, including a callback for training. If the model has state dictionaries with shapes larger than 1D, it prints the shape and name of each such dictionary. 
The code then checks if the strategy used is \"deepspeed\" and sets the bucket size accordingly.", + "type": "comment" + }, + "972": { + "file_id": 49, + "content": " trainer.strategy.config[\"zero_optimization\"][\"reduce_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n # must set shuffle=False, persistent_workers=False (because worker is in another thread)\n data_loader = DataLoader(train_data, shuffle=False, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, drop_last=True)\n trainer.fit(model, data_loader)", + "type": "code", + "location": "/RWKV-v4neo/train.py:378-383" + }, + "973": { + "file_id": 49, + "content": "This code sets the bucket size for zero optimization and configures a data loader with specific parameters before fitting the model in a trainer.", + "type": "comment" + }, + "974": { + "file_id": 50, + "content": "/RWKV-v4neo/verify.py", + "type": "filepath" + }, + "975": { + "file_id": 50, + "content": "This code initializes and trains a RWKV language model, sets layers and context length, creates tokenizer, loads model file, and prints select RWKV-RNN outputs.", + "type": "summary" + }, + "976": { + "file_id": 50, + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\n# this is for verifying the results of different models and make sure they agree with each other\nimport os, sys, types\nimport numpy as np\nimport torch\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\ntry:\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = sys.argv[1]\nexcept:\n pass\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = False\ntorch.backends.cuda.matmul.allow_tf32 = False\nos.environ['RWKV_FLOAT_MODE'] = 'bf16' # bf16 or fp32\nos.environ['RWKV_RUN_DEVICE'] = 'cuda' # 
currently model_train requires CUDA\nRUN_DEVICE = os.environ['RWKV_RUN_DEVICE']\nTOKEN_MODE = 'pile'\nif TOKEN_MODE == 'pile':\n WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']\n MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221003-6783'", + "type": "code", + "location": "/RWKV-v4neo/verify.py:1-27" + }, + "977": { + "file_id": 50, + "content": "The code imports necessary libraries and sets environment variables for running the RWKV Language Model, which verifies results from different models to ensure consistency. It also specifies the device (CPU or GPU) to run the model and tokenization method (Pile).", + "type": "comment" + }, + "978": { + "file_id": 50, + "content": " n_layer = 32\n n_embd = 2560\n ctx_len = 1024\n UNKNOWN_CHAR = None\nfrom src.utils import TOKENIZER\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nif TOKEN_MODE == 'pile':\n tokenizer.vocab_size = 50277\n########################################################################################################\nos.environ[\"RWKV_JIT_ON\"] = \"1\"\nos.environ[\"RWKV_T_MAX\"] = str(ctx_len)\nfrom src.model_run import RWKV_RNN\nfrom src.model import RWKV\nargs = types.SimpleNamespace()\nargs.vocab_size = tokenizer.vocab_size\nargs.ctx_len = ctx_len\nargs.n_embd = n_embd\nargs.n_layer = n_layer\nargs.head_qk = 0\nargs.pre_ffn = 0\nargs.grad_cp = 0\nargs.my_pos_emb = 0\nmodel_train = RWKV(args).to(RUN_DEVICE)\nif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n model_train = model_train.half()\nelif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n model_train = model_train.bfloat16()\nprint('loading ' + MODEL_NAME)\nm2 = torch.load(MODEL_NAME + '.pth', map_location='cpu')\nmodel_train.load_state_dict(m2)\nif os.environ['RWKV_FLOAT_MODE'] == 'fp16':", + "type": "code", + "location": "/RWKV-v4neo/verify.py:28-66" + }, + "979": { + "file_id": 50, + "content": "This code initializes a RWKV model for training. 
It sets the number of layers, embedding dimension, and context length. The tokenizer is created based on the given word name and unknown character. The environment variables are set to define the maximum context length and enable JIT compilation. The model is then loaded from the specified file using appropriate float mode.", + "type": "comment" + }, + "980": { + "file_id": 50, + "content": " model_train = model_train.half()\nelif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n model_train = model_train.bfloat16()\nargs.MODEL_NAME = MODEL_NAME\nargs.RUN_DEVICE = RUN_DEVICE\nargs.FLOAT_MODE = os.environ['RWKV_FLOAT_MODE']\nmodel_rnn = RWKV_RNN(args)\n########################################################################################################\nprint(f\"\\nVerifying {os.environ['RWKV_RUN_DEVICE']} {os.environ['RWKV_FLOAT_MODE']}\")\n# context = '\\nIn a'\ncontext = '\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'\nif TOKEN_MODE == 'pile':\n ctx = tokenizer.tokenizer.encode(context)\nprint(f'input len {len(ctx)} data {ctx}')\n########################################################################################################\nwith torch.no_grad():\n print('\\nRWKV-train output')\n out = model_train.forward(torch.tensor([ctx]).to(RUN_DEVICE))[0].detach().cpu().float().numpy()", + "type": "code", + "location": "/RWKV-v4neo/verify.py:67-91" + }, + "981": { + "file_id": 50, + "content": "This code checks the RWKV_FLOAT_MODE environment variable and sets the appropriate float mode for the model_train. 
It then verifies the device and float mode, encodes a context string into tokens using the tokenizer, and generates output from the model_train in forward pass with no gradient calculation (torch.no_grad()).", + "type": "comment" + }, + "982": { + "file_id": 50, + "content": " print(out, '\\n')\n print('\\nRWKV-RNN output')\n state = None\n out = None\n src_len = len(ctx)\n for i in range(src_len):\n x = ctx[:i+1]\n out, state = model_rnn.forward(x, state)\n if i < 3 or i >= src_len - 3:\n print(out.detach().cpu().numpy())\n if i == 2:\n print('...')", + "type": "code", + "location": "/RWKV-v4neo/verify.py:92-104" + }, + "983": { + "file_id": 50, + "content": "This code prints RWKV-RNN output at certain positions in the sequence. It uses a for loop to iterate through the context, calling the forward function of model_rnn. The first three outputs and the third one are printed using `print(out.detach().cpu().numpy())`, with ellipsis ('...') printed after the second output.", + "type": "comment" + }, + "984": { + "file_id": 51, + "content": "/RWKV-v5/cuda/wkv5_cuda.cu", + "type": "filepath" + }, + "985": { + "file_id": 51, + "content": "This CUDA code optimizes neural network forward pass with shared memory, efficient matrix operations, and parallel computation. It performs convolution using kernel functions, shared memory, synchronization, and unrolled loops. 
Assertions ensure efficient GPU computation.", + "type": "summary" + }, + "986": { + "file_id": 51, + "content": "#include\n#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C, const int H,\n const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const F *__restrict__ _u,\n F *__restrict__ const _y)\n{\n const int b = blockIdx.x / H;\n const int h = blockIdx.x % H;\n const int i = threadIdx.x;\n _w += h*_N_;\n _u += h*_N_;\n __shared__ float r[_N_], k[_N_], u[_N_], w[_N_];\n float state[_N_] = {0};\n __syncthreads();\n w[i] = _w[i];\n u[i] = float(_u[i]);\n __syncthreads();\n for (int t = b*T*C + h*_N_ + i; t < (b+1)*T*C + h*_N_ + i; t += C)\n {\n __syncthreads();\n r[i] = float(_r[t]);\n k[i] = float(_k[t]);\n __syncthreads();\n const float v = float(_v[t]);\n float y = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j+=4)", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:1-36" + }, + "987": { + "file_id": 51, + "content": "Code implements a CUDA kernel for the forward pass of a neural network layer, where each thread calculates output values based on input data and pre-stored parameters. It uses shared memory to store intermediate results and synchronizes threads with `__syncthreads()`. 
The loop iterates over time steps, applying element-wise operations to calculate output values.", + "type": "comment" + }, + "988": { + "file_id": 51, + "content": " {\n const float4& r_ = (float4&)(r[j]);\n const float4& k_ = (float4&)(k[j]);\n const float4& w_ = (float4&)(w[j]);\n const float4& u_ = (float4&)(u[j]);\n float4& s = (float4&)(state[j]);\n float4 x;\n x.x = k_.x * v;\n x.y = k_.y * v;\n x.z = k_.z * v;\n x.w = k_.w * v;\n y += r_.x * (u_.x * x.x + s.x);\n y += r_.y * (u_.y * x.y + s.y);\n y += r_.z * (u_.z * x.z + s.z);\n y += r_.w * (u_.w * x.w + s.w);\n s.x = s.x * w_.x + x.x;\n s.y = s.y * w_.y + x.y;\n s.z = s.z * w_.z + x.z;\n s.w = s.w * w_.w + x.w;\n }\n _y[t] = F(y);\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C, const int H,\n const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const float *__restrict__ __w, const F *__restrict__ _u, const F *__restrict__ const _gy,", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:37-66" + }, + "989": { + "file_id": 51, + "content": "This code is performing a matrix multiplication operation using CUDA. It takes in four input matrices, calculates the dot product between two sets of vectors, and updates the state vector accordingly. 
The result is then passed to a function F for further processing.", + "type": "comment" + }, + "990": { + "file_id": 51, + "content": " F *__restrict__ const _gr, F *__restrict__ const _gk, F *__restrict__ const _gv, F *__restrict__ const _gw, F *__restrict__ const _gu)\n{\n const int b = blockIdx.x / H;\n const int h = blockIdx.x % H;\n const int i = threadIdx.x;\n _w += h*_N_;\n _u += h*_N_;\n __w += h*_N_;\n __shared__ float w_[_N_], u_[_N_];\n __shared__ float r[_N_], k[_N_], v[_N_], gy[_N_];\n __syncthreads();\n w_[i] = _w[i];\n u_[i] = float(_u[i]);\n __syncthreads();\n const float w = w_[i];\n const float ww = __w[i];\n const float u = u_[i];\n float state[_N_] = {0}, saaaa[_N_] = {0}, sbbbb[_N_] = {0}, scccc[_N_] = {0}, sdddd[_N_] = {0};\n float gw = 0, gu = 0;\n const int t000 = b*T*C + h*_N_ + i;\n const int t111 = (b+1)*T*C + h*_N_ + i;\n const int t222 = t111 - 2*C;\n for (int t = t000; t < t111; t += C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t]);\n __syncthreads();\n const float k = float(_k[t]);\n float gr = 0, gu_ = 0;\n #pragma unroll", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:67-104" + }, + "991": { + "file_id": 51, + "content": "This function calculates the recurrent weight updates in a neural network using CUDA. It uses shared memory for efficient parallel computation and synchronizes threads with `__syncthreads()`. The variables `w`, `u` represent input and output tensors, while `v` and `gy` store intermediate results. 
The loop iterates over the time dimension (T) and channel dimension (C).", + "type": "comment" + }, + "992": { + "file_id": 51, + "content": " for (int j = 0; j < _N_; j++)\n {\n float& s = state[j];\n float x = k * v[j];\n gr += (u * x + s) * gy[j];\n gu_ += x * gy[j];\n s = s * w + x;\n }\n _gr[t] = F(gr);\n gu += float(_r[t]) * gu_;\n }\n _gu[b*C + h*_N_ + i] = F(gu);\n for (int t = t000; t < t222; t += C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t + 2*C]);\n __syncthreads();\n const float k = float(_k[t]);\n float gw_ = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = saaaa[j];\n float& s2 = sbbbb[j];\n float x = k * v[j];\n float tmp = w * (x + s);\n s = tmp;\n s2 = tmp + w * s2;\n gw_ += s2 * gy[j];\n }\n gw += float(_r[t + 2*C]) * gw_;\n } \n _gw[b*C + h*_N_ + i] = F(ww * gw);\n for (int t = t111 - C; t >= t000; t -= C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t]);", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:105-149" + }, + "993": { + "file_id": 51, + "content": "This code appears to be performing element-wise operations on arrays, such as adding and multiplying elements. It uses a for loop to iterate through the array elements, with each iteration updating the state variables and accumulating results for later use. The code also includes synchronization points (__syncthreads) for thread coordination, and it utilizes arrays with names like _gr, _gu, and _gw for storing intermediate and final results. 
Overall, this code seems to be part of a larger computation involving array manipulation and synchronization.", + "type": "comment" + }, + "994": { + "file_id": 51, + "content": " __syncthreads();\n const float rr = float(_r[t]);\n float gk = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = scccc[j];\n float x = rr * gy[j];\n gk += (u * x + s) * v[j];\n s = x + s * w;\n }\n _gk[t] = F(gk);\n }\n for (int t = t111 - C; t >= t000; t -= C)\n {\n __syncthreads();\n r[i] = float(_r[t]);\n k[i] = float(_k[t]);\n __syncthreads();\n const float gyy = float(_gy[t]);\n float gv = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = sdddd[j];\n float x = gyy * r[j];\n gv += (u_[j] * x + s) * k[j];\n s = x + s * w_[j];\n }\n _gv[t] = F(gv);\n }\n}\nvoid cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y)\n{\n assert(H*_N_ == C);\n assert(_N_%4 == 0);\n kernel_forward<< >>(B, T, C, H, r, k, v, w, u, y);\n}\nvo", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:150-197" + }, + "995": { + "file_id": 51, + "content": "The code is performing a convolution operation in CUDA. It first executes a kernel function that processes the data in blocks and threads, then calls another kernel function to compute the final results. The code uses shared memory, synchronization, and unrolling of loops for efficient computation. 
The `cuda_forward` function initializes the necessary variables and launches the kernel functions with appropriate dimensions.", + "type": "comment" + }, + "996": { + "file_id": 51, + "content": "id cuda_backward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, float *ww, bf16 *u, bf16 *gy, bf16 *gr, bf16 *gk, bf16 *gv, bf16 *gw, bf16 *gu)\n{\n assert(H*_N_ == C);\n assert(_N_%4 == 0);\n kernel_backward<< >>(B, T, C, H, r, k, v, w, ww, u, gy, gr, gk, gv, gw, gu);\n}", + "type": "code", + "location": "/RWKV-v4neo/cuda/wkv5_cuda.cu:197-202" + }, + "997": { + "file_id": 51, + "content": "Function `cuda_backward` is a CUDA kernel launcher, taking input/output parameters and invoking the `kernel_backward` kernel with appropriate grid and block dimensions. The assertions ensure correct memory layouts and alignment for efficient GPU computation.", + "type": "comment" + }, + "998": { + "file_id": 52, + "content": "/RWKV-v5/cuda/wkv5_op.cpp", + "type": "filepath" + }, + "999": { + "file_id": 52, + "content": "This C++ code implements forward and backward neural network operations using PyTorch tensors, optimized for CUDA execution. 
It includes functions for BFloat16 data type, with Python module \"wkv5\" for forward and backward operations.", + "type": "summary" + } +} \ No newline at end of file diff --git a/docs/data/titles/0.json b/docs/data/titles/0.json new file mode 100644 index 00000000..3ba9c087 --- /dev/null +++ b/docs/data/titles/0.json @@ -0,0 +1,302 @@ +{ + "/README.md": "RWKV Language Model: Enhanced Features and Efficient Sampling", + "/README.md:1-26": "Install, Prepare, Train RWKV Language Model", + "/README.md:114-128": "RWKV vs GPT2: Performance, Locality & Training", + "/README.md:128-141": "RWKV: Efficient Language Model on Various Hardware", + "/README.md:141-150": "Adaptable RNN Model for Edge Devices", + "/README.md:150-161": "RWKV-4a Improvements and Results", + "/README.md:163-184": "Quick Start Guide for RWKV-LM", + "/README.md:186-196": "Training RWKV-4 with Script and Dataset", + "/README.md:198-218": "RWKV-4 Pile Fine-Tuning Guide", + "/README.md:220-237": "RWKV Embeddings: Statistic Collection & Classifier Training", + "/README.md:238-245": "Matrix Equation with Three Rows and Variable Weights", + "/README.md:246-265": "Matrix Vector Calculation Algorithm", + "/README.md:266-284": "Learnable Parameters for RWKV-6 Model Mixing", + "/README.md:28-38": "RWKV: Efficient Transformer-RNN Hybrid", + "/README.md:285-306": "Time-Dependent Mixing and Shifting in RWKV-7", + "/README.md:306-325": "Exploring Decay, Lie Groups, and Optimization Techniques in Transformers", + "/README.md:326-342": "Image-Language Model: Position Embedding and Token Shift", + "/README.md:344-375": "Improved Tokenization via Hardcoded Channels", + "/README.md:377-401": "Enhancing RWKV Initial States for Model Discovery", + "/README.md:40-61": "RWKV Model Setup and Forward Passes", + "/README.md:401-413": "Transformer Performance Optimizations", + "/README.md:415-441": "RWKV-v2-RNN Architecture: Time-Decay Curve Improvements", + "/README.md:442-469": "RWKV-3 GPT Model: LN After Embedding", + 
"/README.md:471-496": "Implementing ATT Mechanism in RWKV-3 Model", + "/README.md:497-528": "Time-Mixing Convolution Operations", + "/README.md:529-557": "Efficient FFN Operation for GPT", + "/README.md:557-563": "Time-Series Prediction Formula for RWKV", + "/README.md:563-567": "Weighted Sum Calculation in GPT", + "/README.md:567-572": "Sigmoid-weighted RWKV Contribution Formula", + "/README.md:572-578": "Time-Decay Exponential RNN Formula", + "/README.md:578-582": "Sigmoid-Exponentiated Matrix Calculation", + "/README.md:582-596": "Matrix-Exponential-Based RWKV", + "/README.md:598-609": "Efficient Image Processing with LM Loss", + "/README.md:610-627": "Prime Sampling for Deterministic Randomness", + "/README.md:62-90": "Explore RWKV Projects and Resources", + "/README.md:629-645": "Efficient Learning Rate Scheduling with Top-p Sampling", + "/README.md:647-656": "RWKV v1 Architecture: Time-Mix and Channel-Mix Layers", + "/README.md:656-668": "Calculating RWKV's TM and CM Components", + "/README.md:669-685": "Fast & Stable Convergence with Zero Initialization", + "/README.md:687-711": "Token-Shift Enhancement for LMs", + "/README.md:711-724": "Head-QK Trick in Transformer Model", + "/README.md:725-754": "One-hot multiplication and sampling improvement", + "/README.md:756-785": "Attention Mechanisms: RWKV Outperforms VRAM", + "/README.md:92-114": "RWKV Model Resources and Community", + "/RWKV-v1/src/model.py": "Rotary Transformer-Based RWKV Model Initialization", + "/RWKV-v1/src/model.py:1-22": "Fancy Initialization for Linear and Embedding Layers", + "/RWKV-v1/src/model.py:127-158": "Attention Mechanisms for RWKV Model", + "/RWKV-v1/src/model.py:159-180": "RWKV-v1 Attention Model Implementation", + "/RWKV-v1/src/model.py:181-203": "Multi-Head Attention with Rotary Embedding", + "/RWKV-v1/src/model.py:204-230": "Rotary Multi-Head Attention in RWKV", + "/RWKV-v1/src/model.py:23-46": "Neural Network Gain and Scale Initialization", + "/RWKV-v1/src/model.py:232-253": 
"Initializing Transformer Model", + "/RWKV-v1/src/model.py:254-270": "Rotary Attention Encoding", + "/RWKV-v1/src/model.py:271-299": "Multi-Head Attention Module Definition", + "/RWKV-v1/src/model.py:300-320": "Transformer Attention Mechanism Initialization", + "/RWKV-v1/src/model.py:322-339": "Attention Model in Transformer Architecture", + "/RWKV-v1/src/model.py:340-352": "Rotary Positional Encoding Attention", + "/RWKV-v1/src/model.py:354-385": "GPT Normalization Layers: RMSNorm and FixedNorm", + "/RWKV-v1/src/model.py:386-414": "Dynamic Transformer Blocks for RWKV-v1", + "/RWKV-v1/src/model.py:415-445": "RWKV Model Initialization", + "/RWKV-v1/src/model.py:446-472": "RWKV Model Initialization", + "/RWKV-v1/src/model.py:473-489": "Organizing Model Parameters for Weight Decay", + "/RWKV-v1/src/model.py:48-70": "RWKV TimeMix Initialization", + "/RWKV-v1/src/model.py:490-517": "Attention Mechanism in RWKV Model", + "/RWKV-v1/src/model.py:71-94": "Multi-Head Attention Layer Initialization", + "/RWKV-v1/src/model.py:96-125": "Forward Pass: Element-wise Operations and Layer Normalization", + "/RWKV-v1/src/trainer.py": "Comprehensive Trainer for RWKV-v1", + "/RWKV-v1/src/trainer.py:1-35": "Trainer Class Initialization", + "/RWKV-v1/src/trainer.py:100-119": "Dynamic Learning Rate Adjustment", + "/RWKV-v1/src/trainer.py:120-130": "Progress Bar Initialization and Training Epochs", + "/RWKV-v1/src/trainer.py:36-57": "Trainer Class Initialization", + "/RWKV-v1/src/trainer.py:58-81": "Training Model with Optimizer and Calculating Average Loss", + "/RWKV-v1/src/trainer.py:83-99": "Trainer: Backpropagation and Learning Rate Decay", + "/RWKV-v1/src/utils.py": "Text Generation with Top-k, Top-p, and Temperature Scaling", + "/RWKV-v1/src/utils.py:1-32": "Top-k/p Sampling Functions", + "/RWKV-v1/src/utils.py:33-50": "Model Sampling with Logits and Top-K/P", + "/RWKV-v1/train.py": "Training RWKV-v1 Language Model", + "/RWKV-v1/train.py:1-21": "RWKV Model Setup & Configuration", + 
"/RWKV-v1/train.py:118-135": "GPT Model Initialization and Training", + "/RWKV-v1/train.py:136-142": "Trainer Initialization and Training Savings", + "/RWKV-v1/train.py:23-43": "RWKV-v1 Training Settings", + "/RWKV-v1/train.py:45-68": "RWKV-v1 Hyperparameter Setup", + "/RWKV-v1/train.py:69-87": "Initialize RWKV Model Parameters", + "/RWKV-v1/train.py:88-117": "Train RWKV-LM: Vocab JSON and Dictionaries", + "/RWKV-v2-RNN/cuda/timex_cuda.cu": "CUDA-Optimized RWKV-v2 RNN Dot Products", + "/RWKV-v2-RNN/cuda/timex_cuda.cu:1-33": "CUDA-Optimized RWKV-v2-RNN Forward Pass", + "/RWKV-v2-RNN/cuda/timex_cuda.cu:131-163": "Matrix Variable Updating with Unroll Optimization", + "/RWKV-v2-RNN/cuda/timex_cuda.cu:164-172": "CUDA Time Step Calculation", + "/RWKV-v2-RNN/cuda/timex_cuda.cu:34-63": "CUDA Dot Product Update Kernel", + "/RWKV-v2-RNN/cuda/timex_cuda.cu:64-93": "CUDA RNN Forward Pass Calculation", + "/RWKV-v2-RNN/cuda/timex_cuda.cu:94-130": "Shared Memory Optimized RNN CUDA Calculation", + "/RWKV-v2-RNN/cuda/timex_op.cpp": "CUDA Kernel for Timex Operations in PyTorch", + "/RWKV-v2-RNN/cuda/timex_op.cpp:1-20": "CUDA-Accelerated TimeX Operations for PyTorch", + "/RWKV-v2-RNN/cuda/timex_op.cpp:21-21": "Closing Brace in Function/Class Definitions", + "/RWKV-v2-RNN/run.py": "RWKV Model Training Script", + "/RWKV-v2-RNN/run.py:1-31": "RWKV-v2 RNN Initialization", + "/RWKV-v2-RNN/run.py:121-133": "Model Load or Train and Evaluate Script", + "/RWKV-v2-RNN/run.py:32-58": "Initializing RWKV Model Parameters", + "/RWKV-v2-RNN/run.py:59-89": "Evaluating Model on Dataset with Random Samples", + "/RWKV-v2-RNN/run.py:91-120": "RWKV-v2 RNN: First Run and Saving/Loading States", + "/RWKV-v2-RNN/src/model.py": "RWKV v2-RNN Optimization with GPT", + "/RWKV-v2-RNN/src/model.py:1-23": "RWKV v2-RNN Model Setup", + "/RWKV-v2-RNN/src/model.py:102-129": "RWKV TimeMix Module Initialization", + "/RWKV-v2-RNN/src/model.py:130-147": "Time-Decay Transformer Initialization", + 
"/RWKV-v2-RNN/src/model.py:148-177": "RWKV-v2 Model Initialization", + "/RWKV-v2-RNN/src/model.py:178-207": "RWKV Channel Mix Module", + "/RWKV-v2-RNN/src/model.py:208-242": "Defining GPT Model with RWKV Blocks", + "/RWKV-v2-RNN/src/model.py:23-47": "TimeX CUDA Implementation", + "/RWKV-v2-RNN/src/model.py:243-275": "GPT Model: RWKV-ffnPre Implementation", + "/RWKV-v2-RNN/src/model.py:276-304": "RWKV Model Configuration and Optimization", + "/RWKV-v2-RNN/src/model.py:305-329": "Optimizing Model Parameters with Weight Decay", + "/RWKV-v2-RNN/src/model.py:330-349": "Model Forward Pass Code", + "/RWKV-v2-RNN/src/model.py:48-73": "RWKV Model Initialization", + "/RWKV-v2-RNN/src/model.py:74-101": "Dynamic Layer Initialization in RWKV-v2 RNN", + "/RWKV-v2-RNN/src/model_run.py": "RWKV-v2 RNN Model Initialization", + "/RWKV-v2-RNN/src/model_run.py:1-37": "RWKV-RNN Model Initialization", + "/RWKV-v2-RNN/src/model_run.py:131-143": "Reshape and Multiply for Context", + "/RWKV-v2-RNN/src/model_run.py:38-69": "Nested Dictionary Storage with Clear, Save, Load Functions", + "/RWKV-v2-RNN/src/model_run.py:70-98": "Neural Network Model Methods", + "/RWKV-v2-RNN/src/model_run.py:99-130": "RWKV Model Language Generation", + "/RWKV-v2-RNN/src/trainer.py": "Trainer Config and Training Functionality", + "/RWKV-v2-RNN/src/trainer.py:1-36": "TrainerConfig: Params & Efficient GPU Setup", + "/RWKV-v2-RNN/src/trainer.py:118-132": "Learning Rate Multiplier Calculation", + "/RWKV-v2-RNN/src/trainer.py:133-154": "Learning Rate Config and Logging", + "/RWKV-v2-RNN/src/trainer.py:156-170": "Epoch-wise Token Counter and Training", + "/RWKV-v2-RNN/src/trainer.py:37-67": "Wandb-Assisted Trainer Initialization", + "/RWKV-v2-RNN/src/trainer.py:68-92": "Model Training Initialization and Configuration", + "/RWKV-v2-RNN/src/trainer.py:93-117": "Training DataLoader with Disabled Shuffling", + "/RWKV-v2-RNN/src/utils.py": "RWKV v2-RNN Dataset Class", + "/RWKV-v2-RNN/src/utils.py:1-33": "Custom Dataset 
Class for RWKV v2-RNN", + "/RWKV-v2-RNN/src/utils.py:34-59": "RWKV Tokenizer Class", + "/RWKV-v2-RNN/src/utils.py:60-90": "RWKV-v2 RNN Word Table Initialization", + "/RWKV-v2-RNN/src/utils.py:92-122": "Sample from Probabilities", + "/RWKV-v2-RNN/train.py": "RWKV-v2 RNN/GPT Model Training and Optimization", + "/RWKV-v2-RNN/train.py:1-29": "RWKV-v2 RNN Training Script", + "/RWKV-v2-RNN/train.py:30-56": "Training RWKV Model Parameters", + "/RWKV-v2-RNN/train.py:57-82": "GPT Model Training with RWKV-v2-RNN", + "/RWKV-v2-RNN/train.py:84-98": "Train and Save RWKV-v2 Model", + "/RWKV-v3/cuda/timex_cuda.cu": "CUDA RNN Dot Product Optimization", + "/RWKV-v3/cuda/timex_op.cpp": "Timex Operations CUDA Kernel", + "/RWKV-v3/run.py": "RWKV-v3 Text Generation & Tokenization", + "/RWKV-v3/run.py:1-30": "RWKV Model Setup", + "/RWKV-v3/run.py:31-58": "Loading and Tokenizing RWKV Model", + "/RWKV-v3/run.py:58-89": "Model Preparation and Processing", + "/RWKV-v3/run.py:90-98": "RWKV Text Generation and Timing", + "/RWKV-v3/src/model.py": "Efficient Time-based RWKV LM with GPT Model", + "/RWKV-v3/src/model.py:1-25": "RWKV Language Model: PyTorch CUDA Implementation", + "/RWKV-v3/src/model.py:128-148": "Attention Mechanism Initialization in RWKV-v3 Model", + "/RWKV-v3/src/model.py:150-175": "Time-Mixing Attention Model Initialization", + "/RWKV-v3/src/model.py:176-204": "RWKV Channel Mix Module", + "/RWKV-v3/src/model.py:205-233": "RWKV-v3 Time-based Mixing GPT Model", + "/RWKV-v3/src/model.py:234-266": "RWKV Model Block Definition", + "/RWKV-v3/src/model.py:267-297": "GPT Model Initialization", + "/RWKV-v3/src/model.py:27-50": "TimeX Operation Class", + "/RWKV-v3/src/model.py:299-324": "Weight Decay and Optimizer Configuring in Model", + "/RWKV-v3/src/model.py:325-354": "RWKV Model and Optimization", + "/RWKV-v3/src/model.py:355-363": "Model Output and Loss Calculation", + "/RWKV-v3/src/model.py:51-69": "RWKV-v3 Weight Initialization", + "/RWKV-v3/src/model.py:70-98": "Dynamic Weight 
Initialization for Neural Network Layers", + "/RWKV-v3/src/model.py:99-127": "RWKV TimeMix: Orthogonal/Normal Initialization", + "/RWKV-v3/src/model_run.py": "RWKV-v3 Transformer Model Initialization", + "/RWKV-v3/src/model_run.py:1-30": "RWKV Channel Mixing Module", + "/RWKV-v3/src/model_run.py:116-146": "RWKV-GPT Model Architecture", + "/RWKV-v3/src/model_run.py:147-177": "RWKV-v3 Model Initialization", + "/RWKV-v3/src/model_run.py:179-207": "Loading and Initializing RWKV_RNN Model", + "/RWKV-v3/src/model_run.py:208-238": "Hierarchical Data Storage Class", + "/RWKV-v3/src/model_run.py:239-264": "Functions for RWKV-v3 Model", + "/RWKV-v3/src/model_run.py:265-294": "Layered LN, SA, FF Model Run", + "/RWKV-v3/src/model_run.py:296-319": "Context-Aware Attention Calculations", + "/RWKV-v3/src/model_run.py:32-57": "RWKV-v3 Model: Forward and TimeMix Initialization", + "/RWKV-v3/src/model_run.py:58-85": "Transformer Model Initialization and Forward Pass", + "/RWKV-v3/src/model_run.py:85-115": "Transformer Block with Time-Mix and Channel-Mix", + "/RWKV-v3/src/trainer.py": "Trainer: CUDA Optimizations and Data Handling", + "/RWKV-v3/src/trainer.py:1-36": "Training Language Model with Trainer Config", + "/RWKV-v3/src/trainer.py:118-133": "Warmup and Exponential Learning Rate Calculation", + "/RWKV-v3/src/trainer.py:134-155": "Dynamic Learning Rate Adjustment", + "/RWKV-v3/src/trainer.py:157-171": "Train and Save RWKV-v3 Model with Decaying Learning Rate", + "/RWKV-v3/src/trainer.py:37-67": "Trainer Class: WandB, Learning Rate Decay", + "/RWKV-v3/src/trainer.py:68-92": "CUDA-Powered Model Trainer Initialization", + "/RWKV-v3/src/trainer.py:93-117": "DataLoader Iteration and Training Process", + "/RWKV-v3/src/utils.py": "RWKV Dataset and Tokenizer", + "/RWKV-v3/src/utils.py:1-34": "Custom Dataset Class for RWKV Model", + "/RWKV-v3/src/utils.py:35-62": "Tokenizer: Converting Text to Numerical Reps", + "/RWKV-v3/src/utils.py:64-95": "Refine Context and Calculate Probs", + 
"/RWKV-v3/src/utils.py:96-122": "Top-P Sampling in RWKV-LM Utils", + "/RWKV-v3/train.py": "Training RWKV-LM and GPT Models Efficiently", + "/RWKV-v3/train.py:1-22": "Training RWKV-LM with GPT Model", + "/RWKV-v3/train.py:113-118": "Saving Model State with Timestamp", + "/RWKV-v3/train.py:23-48": "Optimizing RWKV-v3 Training Parameters", + "/RWKV-v3/train.py:50-65": "Optimizing RWKV-v3 Training Parameters", + "/RWKV-v3/train.py:67-96": "RWKV Language Model Training Parameters", + "/RWKV-v3/train.py:97-112": "GPT Model Training with Hyperparameters", + "/RWKV-v3/verify.py": "RWKV-GPT Verification Script", + "/RWKV-v3/verify.py:1-31": "RWKV-LM Verification with GPT Architecture", + "/RWKV-v3/verify.py:32-61": "RWKV-GPT & RNN Model Outputs from Checkpoint", + "/RWKV-v3/verify.py:62-65": "Padding and Training Model with Detach and Print", + "/RWKV-v4/cuda/wkv_cuda.cu": "Efficient RWKV-v4 CUDA Kernel with Optimized Execution", + "/RWKV-v4/cuda/wkv_cuda.cu:1-29": "CUDA RNN Forward Pass Kernel Function", + "/RWKV-v4/cuda/wkv_cuda.cu:122-125": "Efficient GPU Kernel Execution", + "/RWKV-v4/cuda/wkv_cuda.cu:31-56": "Backward Propagation Kernel", + "/RWKV-v4/cuda/wkv_cuda.cu:58-97": "Calculating Gradients for WKV Model", + "/RWKV-v4/cuda/wkv_cuda.cu:99-121": "CUDA Matrix Operations with RWKV-v4", + "/RWKV-v4/cuda/wkv_op.cpp": "RWKV-v4 GPU WKV Operations", + "/RWKV-v4/cuda/wkv_op.cpp:1-14": "WaveGrad RWKV Model C++ Functions", + "/RWKV-v4/cuda/wkv_op.cpp:15-21": "Wavelet Quantized Variational Kalman Filter Implementation", + "/RWKV-v4/run.py": "RWKV Language Model Text Generation", + "/RWKV-v4/run.py:1-26": "RWKV Model Setup and Configuration", + "/RWKV-v4/run.py:110-142": "RWKV-v4 Text Generation and Model Saving", + "/RWKV-v4/run.py:143-149": "Tokenizing and Timing", + "/RWKV-v4/run.py:28-60": "Model Parameter Setter", + "/RWKV-v4/run.py:61-87": "RWKV Model Setup and Inference", + "/RWKV-v4/run.py:87-109": "RWKV Tokenizer & Processing Efficiency", + "/RWKV-v4/src/binidx.py": 
"Binary Indexed Dataset Class", + "/RWKV-v4/src/binidx.py:1-48": "Binary Indexing in RWKV-v4", + "/RWKV-v4/src/binidx.py:100-141": "Binary Data File Index Class", + "/RWKV-v4/src/binidx.py:142-170": "Numpy Buffer Memory View and Indexing", + "/RWKV-v4/src/binidx.py:171-203": "Binary Indexing Functions", + "/RWKV-v4/src/binidx.py:205-216": "Document Index and Data File Existence Check", + "/RWKV-v4/src/binidx.py:49-72": "Index File Reader and Validator", + "/RWKV-v4/src/binidx.py:73-98": "Binary File Loading with Numpy Memmap", + "/RWKV-v4/src/model.py": "Time-Mixing Channel Model for RWKV v4", + "/RWKV-v4/src/model.py:1-32": "L2Wrap Loss Calculation", + "/RWKV-v4/src/model.py:104-126": "Fancy Initialization for Model Layers", + "/RWKV-v4/src/model.py:127-156": "Efficient Weight Matrix Initialization for RWKV Models", + "/RWKV-v4/src/model.py:157-184": "RWKV TimeMix Layer Initialization", + "/RWKV-v4/src/model.py:185-206": "Attention Layer Initialization in Transformer Model", + "/RWKV-v4/src/model.py:208-242": "Time-Shifting Channel Mixing Model", + "/RWKV-v4/src/model.py:243-270": "Time-Mixing Transformer Layer Initialization", + "/RWKV-v4/src/model.py:271-303": "Customizable GPT Model with Blocks", + "/RWKV-v4/src/model.py:305-333": "GPT Model Class with Layer Normalization", + "/RWKV-v4/src/model.py:33-55": "WKV Class and Computations", + "/RWKV-v4/src/model.py:334-364": "Model Initialization and Configuration", + "/RWKV-v4/src/model.py:365-387": "Optimizer Initialization for Models", + "/RWKV-v4/src/model.py:389-414": "Multi-Head Attention with Cross-Entropy Loss", + "/RWKV-v4/src/model.py:56-84": "Tensor Mode Initialization and Backward Pass", + "/RWKV-v4/src/model.py:85-103": "CUDA Backward Pass for RWKV Model", + "/RWKV-v4/src/model_run.py": "Efficient Transformer Model Execution with RWKV-v4", + "/RWKV-v4/src/model_run.py:1-25": "RWKV Language Model: CUDA-Friendly NLP", + "/RWKV-v4/src/model_run.py:125-155": "Time-Based Mixing Layer in RWKV Model", + 
"/RWKV-v4/src/model_run.py:156-186": "RWKV-GPT Model Initialization and Forward Pass", + "/RWKV-v4/src/model_run.py:187-217": "RWKV Model Initialization and Forward Pass", + "/RWKV-v4/src/model_run.py:219-242": "Head-Multihead Attention Model Initialization", + "/RWKV-v4/src/model_run.py:243-272": "RWKV-v4 Model Weights Loading and Processing", + "/RWKV-v4/src/model_run.py:26-48": "RWKV Model Initialization", + "/RWKV-v4/src/model_run.py:273-306": "Dynamic Variable Class with LN and FF Operations", + "/RWKV-v4/src/model_run.py:307-330": "Attention Mechanism in Transformer Models", + "/RWKV-v4/src/model_run.py:332-366": "RWKV Model Function: LN-FFN-SA Layers", + "/RWKV-v4/src/model_run.py:367-392": "Feed-Forward Network and Layer Normalization in RWKV-v4", + "/RWKV-v4/src/model_run.py:49-72": "Forward and Backward Functions for RWKV Model", + "/RWKV-v4/src/model_run.py:73-96": "CUDA-powered RWKV Model", + "/RWKV-v4/src/model_run.py:98-124": "RWKV TimeMix: Time Decay & Mixing Model", + "/RWKV-v4/src/trainer.py": "RWKV Trainer: Training and Evaluation", + "/RWKV-v4/src/trainer.py:1-34": "RWKV Trainer Configuration", + "/RWKV-v4/src/trainer.py:111-136": "RWKV Model Training with Dataloader", + "/RWKV-v4/src/trainer.py:137-156": "RWKV Learning Rate Calculation", + "/RWKV-v4/src/trainer.py:157-179": "Average Loss Calculation and Updates", + "/RWKV-v4/src/trainer.py:181-187": "Epoch-based Model Saving and Logging", + "/RWKV-v4/src/trainer.py:35-63": "Trainer Class and Model Training", + "/RWKV-v4/src/trainer.py:64-87": "Loading Model and Initializing Logging", + "/RWKV-v4/src/trainer.py:89-109": "Configure and Load Data for Training or Testing", + "/RWKV-v4/src/utils.py": "RWKV Dataset Generation", + "/RWKV-v4/src/utils.py:1-29": "Initializing Dataset in RWKV-v4", + "/RWKV-v4/src/utils.py:108-137": "Softmax Sampling with Top-P Control", + "/RWKV-v4/src/utils.py:139-153": "Utility Functions", + "/RWKV-v4/src/utils.py:30-53": "Vocab Size Setter", + 
"/RWKV-v4/src/utils.py:55-81": "RWKV Tokenizer Class", + "/RWKV-v4/src/utils.py:82-106": "Tokenizer and Word Table Initialization", + "/RWKV-v4/train.py": "Efficient RWKV-v4 Training with DeepSpeed", + "/RWKV-v4/train.py:1-19": "RWKV-v4 Import and Setup", + "/RWKV-v4/train.py:115-135": "Optimizing RWKV-v4 Training Parameters", + "/RWKV-v4/train.py:136-168": "Model Training Parameters Configuration", + "/RWKV-v4/train.py:169-190": "Data Loading and Preparation", + "/RWKV-v4/train.py:191-209": "RWKV Trainer Setup and Config", + "/RWKV-v4/train.py:21-42": "Train RWKV Text Model: Data and Config", + "/RWKV-v4/train.py:210-233": "Trainer Type Determination: RWKV-v4/train.py", + "/RWKV-v4/train.py:234-263": "DeepSpeed RWKV-v4 Training Config", + "/RWKV-v4/train.py:264-280": "Environment-Guided DeepSpeed Strategy", + "/RWKV-v4/train.py:43-65": "RWKV-v4 Training Setup", + "/RWKV-v4/train.py:66-88": "Environment Variable Configuration", + "/RWKV-v4/train.py:90-114": "RWKV-v4 Hyperparameter Configuration", + "/RWKV-v4/verify.py": "Initialize and Compare RWKV Models", + "/RWKV-v4/verify.py:1-26": "Consistent Model Verification", + "/RWKV-v4/verify.py:27-57": "RWKV-v4 Model Initialization", + "/RWKV-v4/verify.py:58-81": "RWKV-GPT Tokenization and Model Output", + "/RWKV-v4/verify.py:82-90": "RWKV-v4 Model Output Check", + "/RWKV-v4neo/chat.py": "Multilingual RWKV Chatbot Environment", + "/RWKV-v4neo/chat.py:1-31": "RWKV Language Model Setup and Configuration", + "/RWKV-v4neo/chat.py:119-156": "RWKV-v4neo Chatbot Code", + "/RWKV-v4neo/chat.py:157-194": "RNN Inference and Reply Function", + "/RWKV-v4neo/chat.py:195-230": "Chatbot Processing Function", + "/RWKV-v4neo/chat.py:232-255": "Chatbot Response Generation with RWKV-v4neo", + "/RWKV-v4neo/chat.py:256-289": "RWKV Chat: Token Generation and Error Handling", + "/RWKV-v4neo/chat.py:290-319": "Chatbot RWKV-v4neo Model Interaction", + "/RWKV-v4neo/chat.py:32-59": "RWKV-v4neo Hyperparameter Configuration", + 
"/RWKV-v4neo/chat.py:320-344": "RWKV Text Generator with Tokenization", + "/RWKV-v4neo/chat.py:345-361": "Chat Data Saving and User Input Handling", + "/RWKV-v4neo/chat.py:59-86": "AI Conversation Samples and Commands", + "/RWKV-v4neo/chat.py:87-118": "Multi-Language RWKV Chatbot", + "/RWKV-v4neo/cuda/wkv5_cuda.cu": "CUDA-Optimized Convolutional Neural Network", + "/RWKV-v4neo/cuda/wkv5_cuda.cu:1-36": "CUDA Kernel for Neural Network Layer Forward Pass" +} \ No newline at end of file diff --git a/docs/data/titles/1.json b/docs/data/titles/1.json new file mode 100644 index 00000000..11ecdc5a --- /dev/null +++ b/docs/data/titles/1.json @@ -0,0 +1,302 @@ +{ + "/RWKV-v4neo/cuda/wkv5_cuda.cu:105-149": "Cuda Array Manipulation & Synchronization", + "/RWKV-v4neo/cuda/wkv5_cuda.cu:150-197": "CUDA Convolution: Kernel Functions and Shared Memory", + "/RWKV-v4neo/cuda/wkv5_cuda.cu:197-202": "CUDA Kernel Launcher: cuda_backward", + "/RWKV-v4neo/cuda/wkv5_cuda.cu:37-66": "Matrix Multiplication CUDA Implementation", + "/RWKV-v4neo/cuda/wkv5_cuda.cu:67-104": "CUDA Parallel Recurrent Weight Updates", + "/RWKV-v4neo/cuda/wkv5_op.cpp": "CUDA-Optimized PyTorch Neural Network Operations", + "/RWKV-v4neo/cuda/wkv5_op.cpp:1-12": "Neural Network CUDA Implementation", + "/RWKV-v4neo/cuda/wkv5_op.cpp:12-22": "WKV5 Python Operations", + "/RWKV-v4neo/cuda/wkv_cuda.cu": "Efficient CUDA Implementation of RWKV Model", + "/RWKV-v4neo/cuda/wkv_cuda.cu:1-31": "Matrix Multiplication with Kernel Function", + "/RWKV-v4neo/cuda/wkv_cuda.cu:129-133": "CUDA Kernel Configuration: Optimal Threads, Blocks Distribution", + "/RWKV-v4neo/cuda/wkv_cuda.cu:32-58": "Matrix Multiplication Kernel", + "/RWKV-v4neo/cuda/wkv_cuda.cu:59-94": "Variable Initialization and Iteration for WKV-Cuda", + "/RWKV-v4neo/cuda/wkv_cuda.cu:96-128": "CUDA Implementation of RWKV Model Passes", + "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu": "Cuda BFloat16 Neural Network Forward-Backward Pass", + "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:1-26": "BF16 
Forward Pass on GPU", + "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:121-132": "Optimized CUDA Matrix Operation Functions", + "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:28-53": "Gradient Computation for Deep Learning Models", + "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:54-85": "BF16 Softmax Multiplication CUDA Code", + "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu:86-120": "CUDA Neural Network Forward Pass", + "/RWKV-v4neo/cuda/wkv_op.cpp": "CUDA-Powered Torch Extensions for RWKV-v4neo", + "/RWKV-v4neo/cuda/wkv_op.cpp:1-10": "CUDA Torch Extension for RWKV-v4neo Forward and Backward Passes", + "/RWKV-v4neo/cuda/wkv_op.cpp:11-21": "PyBind11 Torch Module Definition", + "/RWKV-v4neo/cuda/wkv_op_bf16.cpp": "CUDA Functions for BF16 Torch Tensors", + "/RWKV-v4neo/cuda/wkv_op_bf16.cpp:1-14": "CUDA C++: BFloat16 Forward-Backward Functions", + "/RWKV-v4neo/cuda/wkv_op_bf16.cpp:15-25": "WKV Bindings for Forward and Backward Functions", + "/RWKV-v4neo/img_demoAE.py": "Binary Image Conversion with Neural Network", + "/RWKV-v4neo/img_demoAE.py:1-25": "Binary Conversion with RWKV", + "/RWKV-v4neo/img_demoAE.py:107-139": "Image Generation Neural Network Model with Mish Activation", + "/RWKV-v4neo/img_demoAE.py:140-165": "AE Image Generation and Processing", + "/RWKV-v4neo/img_demoAE.py:27-52": "R_ENCODER: Image Processing Class", + "/RWKV-v4neo/img_demoAE.py:54-82": "Neural Network Forward Pass Function", + "/RWKV-v4neo/img_demoAE.py:84-106": "RDECODER: Image Decoding ConvNet", + "/RWKV-v4neo/math_demo/run.py": "RWKV: Deep Learning NLP Model Development", + "/RWKV-v4neo/math_demo/run.py:1-25": "RWKV Language Model Tokenization", + "/RWKV-v4neo/math_demo/run.py:110-127": "Time-Mixing and Channel-Mixing Operations", + "/RWKV-v4neo/math_demo/run.py:128-150": "RWKV-v4neo: Tokenizing and Predicting Characters", + "/RWKV-v4neo/math_demo/run.py:26-53": "RWKV Tokenization and RNN Classes", + "/RWKV-v4neo/math_demo/run.py:54-77": "Channel-wise Mixing and Normalization Function", + "/RWKV-v4neo/math_demo/run.py:78-109": 
"Time-Dependent RNN Mixing and Forward Pass", + "/RWKV-v4neo/run.py": "Initialize and Run RWKV Language Model", + "/RWKV-v4neo/run.py:1-23": "RWKV Language Model Initialization", + "/RWKV-v4neo/run.py:119-156": "Initializing RWKV-v4neo Model for Text Generation", + "/RWKV-v4neo/run.py:157-194": "Neural Language Model Code Analysis", + "/RWKV-v4neo/run.py:196-227": "RWKV Token Prediction Model", + "/RWKV-v4neo/run.py:228-237": "Flushing Buffer Instantly", + "/RWKV-v4neo/run.py:24-56": "RWKV Model Configurator", + "/RWKV-v4neo/run.py:57-89": "RWKV-v4neo: Model Parameter Setup", + "/RWKV-v4neo/run.py:90-117": "AI Politics and History Chatbot", + "/RWKV-v4neo/src/binidx.py": "Memory-Mapped Indexed Dataset Handler", + "/RWKV-v4neo/src/binidx.py:1-48": "Indexed Dataset Handling in Binidx.py", + "/RWKV-v4neo/src/binidx.py:105-128": "Index File Analyzer", + "/RWKV-v4neo/src/binidx.py:129-158": "Binary Data Memory Management", + "/RWKV-v4neo/src/binidx.py:159-196": "Binary Index Class Definition", + "/RWKV-v4neo/src/binidx.py:197-226": "Cache-Backed Numpy Memory View", + "/RWKV-v4neo/src/binidx.py:227-261": "Binary Index Class and Methods", + "/RWKV-v4neo/src/binidx.py:262-269": "Prefetch Unsupported: Checks Index and Data File Existence", + "/RWKV-v4neo/src/binidx.py:49-76": "Binary File Header Writer", + "/RWKV-v4neo/src/binidx.py:78-104": "Binary File Writer Class", + "/RWKV-v4neo/src/dataset.py": "Custom PyTorch Dataset Management", + "/RWKV-v4neo/src/dataset.py:1-25": "Initializing MyDataset with \"binidx\"", + "/RWKV-v4neo/src/dataset.py:108-128": "Shuffling and Initializing Dataset", + "/RWKV-v4neo/src/dataset.py:129-150": "Robust WebDataset Initialization", + "/RWKV-v4neo/src/dataset.py:151-175": "Random Data Selection and Tensor Creation", + "/RWKV-v4neo/src/dataset.py:176-192": "Random Dataset Chunk Selection", + "/RWKV-v4neo/src/dataset.py:193-213": "Random Index Retrieval for Data Types", + "/RWKV-v4neo/src/dataset.py:214-234": "Random Index Generation and 
Pattern Detection", + "/RWKV-v4neo/src/dataset.py:236-245": "Filtering and Returning Data Snippets", + "/RWKV-v4neo/src/dataset.py:26-42": "Checks Version and Prepares Dataset", + "/RWKV-v4neo/src/dataset.py:43-60": "Tokenization Verification", + "/RWKV-v4neo/src/dataset.py:61-82": "Dataset Loading and Resizing", + "/RWKV-v4neo/src/dataset.py:83-107": "Dataset Construction in RWKV-v4neo", + "/RWKV-v4neo/src/model.py": "RWKV-v4neo Model: GPU Efficient Neural Network", + "/RWKV-v4neo/src/model.py:1-31": "Setting Up Torch Environment", + "/RWKV-v4neo/src/model.py:118-135": "CUDA-Powered BERT Transformer Gradients", + "/RWKV-v4neo/src/model.py:137-162": "RWKV TimeMix Module Class", + "/RWKV-v4neo/src/model.py:163-182": "Time-Related Model Component Parameters", + "/RWKV-v4neo/src/model.py:184-204": "Time-Shifted Model with Group Normalization", + "/RWKV-v4neo/src/model.py:204-228": "Attention Mechanism in Transformer Model Implemented", + "/RWKV-v4neo/src/model.py:229-251": "JIT Functions for Time Shifting and Tensor Operations", + "/RWKV-v4neo/src/model.py:252-281": "Layer Normalization in RWKV-v4neo", + "/RWKV-v4neo/src/model.py:283-311": "Tensor Manipulation and Reshaping", + "/RWKV-v4neo/src/model.py:312-334": "CUDA-RWKV5 Kernel Wrapper Function", + "/RWKV-v4neo/src/model.py:32-52": "CUDA-Powered WKV Class", + "/RWKV-v4neo/src/model.py:335-356": "Exponentiated Tensor Initialization and Backpropagation", + "/RWKV-v4neo/src/model.py:357-367": "Initializing Model Tensors and CUDA Calculations", + "/RWKV-v4neo/src/model.py:369-392": "RWKV TimeMix Initialization", + "/RWKV-v4neo/src/model.py:393-411": "Time-Learnable Parameters Initialization", + "/RWKV-v4neo/src/model.py:412-441": "Time-Shifted Mixing Linear Model", + "/RWKV-v4neo/src/model.py:442-472": "Introducing RWKV_TimeMix Class", + "/RWKV-v4neo/src/model.py:473-492": "Time-Based Transformer Layer Initialization", + "/RWKV-v4neo/src/model.py:493-510": "Transformer Model Initialization Code", + 
"/RWKV-v4neo/src/model.py:511-533": "Jit-Mix Timesteps for RWKV", + "/RWKV-v4neo/src/model.py:53-73": "Neural Model Computation and Backpropagation", + "/RWKV-v4neo/src/model.py:534-558": "Time-Shifting QKV Decomposition", + "/RWKV-v4neo/src/model.py:559-581": "RWKV Channel Mix Class Definition", + "/RWKV-v4neo/src/model.py:583-610": "MishGLU Class for Time-Shifting", + "/RWKV-v4neo/src/model.py:611-639": "Block Model with Linear Layers, Layer Normalization, and Mish Activation", + "/RWKV-v4neo/src/model.py:640-661": "Model Initialization and Layer Creation", + "/RWKV-v4neo/src/model.py:662-688": "Forward Model with Dropout, LayerNorm, Attention", + "/RWKV-v4neo/src/model.py:690-721": "Scaled Dot Product Attention in RWKV Model", + "/RWKV-v4neo/src/model.py:722-745": "Transformer Model Initialization", + "/RWKV-v4neo/src/model.py:74-93": "Neural Model CUDA Tensor Calculation", + "/RWKV-v4neo/src/model.py:747-775": "Optimizer Parameters Categorization", + "/RWKV-v4neo/src/model.py:777-797": "Layerwise Learning Rate Optimizer Groups", + "/RWKV-v4neo/src/model.py:798-810": "Optimizer Group Initialization for Model Parameters", + "/RWKV-v4neo/src/model.py:811-834": "Offloading Optimizer Model with FusedAdam", + "/RWKV-v4neo/src/model.py:835-860": "Checkpointing and Layer Execution Handler", + "/RWKV-v4neo/src/model.py:861-889": "Forward-Training Pass Model", + "/RWKV-v4neo/src/model.py:890-916": "Training Neural Network: Loss Calculation and All-Gather", + "/RWKV-v4neo/src/model.py:917-943": "Efficient Model Weight Initialization", + "/RWKV-v4neo/src/model.py:94-117": "Precise Model Data Type Conversion", + "/RWKV-v4neo/src/model.py:944-966": "Scale Factor Initialization", + "/RWKV-v4neo/src/model.py:967-981": "Orthogonal Initialization of Model Weights", + "/RWKV-v4neo/src/model_img.py": "Multi-Device RWKV-v4neo Model with VGG16", + "/RWKV-v4neo/src/model_img.py:1-30": "Efficient Deep Learning Model with L2pooling", + "/RWKV-v4neo/src/model_img.py:117-141": 
"Distance-Based Embedding Weights", + "/RWKV-v4neo/src/model_img.py:143-173": "Batch-Averaged Distance Scoring Model", + "/RWKV-v4neo/src/model_img.py:174-193": "Convolutional Layers and Batch Normalization", + "/RWKV-v4neo/src/model_img.py:194-221": "Convolutional Neural Network with Mish Activation", + "/RWKV-v4neo/src/model_img.py:222-245": "Decoding RWKV Model: Conv Layers and BatchNorm2d", + "/RWKV-v4neo/src/model_img.py:247-270": "RWKV-v4neo Model Architecture", + "/RWKV-v4neo/src/model_img.py:271-306": "RWKV-v4neo: LightningModule and Image Model", + "/RWKV-v4neo/src/model_img.py:307-332": "CLIP Model Initialization and Configuration", + "/RWKV-v4neo/src/model_img.py:31-63": "VGG16-Based Neural Network with Custom Convolutional Layer", + "/RWKV-v4neo/src/model_img.py:333-360": "Deepspeed-Optimizer: DeepSpeedCPUAdam or FusedAdam", + "/RWKV-v4neo/src/model_img.py:361-385": "Image Encoder-Decoder Model with Training Step", + "/RWKV-v4neo/src/model_img.py:386-413": "Loss Calculation in Deep Learning Model", + "/RWKV-v4neo/src/model_img.py:414-446": "Model Parameter Dictionary Creation", + "/RWKV-v4neo/src/model_img.py:64-86": "VGG-19 Feature Pyramid Model", + "/RWKV-v4neo/src/model_img.py:87-116": "Normalizing Parameters and Loading Weights", + "/RWKV-v4neo/src/model_run.py": "RWKV Model Optimization and Processing", + "/RWKV-v4neo/src/model_run.py:1-33": "Initializing RWKV Language Model in PyTorch", + "/RWKV-v4neo/src/model_run.py:116-140": "Feed-Forward Network with Learned Weights", + "/RWKV-v4neo/src/model_run.py:142-160": "Weighted Average Calculation for RWKV-v4neo", + "/RWKV-v4neo/src/model_run.py:161-200": "Matrix Multiplication with Maximum Operations", + "/RWKV-v4neo/src/model_run.py:201-226": "Input Data Processing in Deep Learning Model", + "/RWKV-v4neo/src/model_run.py:228-237": "Rescaling Layer Normalization", + "/RWKV-v4neo/src/model_run.py:35-61": "RWKV_RNN Model Initialization", + "/RWKV-v4neo/src/model_run.py:62-85": "Model Parameter Manager", 
+ "/RWKV-v4neo/src/model_run.py:86-114": "Organizing Weights in RWKV-v4neo Model", + "/RWKV-v4neo/src/trainer.py": "PyTorch Lightning S3 Savings", + "/RWKV-v4neo/src/trainer.py:1-30": "Distributed S3 Training with Deepspeed", + "/RWKV-v4neo/src/trainer.py:121-142": "Trainer Performance Metrics Calculation", + "/RWKV-v4neo/src/trainer.py:143-164": "Trainer: Model Checkpoint and Metrics Logging", + "/RWKV-v4neo/src/trainer.py:165-182": "Epoch End Handler for MyDataset", + "/RWKV-v4neo/src/trainer.py:183-208": "Model Saving and Epoch Tracking", + "/RWKV-v4neo/src/trainer.py:209-233": "Load, Check, Reshape, and Interpolate Dictionary Data", + "/RWKV-v4neo/src/trainer.py:234-243": "Model Training Checkpoint Saving and Progression", + "/RWKV-v4neo/src/trainer.py:31-51": "LR Scheduler and GPU Cache Cleaner", + "/RWKV-v4neo/src/trainer.py:53-72": "Dynamic Learning Rate Adjustment based on Exit Tokens", + "/RWKV-v4neo/src/trainer.py:73-97": "Dynamic Learning Rate Update", + "/RWKV-v4neo/src/trainer.py:98-120": "Exception-Handling Flush: Train Tracer Init", + "/RWKV-v4neo/src/utils.py": "Prime Number Verification Utility", + "/RWKV-v4neo/src/utils.py:1-30": "Tokenizing Class with Time Recording", + "/RWKV-v4neo/src/utils.py:124-130": "Prime Verification Code", + "/RWKV-v4neo/src/utils.py:32-60": "Text Generation Model Utilities", + "/RWKV-v4neo/src/utils.py:61-85": "Prime Number Calculation: Fermat & Miller-Rabin", + "/RWKV-v4neo/src/utils.py:86-122": "Prime Testing Algorithms: Fermat and Miller-Rabin", + "/RWKV-v4neo/train.py": "Train RWKV Model with PyTorch Lightning", + "/RWKV-v4neo/train.py:1-23": "Train RWKV Language Model with PyTorch Lightning", + "/RWKV-v4neo/train.py:112-129": "Defining CLI Arguments for Model Training", + "/RWKV-v4neo/train.py:130-153": "Setting Up Training Environment", + "/RWKV-v4neo/train.py:154-178": "Customizing RWKV-v4neo Model Training Arguments", + "/RWKV-v4neo/train.py:179-208": "Directory Creation and Value Adjustments", + 
"/RWKV-v4neo/train.py:209-233": "Epoch Count and Model Selection", + "/RWKV-v4neo/train.py:234-258": "RWKV Model Loading and Configuration", + "/RWKV-v4neo/train.py:24-39": "Training RWKV on Enwik8 and Fine-tuning V4Neo", + "/RWKV-v4neo/train.py:259-280": "Variables and Learning Rate Details", + "/RWKV-v4neo/train.py:282-308": "Precision and CUDNN Adjustments", + "/RWKV-v4neo/train.py:310-335": "Initializing and Loading RWKV Model", + "/RWKV-v4neo/train.py:336-358": "Load and Verify Model Checkpoint", + "/RWKV-v4neo/train.py:359-378": "Trainer Object with DeepSpeed Strategy", + "/RWKV-v4neo/train.py:378-383": "Configure Data Loader and Fit Model", + "/RWKV-v4neo/train.py:40-54": "RWKV-v4neo Fine-tuning with 8 GPUs and BF16", + "/RWKV-v4neo/train.py:56-69": "Command Line Arguments Control RWKV Training", + "/RWKV-v4neo/train.py:69-83": "RWKV-v4neo Training Parameters", + "/RWKV-v4neo/train.py:84-96": "Optimizing Hyperparameters with Argparse in RWKV-v4neo/train.py", + "/RWKV-v4neo/train.py:97-111": "Command Line Arguments in RWKV-v4neo Training", + "/RWKV-v4neo/verify.py": "RWKV Model Training and Output Printing", + "/RWKV-v4neo/verify.py:1-27": "RWKV Language Model Verification and Setup", + "/RWKV-v4neo/verify.py:28-66": "Initialize and Load RWKV Model", + "/RWKV-v4neo/verify.py:67-91": "Verify and Set Float Mode", + "/RWKV-v4neo/verify.py:92-104": "RWKV-RNN Output Printing in verify.py", + "/RWKV-v5/cuda/wkv5_cuda.cu": "Efficient CUDA ConvNet Forward Pass", + "/RWKV-v5/cuda/wkv5_op.cpp": "RWKV-v5 Neural Network Operations with CUDA & BFloat16", + "/RWKV-v5/demo.jsonl": "AI Applications and Legal Battles", + "/RWKV-v5/demo.jsonl:1-1": "Sydney Stabbing Spree: One Dead, Two Charged", + "/RWKV-v5/demo.jsonl:10-10": "Iran's Oil Outlook Under Nuclear Deal", + "/RWKV-v5/demo.jsonl:10-11": "Saudi Doubts on Tehran Deal Challenges Iran's OPEC Output", + "/RWKV-v5/demo.jsonl:100-100": "Tobacco and Fast Food Near Schools: A Concerning Trend", + 
"/RWKV-v5/demo.jsonl:100-101": "Divided JSON Entry Examples", + "/RWKV-v5/demo.jsonl:101-101": "Language Identification: Swedish Sentence", + "/RWKV-v5/demo.jsonl:101-103": "Multilingual Sentiment Analysis Assistant", + "/RWKV-v5/demo.jsonl:103-103": "Marked for Vengeance", + "/RWKV-v5/demo.jsonl:103-104": "Cost-Benefit Analysis for Global Issues", + "/RWKV-v5/demo.jsonl:104-104": "Incorrect Copenhagen Consensus Cadence", + "/RWKV-v5/demo.jsonl:105-106": "AI Analyzing User's Drawing Hypothesis", + "/RWKV-v5/demo.jsonl:106-106": "Contradicting Hypothesis", + "/RWKV-v5/demo.jsonl:106-107": "Abuse, Batteries, and Deception", + "/RWKV-v5/demo.jsonl:107-108": "Single-Cell Battery Concept Explained", + "/RWKV-v5/demo.jsonl:108-110": "AI-Generated Article Titles and Restaurant Descriptions", + "/RWKV-v5/demo.jsonl:11-11": "Aidan Somers: Irish Hurler and Accomplishments", + "/RWKV-v5/demo.jsonl:11-12": "Manure Classification Regulations", + "/RWKV-v5/demo.jsonl:110-111": "Common Sense Completion Test", + "/RWKV-v5/demo.jsonl:111-111": "Walk and Errands: Positive Mood", + "/RWKV-v5/demo.jsonl:111-112": "Happiness Detection Code", + "/RWKV-v5/demo.jsonl:112-113": "Galician to Portuguese Translation Task", + "/RWKV-v5/demo.jsonl:113-113": "Effortless Oven Door Cleaning", + "/RWKV-v5/demo.jsonl:113-114": "Double-Pane Window Cleaning Efficiency", + "/RWKV-v5/demo.jsonl:114-114": "Escape and Triumph", + "/RWKV-v5/demo.jsonl:114-115": "Sentiment Analysis of Customer Reviews", + "/RWKV-v5/demo.jsonl:115-116": "Summarizing Negative Reviews", + "/RWKV-v5/demo.jsonl:116-116": "Restructuring HR at Enron: Reporting Directly to Chairman", + "/RWKV-v5/demo.jsonl:116-117": "HR Team Roles Update", + "/RWKV-v5/demo.jsonl:117-118": "Capitalization Algorithm: Sentence Identification and Correction", + "/RWKV-v5/demo.jsonl:118-118": "Peeing Differences: Girls vs Women", + "/RWKV-v5/demo.jsonl:118-119": "Female Urination Devices: Overcoming Challenges", + "/RWKV-v5/demo.jsonl:119-120": 
"Self-Defense Response to Jesse's Attack", + "/RWKV-v5/demo.jsonl:12-12": "Approved Dried Manure and Poultry Droppings", + "/RWKV-v5/demo.jsonl:12-13": "Translator: RWKV-v5", + "/RWKV-v5/demo.jsonl:120-121": "Subdue Jesse for Self-Defense", + "/RWKV-v5/demo.jsonl:121-121": "Hector's Arrival on Saturn 3", + "/RWKV-v5/demo.jsonl:121-122": "Saturn 3: Hector's Rampage", + "/RWKV-v5/demo.jsonl:122-123": "Faster Water, More Energy", + "/RWKV-v5/demo.jsonl:123-124": "Clothespins or Paperclips for Chip Bags", + "/RWKV-v5/demo.jsonl:124-124": "Sealing Chip Bags: Accessible vs Traditional Methods", + "/RWKV-v5/demo.jsonl:124-125": "Chips Closure: Needle vs. Clothespins", + "/RWKV-v5/demo.jsonl:125-127": "Turkish Alphabet and Cross-Border Police Cooperation", + "/RWKV-v5/demo.jsonl:127-127": "Scrutinizing Hodgson's Choice: Sterling's Fatigue and Performance", + "/RWKV-v5/demo.jsonl:127-128": "Sterling's Schedule: A Key Factor in Performance", + "/RWKV-v5/demo.jsonl:128-128": "Deceitful Filmmaker Fakes Signature, Divorces Wife", + "/RWKV-v5/demo.jsonl:128-129": "JSON Data Storage or Processing", + "/RWKV-v5/demo.jsonl:129-130": "Sentiment Analysis: Positive Review Detected", + "/RWKV-v5/demo.jsonl:13-14": "Multilingual Text Translation and Odd Statement Generation", + "/RWKV-v5/demo.jsonl:130-132": "Multi-lingual Text File with Toddler Movement Quiz", + "/RWKV-v5/demo.jsonl:132-132": "Accurate Sentiment Analysis of Tweets", + "/RWKV-v5/demo.jsonl:132-133": "Hypothesis Verification through Text Analysis", + "/RWKV-v5/demo.jsonl:133-133": "Vitali-Carathéodory Approximation Theorem", + "/RWKV-v5/demo.jsonl:133-134": "Vitali-Carathéodory Theorem: Integrable Function Approximation", + "/RWKV-v5/demo.jsonl:134-135": "RWKV-v5 JSON Review Task", + "/RWKV-v5/demo.jsonl:135-135": "Navigating Betrayal's Aftermath: Mary's Journey", + "/RWKV-v5/demo.jsonl:135-136": "Film and Playground Connection", + "/RWKV-v5/demo.jsonl:136-136": "Movie Database Code Analysis", + 
"/RWKV-v5/demo.jsonl:136-137": "Electric Dreams: San Francisco Architect", + "/RWKV-v5/demo.jsonl:137-137": "Google-Fitbit Acquisition Talks Surface", + "/RWKV-v5/demo.jsonl:137-138": "Google Acquisition JSON Snippet", + "/RWKV-v5/demo.jsonl:138-138": "AI Combats EU Human Trafficking and Prostitution", + "/RWKV-v5/demo.jsonl:138-139": "Cavemen's Disappointing Debut", + "/RWKV-v5/demo.jsonl:139-141": "Negative Review and Translation Task", + "/RWKV-v5/demo.jsonl:141-141": "Context-Based AI Question Answering", + "/RWKV-v5/demo.jsonl:141-142": "Analyzing User Queries for Statute Formation", + "/RWKV-v5/demo.jsonl:142-143": "How are Laws Made?", + "/RWKV-v5/demo.jsonl:143-144": "Multiple-Choice Painful Decision", + "/RWKV-v5/demo.jsonl:144-144": "Negative Dining Experience", + "/RWKV-v5/demo.jsonl:144-145": "Expensive Disappointment", + "/RWKV-v5/demo.jsonl:145-147": "Turkish Translation Engine with Abbreviation Detection", + "/RWKV-v5/demo.jsonl:147-147": "Step-by-Step Linear Equation Solution", + "/RWKV-v5/demo.jsonl:148-148": "Snoring and Tim's Struggle", + "/RWKV-v5/demo.jsonl:148-150": "JSON Objects and Semantic Analysis", + "/RWKV-v5/demo.jsonl:15-16": "AI Explained like to a Five-Year-Old", + "/RWKV-v5/demo.jsonl:150-150": "Tumors Replaced with Cavities and Scar Tissue", + "/RWKV-v5/demo.jsonl:150-151": "Empty Cavities Instead of Tumors", + "/RWKV-v5/demo.jsonl:151-152": "Non-Consecutive President", + "/RWKV-v5/demo.jsonl:152-153": "AI Interaction Scenario: Eddie & Milo", + "/RWKV-v5/demo.jsonl:153-154": "Geotourism Protection and Jobless Woes", + "/RWKV-v5/demo.jsonl:154-154": "UK Prime Minister Faces A4e Fraud Probe Pressure", + "/RWKV-v5/demo.jsonl:154-155": "Upscale Japanese Café, Rated Highly but Not Family-Friendly", + "/RWKV-v5/demo.jsonl:156-156": "Madoff Auction Raises $500K for Victims", + "/RWKV-v5/demo.jsonl:156-157": "Madoff's Seized Possessions: A Ponzi Scheme's Downfall", + "/RWKV-v5/demo.jsonl:157-158": "Word Boundary Detection", + 
"/RWKV-v5/demo.jsonl:158-159": "Word Boundary Correction", + "/RWKV-v5/demo.jsonl:159-159": "Sorting Dictionaries by First, Then Second", + "/RWKV-v5/demo.jsonl:159-160": "Family-Friendly Xname Coffee Shop Near City Centre", + "/RWKV-v5/demo.jsonl:16-17": "AI on Kazuma's Quest: The Band and Dixie's Tale", + "/RWKV-v5/demo.jsonl:160-161": "Extract and Categorize Information", + "/RWKV-v5/demo.jsonl:161-161": "St. Patrick's Rugby Fest in Dublin", + "/RWKV-v5/demo.jsonl:162-162": "Choose the Right Lift", + "/RWKV-v5/demo.jsonl:162-163": "Mixed Reviews for Nail Spa", + "/RWKV-v5/demo.jsonl:163-163": "Unsatisfactory Beauty Service", + "/RWKV-v5/demo.jsonl:163-165": "Multilingual Sentiment and Translation Tasks", + "/RWKV-v5/demo.jsonl:165-166": "Summary and Choose-Your-Own Ending", + "/RWKV-v5/demo.jsonl:166-167": "Clown Circus Scene Analysis", + "/RWKV-v5/demo.jsonl:167-168": "From Juvenile to Delightful", + "/RWKV-v5/demo.jsonl:168-169": "Exploring Hall's Impact on McCarthy", + "/RWKV-v5/demo.jsonl:169-170": "Joint Efforts in Air Defense Suppression", + "/RWKV-v5/demo.jsonl:17-17": "Manga to Anime Journey: History and Legacy", + "/RWKV-v5/demo.jsonl:17-18": "Rythm's 'Utatane' and 'Mugen Factory': A Journey in Japanese Music", + "/RWKV-v5/demo.jsonl:170-170": "Japanese-Indonesian Translation Checker", + "/RWKV-v5/demo.jsonl:170-171": "Multilingual Sentence Comparison and Translation", + "/RWKV-v5/demo.jsonl:171-172": "Sentence Translation Algorithm", + "/RWKV-v5/demo.jsonl:172-172": "Symposium as Equal Justice Debate Platform", + "/RWKV-v5/demo.jsonl:172-173": "Translation Verification with RWKV-v5", + "/RWKV-v5/demo.jsonl:173-174": "Incentive Program Ineffectiveness", + "/RWKV-v5/demo.jsonl:174-174": "Early HIV Treatment: Progress and Challenges", + "/RWKV-v5/demo.jsonl:174-175": "HIV Treatment Life Expectancy Study", + "/RWKV-v5/demo.jsonl:175-177": "AI Evaluates Product Reviews' Flattery Levels", + "/RWKV-v5/demo.jsonl:177-177": "Saints Stun Falcons with 2004 
Blocked Punt", + "/RWKV-v5/demo.jsonl:177-178": "Saints Triumph over Falcons in Football Game", + "/RWKV-v5/demo.jsonl:178-178": "Undercover Battles in New World" +} \ No newline at end of file diff --git a/docs/data/titles/2.json b/docs/data/titles/2.json new file mode 100644 index 00000000..4c386679 --- /dev/null +++ b/docs/data/titles/2.json @@ -0,0 +1,177 @@ +{ + "/RWKV-v5/demo.jsonl:178-180": "Language Processing AI: Bridging Communication Gaps", + "/RWKV-v5/demo.jsonl:18-18": "Treaty of Paris and Gerard Brantsen", + "/RWKV-v5/demo.jsonl:18-19": "Treaty of Paris (1783)", + "/RWKV-v5/demo.jsonl:180-180": "Labeling Conversation Formality", + "/RWKV-v5/demo.jsonl:180-181": "Formal Conversation Labeling", + "/RWKV-v5/demo.jsonl:181-181": "Balancing Drug Abuse Penalties and Patient Access", + "/RWKV-v5/demo.jsonl:181-182": "JSON Pair: Legalizing Marijuana, Conference Talk Scheduling", + "/RWKV-v5/demo.jsonl:2-2": "Stratification's Impact on Nutrient Mixing", + "/RWKV-v5/demo.jsonl:2-3": "Lakes and Nutrient Levels: High vs. 
Stratified", + "/RWKV-v5/demo.jsonl:20-21": "Interactive User-Assistant Q&A", + "/RWKV-v5/demo.jsonl:21-22": "Product Review Sentiment Analysis", + "/RWKV-v5/demo.jsonl:22-24": "AI Assistant Review Analyzer", + "/RWKV-v5/demo.jsonl:24-24": "Lonely Girl in the Park", + "/RWKV-v5/demo.jsonl:24-25": "Reasons for Inaction", + "/RWKV-v5/demo.jsonl:25-26": "Buffalo Springfield Box Set Details", + "/RWKV-v5/demo.jsonl:26-26": "George's Time-Hopping Adventure", + "/RWKV-v5/demo.jsonl:26-27": "George's Lava Escape", + "/RWKV-v5/demo.jsonl:27-27": "Loss and Blame: Victoria's Grief", + "/RWKV-v5/demo.jsonl:27-29": "Two Questions, Two Answers", + "/RWKV-v5/demo.jsonl:29-30": "HDTV's European Debut: Vienna Concert 2004", + "/RWKV-v5/demo.jsonl:3-3": "Master Xbox Screenshots with Game DVR", + "/RWKV-v5/demo.jsonl:30-30": "Harold's Horse Racing Victory", + "/RWKV-v5/demo.jsonl:30-31": "Heist and Hierarchy", + "/RWKV-v5/demo.jsonl:31-31": "Metropolitan Archbishop Roles and Responsibilities", + "/RWKV-v5/demo.jsonl:31-32": "AI Learns Ecclesiastical Jurisdiction and 'MGS V' Open-World", + "/RWKV-v5/demo.jsonl:32-33": "Eddy Merckx: Cycling's Unmatched Legend", + "/RWKV-v5/demo.jsonl:33-33": "Career Progression of Eddy Merckx", + "/RWKV-v5/demo.jsonl:33-34": "Eddy Merckx: Cycling Champion and Rainbow Jersey Holder", + "/RWKV-v5/demo.jsonl:34-35": "Frequency of \"a\" in Sentences", + "/RWKV-v5/demo.jsonl:35-35": "Starvation in a Town: Not Cardiff", + "/RWKV-v5/demo.jsonl:36-37": "Negative Review Verdict", + "/RWKV-v5/demo.jsonl:37-38": "Sentiment Analysis for Movie Reviews", + "/RWKV-v5/demo.jsonl:38-38": "Begin Again: A Mixed Review", + "/RWKV-v5/demo.jsonl:38-39": "Healing Melodies, Predictable Beats", + "/RWKV-v5/demo.jsonl:39-39": "Mouse Family's American Journey", + "/RWKV-v5/demo.jsonl:39-40": "Scotland's Revolutionary Moment", + "/RWKV-v5/demo.jsonl:4-5": "Capitalization Correction AI Demo", + "/RWKV-v5/demo.jsonl:40-40": "Scotland's Stunning Views and Funny Fanatics", + 
"/RWKV-v5/demo.jsonl:40-42": "Amphibian Quiz Hints: Chemicals", + "/RWKV-v5/demo.jsonl:42-42": "Mystery of the Zeko Dagger", + "/RWKV-v5/demo.jsonl:42-44": "Incomplete JSON Scenarios", + "/RWKV-v5/demo.jsonl:44-45": "NLP-Based Response Generator", + "/RWKV-v5/demo.jsonl:45-45": "The Breeders: A Turbulent History and Comeback", + "/RWKV-v5/demo.jsonl:46-46": "Removing Beetle Headlights", + "/RWKV-v5/demo.jsonl:46-47": "Removing Beetle Headlights: Step-by-Step Guide", + "/RWKV-v5/demo.jsonl:47-47": "Determining Tweet Sentiment", + "/RWKV-v5/demo.jsonl:47-48": "Restaurant JSON Data Snippet", + "/RWKV-v5/demo.jsonl:48-49": "Establishment Info Parser", + "/RWKV-v5/demo.jsonl:49-50": "Cloud Computing Project Topics: Research Abroad vs Domestic", + "/RWKV-v5/demo.jsonl:5-5": "Text Parsing Code: Breaks Down Company Details with English Formatting", + "/RWKV-v5/demo.jsonl:5-7": "Sentiment Analysis for Text Clarity", + "/RWKV-v5/demo.jsonl:50-51": "Saghar's 2,000+ Song Compositions", + "/RWKV-v5/demo.jsonl:51-51": "Paragraph Categorization: Artist Detection", + "/RWKV-v5/demo.jsonl:51-52": "Johnny's Sunday Lunch Activity", + "/RWKV-v5/demo.jsonl:52-52": "Johnny's Busy Weekend Activities", + "/RWKV-v5/demo.jsonl:52-53": "Title: \"Code as a Reflection\"", + "/RWKV-v5/demo.jsonl:53-53": "Christie's Rising Role in Trump's Organization", + "/RWKV-v5/demo.jsonl:53-54": "Trump's Campaign Prep and Melody Barnes' Past Roles", + "/RWKV-v5/demo.jsonl:54-55": "AI Handles Office Holder Questions", + "/RWKV-v5/demo.jsonl:55-55": "Billion-Dollar Bracket Odds: One in 9.2 Quintillion", + "/RWKV-v5/demo.jsonl:55-56": "Perfect Brackets: Billion-Dollar Challenge", + "/RWKV-v5/demo.jsonl:56-56": "Important Dates and Policies Shared", + "/RWKV-v5/demo.jsonl:56-57": "School Uniform Debate Analysis", + "/RWKV-v5/demo.jsonl:57-57": "Positive Review: G-Shock's Endurance", + "/RWKV-v5/demo.jsonl:58-58": "Private Security Patrol: Deterrent Against Johannesburg Crime", + "/RWKV-v5/demo.jsonl:58-59": 
"Security Firms Ownership in SA: A Borat-Inspired Proposal", + "/RWKV-v5/demo.jsonl:59-61": "Language Translation and \"Borat\" Fun", + "/RWKV-v5/demo.jsonl:61-62": "Task: Generate Entailing Sentences in Fiction Genre", + "/RWKV-v5/demo.jsonl:62-63": "Understanding FMLA Eligibility", + "/RWKV-v5/demo.jsonl:63-64": "FMLA Eligibility Criteria System", + "/RWKV-v5/demo.jsonl:64-65": "Joining a Sorority: Understanding Greek System and NPC", + "/RWKV-v5/demo.jsonl:65-67": "Sorority 101: Joining Process and Movie Facts", + "/RWKV-v5/demo.jsonl:67-69": "Geo-AI Question Bot", + "/RWKV-v5/demo.jsonl:69-69": "Mastering Ollie for 180 Skateboard Trick", + "/RWKV-v5/demo.jsonl:69-70": "Mastering the 180 Skateboard Trick", + "/RWKV-v5/demo.jsonl:7-7": "The Tenors' Journey: A Snapshot", + "/RWKV-v5/demo.jsonl:7-8": "Artist Group Evolution", + "/RWKV-v5/demo.jsonl:70-71": "Story Completion AI", + "/RWKV-v5/demo.jsonl:72-72": "AI Analyzes Giraffe Claim in St. Estevam Text", + "/RWKV-v5/demo.jsonl:72-73": "Restaurant Analysis Engine", + "/RWKV-v5/demo.jsonl:73-75": "Various Tasks and Thoughts", + "/RWKV-v5/demo.jsonl:75-75": "Sentence Analysis and Key Statements Alignment", + "/RWKV-v5/demo.jsonl:75-76": "Text-Based Verification Algorithm", + "/RWKV-v5/demo.jsonl:76-77": "Generate Sentence with Given Words", + "/RWKV-v5/demo.jsonl:77-77": "Oscars and Tom Cruise's Nominations", + "/RWKV-v5/demo.jsonl:77-79": "Entity Relationship Quiz: Mark Johnson", + "/RWKV-v5/demo.jsonl:79-79": "Customize Your Curry Soup", + "/RWKV-v5/demo.jsonl:79-81": "AI-Assisted Curry Guide", + "/RWKV-v5/demo.jsonl:8-8": "Content ID: Detecting Copyright Infringement in Videos", + "/RWKV-v5/demo.jsonl:81-82": "AnswerBot", + "/RWKV-v5/demo.jsonl:82-83": "Determine Entailment of Facts", + "/RWKV-v5/demo.jsonl:83-84": "Create a Restaurant Description", + "/RWKV-v5/demo.jsonl:84-85": "Merger of Betrusted and TruSecure Forms CyberTrust", + "/RWKV-v5/demo.jsonl:85-85": "Merger Forms CyberTrust: Betrusted & TruSecure 
Unite", + "/RWKV-v5/demo.jsonl:85-86": "Betrusted and TruSecure Merge to Form CyberTrust", + "/RWKV-v5/demo.jsonl:86-86": "Key Elements of Life Liquid Show Interview", + "/RWKV-v5/demo.jsonl:86-88": "AI Chatbot for Resistors and Character Identification", + "/RWKV-v5/demo.jsonl:88-89": "Cleverness Predicts Success", + "/RWKV-v5/demo.jsonl:89-90": "Pop Artist Tom UNK: Reclining Nudes and Tragic Loss", + "/RWKV-v5/demo.jsonl:9-9": "AI Analyzes Negative Book Review", + "/RWKV-v5/demo.jsonl:90-90": "Haltering a Horse: Step-by-Step Guide", + "/RWKV-v5/demo.jsonl:90-91": "Approaching Horses and End of First Crusade", + "/RWKV-v5/demo.jsonl:91-92": "Crusade's Impact and Linear Equation", + "/RWKV-v5/demo.jsonl:92-93": "Solving Linear Equations: -6c = 24", + "/RWKV-v5/demo.jsonl:93-94": "Algorithmic Premise Analysis", + "/RWKV-v5/demo.jsonl:94-94": "Boko Haram's Deadly Rampage in Baga", + "/RWKV-v5/demo.jsonl:94-95": "AI Trains on Boko Haram's Baga Attacks", + "/RWKV-v5/demo.jsonl:95-97": "Odd Multiplier", + "/RWKV-v5/demo.jsonl:97-98": "The Pickwick Papers", + "/RWKV-v5/demo.jsonl:98-98": "Multi-meaning of \"Feel\"", + "/RWKV-v5/demo.jsonl:99-100": "Judson's Invention and Task", + "/RWKV-v5/make_data.py": "Binary File Processing and Indexing with RWKV v5", + "/RWKV-v5/make_data.py:1-34": "RWKV Tokenization Tool", + "/RWKV-v5/make_data.py:100-133": "Shuffling Lines and Indexing Dataset", + "/RWKV-v5/make_data.py:134-158": "Exception-Handling DIX Decoder", + "/RWKV-v5/make_data.py:35-63": "RWKV-v5 Data Builder and Indexer", + "/RWKV-v5/make_data.py:64-99": "Prime Checker and Data Generator", + "/RWKV-v5/src/binidx.py": "Memory-Mapped Indexed Datasets: RWKV-v5 Binidx.py", + "/RWKV-v5/src/dataset.py": "RWKV Dataset: Model Training and Prediction", + "/RWKV-v5/src/dataset.py:1-25": "RWKV Dataset Class", + "/RWKV-v5/src/dataset.py:109-134": "Random Data Sampling in Dataset", + "/RWKV-v5/src/dataset.py:135-154": "Random Data Selection Algorithm", + 
"/RWKV-v5/src/dataset.py:155-177": "Slicing Data from Lists", + "/RWKV-v5/src/dataset.py:178-200": "Random Context Initialization and Masking", + "/RWKV-v5/src/dataset.py:26-42": "Data Processing with Version-Specific Chunks", + "/RWKV-v5/src/dataset.py:43-60": "Defining Dataset Class with Flexible Data Sizes", + "/RWKV-v5/src/dataset.py:61-80": "Building Token Arrays from Files", + "/RWKV-v5/src/dataset.py:81-107": "Vocabulary Initialization and JSON Conversion", + "/RWKV-v5/src/model.py": "DeepSpeed Transformer Optimization", + "/RWKV-v5/src/model.py:1-31": "PyTorch JIT Setup for RWKV-v5 Model", + "/RWKV-v5/src/model.py:120-138": "Time-Based Attention in RWKV-v5 Transformer Layer", + "/RWKV-v5/src/model.py:140-169": "Linear Layers and GroupNorm in RWKV-v5", + "/RWKV-v5/src/model.py:171-198": "Forward Pass Definition", + "/RWKV-v5/src/model.py:199-227": "MishGLU Layer Initialization", + "/RWKV-v5/src/model.py:228-256": "RWKV Model Architecture\nwith Time-Shifting, Mixing, & Mish", + "/RWKV-v5/src/model.py:257-276": "Dynamic Model Component Initialization", + "/RWKV-v5/src/model.py:278-305": "Implementing RWKV-v5 with Dropout and Attention", + "/RWKV-v5/src/model.py:306-338": "RWKV Model: L2Wrap Loss Computation", + "/RWKV-v5/src/model.py:32-55": "WKV-5 CUDA Kernel Integration", + "/RWKV-v5/src/model.py:339-366": "Dynamic Model Configurations in RWKV-v5", + "/RWKV-v5/src/model.py:367-394": "Dynamic Learning Rates Assignment", + "/RWKV-v5/src/model.py:395-412": "Dynamic Optimizer Groups", + "/RWKV-v5/src/model.py:413-425": "Optimizer Setup for Model", + "/RWKV-v5/src/model.py:425-449": "Initialize Adam Optimizer and Forward Function", + "/RWKV-v5/src/model.py:450-478": "Multi-Head Attention in Transformer Models", + "/RWKV-v5/src/model.py:480-503": "Training Step: Model Loss and Logits Calculation", + "/RWKV-v5/src/model.py:504-530": "Batch Loss Calculation with GPU Tracking", + "/RWKV-v5/src/model.py:531-559": "Model Weight Initialization", + 
"/RWKV-v5/src/model.py:56-82": "Forward-Backward Pass Model Implementation", + "/RWKV-v5/src/model.py:560-586": "Scaled Weight Initialization in RWKV Model", + "/RWKV-v5/src/model.py:587-594": "bfloat16 Conversion for Memory Efficiency", + "/RWKV-v5/src/model.py:83-95": "CUDA-Powered WKV-5 Model Initialization", + "/RWKV-v5/src/model.py:97-119": "RWKV_TimeMix_RWKV5 Class Definition", + "/RWKV-v5/src/trainer.py": "S3 Saving, Learning Rate Scheduling & Model Loading", + "/RWKV-v5/src/utils.py": "Prime Number Verification Tool", + "/RWKV-v5/tokenizer/rwkv_tokenizer.py": "RWKV Trie Tokenizer", + "/RWKV-v5/tokenizer/rwkv_tokenizer.py:1-32": "RWKV Trie Tokenizer", + "/RWKV-v5/tokenizer/rwkv_tokenizer.py:33-66": "RWKV TRIE Tokenizer: Efficient Indexing", + "/RWKV-v5/tokenizer/rwkv_tokenizer.py:68-103": "Trie-Based Tokenizer", + "/RWKV-v5/train.py": "Training RWKV Language Model with Pytorch Lightning", + "/RWKV-v5/train.py:1-23": "RWKV Language Model Training Setup", + "/RWKV-v5/train.py:104-126": "Training Script Configuration", + "/RWKV-v5/train.py:127-154": "Epoch Count Adjustment", + "/RWKV-v5/train.py:155-182": "Max-Value Model Loading", + "/RWKV-v5/train.py:183-200": "RWKV-5 Training Details", + "/RWKV-v5/train.py:201-226": "RWKV-v5 Training Setup", + "/RWKV-v5/train.py:227-256": "Model Training Environment Setup", + "/RWKV-v5/train.py:24-36": "RWKV-v5 Argument Parsing", + "/RWKV-v5/train.py:257-280": "Checkpoint Model Loader", + "/RWKV-v5/train.py:281-299": "Trainer Object Initialization for PyTorch Lightning", + "/RWKV-v5/train.py:300-309": "Train RWKV Model with Deepspeed", + "/RWKV-v5/train.py:37-49": "Training Parameters: RWKV-LM's \"train.py\"", + "/RWKV-v5/train.py:50-63": "Optimize ML Model Arguments with Argparse", + "/RWKV-v5/train.py:64-80": "Command Line Args for RWKV-v5 Training", + "/RWKV-v5/train.py:81-103": "Training RWKV-v5 with Arguments" +} \ No newline at end of file diff --git a/docs/doc/03d1c3ed-92d1-438e-acd3-75d35d7ee872.json 
b/docs/doc/03d1c3ed-92d1-438e-acd3-75d35d7ee872.json new file mode 100644 index 00000000..d62888be --- /dev/null +++ b/docs/doc/03d1c3ed-92d1-438e-acd3-75d35d7ee872.json @@ -0,0 +1,60 @@ +{ + "summary": "This code initializes a transformer model for RWKV language implementation with time-shifted inputs, utilizing convolutions and attention mechanisms. It performs layer normalization, self-attention, feed-forward operations on input 'x' using layers from the 'w' object and returns results after applying block-specific weights and calculations for context-aware generation in RWKV-v3 model.", + "details": [ + { + "comment": "The code snippet is part of the RWKV language model implementation. It defines constants and a class for channel mixing operations within the model. The `RWKV_CFG` namespace holds various configuration values, and the `RWKV_ChannelMix` class represents a module with time-based channel mixing functionality using time shift, key, and query mixing parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":0-29", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport types\nimport copy\nimport torch\nimport math\nfrom torch.nn import functional as F\nimport torch.nn as nn\nRWKV_K_CLAMP = 60\nRWKV_K_EPS = 1e-8\nRWKV_HEAD_QK_DIM = 256\nprint(f'\\nRWKV_K_CLAMP {RWKV_K_CLAMP} RWKV_K_EPS {RWKV_K_EPS} RWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\\n')\nDEBUG_TIME = False # True False - show trained time-coeffs\n############################################################################################################\nRWKV_CFG = types.SimpleNamespace()\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = 
nn.ZeroPad2d((0,0,1,-1))\n self.time_mix_k = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))\n self.time_mix_r = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))" + }, + { + "comment": "RWKV-v3 model's forward function computes key-value pairs based on input tensor, using linear layers and element-wise operations. RWKV_TimeMix class initializes parameters for time-related operations in the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":31-56", + "content": " hidden_sz = 4 * RWKV_CFG.n_embd\n self.key = nn.Linear(RWKV_CFG.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv\n return rkv\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_decay = nn.Parameter(torch.ones(RWKV_CFG.n_embd, 1))\n self.time_curve = torch.tensor([-(RWKV_CFG.ctx_len - 2 - i) for i in range(RWKV_CFG.ctx_len-1)]).unsqueeze(0)\n self.time_first = nn.Parameter(torch.ones(RWKV_CFG.n_embd, 1) * math.log(0.3))\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))" + }, + { + "comment": "This code initializes parameters for a transformer model and defines its forward pass. 
It uses time-shifted inputs, applies multiplication with learnable mixing factors, and feeds them into separate key, value, and receptance linear layers before clamping the keys, applying exponential function, and element-wise multiplying with values to obtain kv outputs.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":57-84", + "content": " self.time_mix_k = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.time_mix_v = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.time_mix_r = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.key = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.output = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk).transpose(-1, -2)\n v = self.value(xv).transpose(-1, -2)\n r = self.receptance(xr)\n k = torch.clamp(k, max=RWKV_K_CLAMP)\n k = torch.exp(k)\n kv = k * v\n sel" + }, + { + "comment": "Code snippet defines a `Block` class and its forward pass for a transformer model. The block consists of layer normalizations, an attention mechanism (`RWKV_TimeMix`), feed-forward network (`RWKV_ChannelMix`), and optional pre-feed-forward layer (`RWKV_ffnPre`) for the first block only. The time dimension is handled by `time_decay`, `time_curve`, and `time_first` variables, which are used to compute the weights for the convolutions. These weights are then applied to the input through 1D convolutions (`F.conv1d`) after padding the inputs with `nn.ZeroPad2d`. 
Finally, the output is passed through an activation function (`torch.sigmoid`) and a final layer normalization before being returned.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":84-114", + "content": "f.time_w = torch.cat([torch.exp(self.time_decay) * self.time_curve.to(self.time_decay.device), self.time_first], dim=-1)\n w = torch.exp(self.time_w)\n w = w[:,-T:].unsqueeze(1)\n wkv = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(kv), w, groups=C)\n wk = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w, groups=C) + RWKV_K_EPS\n rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\n rwkv = self.output(rwkv)\n return rwkv\nclass Block(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(RWKV_CFG.n_embd)\n self.ln2 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(layer_id+1000)\n else:\n self.att = RWKV_TimeMix(layer_id)\n self.ffn = RWKV_ChannelMix(layer_id)\n def forward(self, x):\n if self.layer_id == 0:" + }, + { + "comment": "The code snippet is a part of the RWKV-GPT class, which inherits from nn.Module in PyTorch. The class defines the architecture of the RWKV model, including embedding layer, layers with residual connections and normalization, and output layers. It takes inputs like MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, and ctx_len as parameters. 
The code block defines the initialization of the model components and applies layer normalization and linear transformations for the input and output layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":115-145", + "content": " x = self.ln0(x)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n return x\nclass RWKV_GPT(nn.Module):\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, ctx_len):\n global RWKV_CFG\n super().__init__()\n RWKV_CFG.RUN_DEVICE = RUN_DEVICE\n RWKV_CFG.model_type = model_type\n RWKV_CFG.vocab_size = vocab_size\n RWKV_CFG.n_layer = n_layer\n RWKV_CFG.n_embd = n_embd\n RWKV_CFG.ctx_len = ctx_len\n print('\\nloading RWKV-GPT', MODEL_NAME)\n self.emb = nn.Embedding(vocab_size, n_embd)\n self.blocks = nn.Sequential(*[Block(i) for i in range(n_layer)])\n self.ln_out = nn.LayerNorm(n_embd)\n self.head = nn.Linear(n_embd, vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)" + }, + { + "comment": "This code initializes a model for the RWKV-v3 language model. It sets the head_q scale init and head_k's scale init, registers a copy mask, assigns ctx_len, loads state from a model file, and defines a forward function that performs forward propagation on input idx. 
If RWKV_HEAD_QK_DIM is greater than 0, it computes the context vector c using attention between query q and key k, masks self-attention with copy_mask, and adds c to head output before returning x.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":146-176", + "content": " self.head_q.scale_init = 0\n self.head_k = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(ctx_len, ctx_len)))\n self.ctx_len = ctx_len\n self.eval()\n self.load_state_dict(torch.load(MODEL_NAME + '.pth'))\n self.eval()\n def forward(self, idx):\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).float()\n x = self.head(x) + c\n else:\n x = self.head(x) \n return x" + }, + { + "comment": "The code defines a class called `RWKV_RNN` and initializes its attributes with provided parameters. It loads the model weights from the specified file, performing transformations on certain keys if required. 
Debugging options are also available for time-related variables.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":178-206", + "content": "############################################################################################################\nclass RWKV_RNN():\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len):\n self.RUN_DEVICE = RUN_DEVICE\n self.model_type = model_type\n self.n_layer = n_layer\n self.n_embd = n_embd\n self.ctx_len = ctx_len\n self.w = types.SimpleNamespace()\n w = torch.load(MODEL_NAME + '.pth',\n map_location=torch.device(RUN_DEVICE))\n for x in w.keys():\n if '.time_' in x:\n w[x] = w[x].squeeze()\n if '.time_decay' in x:\n w[x] = torch.exp(-torch.exp(w[x]))\n if '.time_first' in x:\n w[x] = torch.exp(w[x])\n if DEBUG_TIME and '.time_' in x:\n print(x, w[x].squeeze().cpu().numpy())\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):\n if xx[i].isdigit():\n ii = int(xx[i])" + }, + { + "comment": "This code creates a hierarchical object structure using SimpleNamespace and dictionaries. It can be used to store and retrieve data in a nested manner. 
The clear method resets the stored data, while save and load methods allow copying the state of one instance to another.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":207-237", + "content": " if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):\n if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.clear()\n def clear(self):\n self.xx = {}\n self.aa = {}\n self.bb = {}\n self.hk = None\n def save(self, target):\n target.xx = copy.deepcopy(self.xx)\n target.aa = copy.deepcopy(self.aa)\n target.bb = copy.deepcopy(self.bb)\n target.hk = copy.deepcopy(self.hk)\n def load(self, target):\n self.xx = copy.deepcopy(target.xx)\n self.aa = copy.deepcopy(target.aa)\n self.bb = copy.deepcopy(target.bb)" + }, + { + "comment": "The code defines three functions: `hk`, `LN`, and `FF`. The `hk` function copies the target's hk attribute. The `LN` function performs layer normalization on the input `xx` with provided weights `w`. The `FF` function implements a feed-forward layer, where it mixes `xx` with previous `name` values and applies sigmoid and relu functions before multiplying by weights. 
It also initializes `sa`, `aa`, and `bb` variables for subsequent SA operation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":238-263", + "content": " self.hk = copy.deepcopy(target.hk)\n def LN(self, xx, w):\n return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)\n def FF(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)\n k = torch.square(torch.relu(w.key.weight @ xk))\n kv = w.value.weight @ k\n return r * kv\n def SA(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xv = xx * w.time_mix_v + self.xx[name] * (1 - w.time_mix_v)" + }, + { + "comment": "This code performs a sequence of operations on the input 'x' using layers from the 'w' object. It applies layer normalization (LN), self-attention (SA), and feed-forward network (FF) for each block in the model. 
The result is then returned after applying weights and calculations specific to each block and layer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":264-293", + "content": " xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)\n k = torch.exp(torch.clamp(w.key.weight @ xk, max=RWKV_K_CLAMP))\n v = w.value.weight @ xv\n kv = k * v\n a = self.aa[name] + w.time_first * kv\n b = self.bb[name] + w.time_first * k\n self.aa[name] = w.time_decay * self.aa[name] + kv\n self.bb[name] = w.time_decay * self.bb[name] + k\n rwkv = r * a / (b + RWKV_K_EPS)\n return w.output.weight @ rwkv\n def run(self, ctx):\n w = self.w\n x = w.emb.weight[ctx[-1]]\n for i in range(self.n_layer):\n if i == 0:\n x = self.LN(x, w.blocks[i].ln0)\n if i == 0 and self.model_type == 'RWKV-ffnPre':\n x = x + self.FF(self.LN(x, w.blocks[i].ln1), w.blocks[i].ffnPre, f'ffnPre.{i}')\n else:\n x = x + self.SA(self.LN(x, w.blocks[i].ln1), w.blocks[i].att, f'att.{i}')\n x = x + self.FF(self.LN(x, w.blocks[i].ln2), w.blocks[i].ffn, f'ffn.{i}')" + }, + { + "comment": "This code is part of the RWKV-v3 model and performs attention calculations for context-aware generation. It uses a linear layer (LN) to normalize input x with w.ln_out, calculates attention vectors q and x, and stores them in self.hk. If RWKV_HEAD_QK_DIM is greater than 0, it performs attention calculations; otherwise, it skips the process. 
The output x is converted to a list and returned.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model_run.py\":295-318", + "content": " x = self.LN(x, w.ln_out)\n if RWKV_HEAD_QK_DIM > 0:\n if self.hk == None:\n self.hk = (w.head_k.weight @ x).unsqueeze(0)\n else:\n self.hk = torch.cat(\n [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0)\n if self.hk.shape[0] > self.ctx_len:\n self.hk = self.hk[-self.ctx_len:, :]\n q = w.head_q.weight @ x\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n c = (self.hk @ q) / RWKV_HEAD_QK_DIM\n for i in range(len(c)):\n x[ctx[i]] += c[i]\n else:\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0455c725-09b6-4549-a677-aeb0a9a17a00.json b/docs/doc/0455c725-09b6-4549-a677-aeb0a9a17a00.json new file mode 100644 index 00000000..c3dfa206 --- /dev/null +++ b/docs/doc/0455c725-09b6-4549-a677-aeb0a9a17a00.json @@ -0,0 +1,30 @@ +{ + "summary": "The code initializes a language generation model class using layer normalization and attention, applying iterative layers with time decay to generate future sequence tokens. A context vector is calculated via matrix multiplications and added to the input list at corresponding indices.", + "details": [ + { + "comment": "The code initializes a RWKV_RNN class. It takes input parameters such as MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, and ctx_len. The class loads the weights from a .pth file located at MODEL_NAME+'.pth' using map_location set to RUN_DEVICE. It performs certain operations on the loaded weights for time-coefficients and assigns them to corresponding attributes within the class. 
If DEBUG_TIME is True, it prints the updated time coefficients.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model_run.py\":0-36", + "content": "import types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nRWKV_K_CLAMP = 60\nRWKV_K_EPS = 1e-16\nRWKV_HEAD_QK_DIM = 256\nDEBUG_TIME = False # True False - show trained time-coeffs\nclass RWKV_RNN():\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len):\n self.RUN_DEVICE = RUN_DEVICE\n self.model_type = model_type\n self.n_layer = n_layer\n self.n_embd = n_embd\n self.ctx_len = ctx_len\n self.w = types.SimpleNamespace()\n w = torch.load(MODEL_NAME + '.pth',\n map_location=torch.device(RUN_DEVICE))\n for x in w.keys():\n if '.time_' in x:\n w[x] = w[x].squeeze()\n if '.time_decay' in x:\n w[x] = torch.exp(-torch.exp(w[x]))\n if '.time_first' in x:\n w[x] = torch.exp(w[x])\n if DEBUG_TIME and '.time_' in x:\n print(x, w[x].squeeze().cpu().numpy())\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):" + }, + { + "comment": "This code represents a data structure that allows for nested dictionary-like storage with optional object types. It includes functions to clear, save, and load the data structure. The `clear` function resets all stored variables to empty states, while `save` and `load` allow for copying state between instances of this data structure. 
The code uses a combination of dictionaries and simple namespaces to handle various types of values within the storage.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model_run.py\":37-68", + "content": " if xx[i].isdigit():\n ii = int(xx[i])\n if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):\n if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.clear()\n def clear(self):\n self.xx = {}\n self.aa = {}\n self.bb = {}\n self.hk = None\n def save(self, target):\n target.xx = copy.deepcopy(self.xx)\n target.aa = copy.deepcopy(self.aa)\n target.bb = copy.deepcopy(self.bb)\n target.hk = copy.deepcopy(self.hk)\n def load(self, target):\n self.xx = copy.deepcopy(target.xx)\n self.aa = copy.deepcopy(target.aa)" + }, + { + "comment": "This code defines methods for a neural network model. It includes deep copying of variables, layer normalization, feed-forward operation, and scaled attention operation. The LN method performs layer normalization on input tensor xx using the weights w. The FF method applies a feed-forward operation to the input tensor xx using the weights w and stores intermediate results. 
The SA method applies the scaled attention operation to the input tensor xx using the weights w and stores intermediate results.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model_run.py\":69-97", + "content": " self.bb = copy.deepcopy(target.bb)\n self.hk = copy.deepcopy(target.hk)\n def LN(self, xx, w):\n return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)\n def FF(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n x = xx * w.time_mix + self.xx[name] * (1 - w.time_mix)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ x)\n k = torch.square(torch.relu(w.key.weight @ x))\n kv = w.value.weight @ k\n return r * kv\n def SA(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n x = xx * w.time_mix + self.xx[name] * (1 - w.time_mix)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ x)\n k = torch.exp(torch.clamp(w.key.weight @ x, max=RWKV_K_CLAMP))" + }, + { + "comment": "This code is implementing the RWKV model for language generation. It applies layer normalization, self-attention, and feed-forward layers iteratively to generate output. The time decay mechanism is used to update the internal states of the model. 
Additionally, the code initializes the head keys for generating future sequence tokens.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model_run.py\":98-129", + "content": " v = w.value.weight @ x\n kv = k * v\n a = self.aa[name] + w.time_first * kv\n b = self.bb[name] + w.time_first * k\n self.aa[name] = w.time_decay * self.aa[name] + kv\n self.bb[name] = w.time_decay * self.bb[name] + k\n rwkv = r * a / (b + RWKV_K_EPS)\n return w.output.weight @ rwkv\n def run(self, ctx):\n w = self.w\n x = w.emb.weight[ctx[-1]]\n for i in range(self.n_layer):\n x = self.LN(x, w.blocks[i].ln1)\n if i == 0 and self.model_type == 'RWKV-ffnPre':\n x = x + self.FF(x, w.blocks[i].ffnPre, f'ffnPre.{i}')\n else:\n x = x + self.SA(x, w.blocks[i].att, f'att.{i}')\n x = self.LN(x, w.blocks[i].ln2)\n x = x + self.FF(x, w.blocks[i].ffn, f'ffn.{i}')\n x = self.LN(x, w.ln_out)\n if self.hk == None:\n self.hk = (w.head_k.weight @ x).unsqueeze(0)\n else:\n self.hk = torch.cat(\n [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0)" + }, + { + "comment": "This code snippet reshapes the `hk` variable if its shape exceeds the context length, then performs matrix multiplications to calculate a context vector, and finally adds this vector to the input list at corresponding indices.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model_run.py\":130-142", + "content": " if self.hk.shape[0] > self.ctx_len:\n self.hk = self.hk[-self.ctx_len:, :]\n q = w.head_q.weight @ x\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n c = (self.hk @ q) / RWKV_HEAD_QK_DIM\n for i in range(len(c)):\n x[ctx[i]] += c[i]\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/07967b10-6d09-4ce3-9a26-c769898931d5.json b/docs/doc/07967b10-6d09-4ce3-9a26-c769898931d5.json new file mode 100644 index 00000000..a91836b7 --- /dev/null +++ b/docs/doc/07967b10-6d09-4ce3-9a26-c769898931d5.json @@ -0,0 +1,15 @@ +{ + "summary": "This C++ code implements 
forward and backward neural network operations using PyTorch tensors, optimized for CUDA execution. It includes functions for BFloat16 data type, with Python module \"wkv5\" for forward and backward operations.", + "details": [ + { + "comment": "This code is a C++ implementation of forward and backward passes for an unknown neural network operation. It includes functions `cuda_forward` and `cuda_backward`, which are called by the corresponding `forward` and `backward` wrapper functions. The wrapper functions handle memory allocation, type conversion, and data pointers for PyTorch tensors. The code uses BFloat16 as the floating-point data type and is optimized for CUDA execution.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_op.cpp\":0-11", + "content": "#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\nvoid cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y);\nvoid cuda_backward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, float *ww, bf16 *u, bf16 *gy, bf16 *gr, bf16 *gk, bf16 *gv, bf16 *gw, bf16 *gu);\nvoid forward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &u, torch::Tensor &y) {\n cuda_forward(B, T, C, H, r.data_ptr (), k.data_ptr (), v.data_ptr (), w.data_ptr (), u.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, int64_t H, torch::Tensor &r, torch::Tensor &k, torch::Tensor &v, torch::Tensor &w, torch::Tensor &ww, torch::Tensor &u, torch::Tensor &gy, torch::Tensor &gr, torch::Tensor &gk, torch::Tensor &gv, torch::Tensor &gw, torch::Tensor &gu) {\n cuda_backward(B, T, C, H, r.data_ptr (), k.data_ptr" + }, + { + "comment": "This code defines a Python module for the \"wkv5\" function, which includes forward and backward operations. 
The variables declared represent different data pointers used in these operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_op.cpp\":11-21", + "content": " (), v.data_ptr (), w.data_ptr (), ww.data_ptr (), u.data_ptr (), gy.data_ptr (), gr.data_ptr (), gk.data_ptr (), gv.data_ptr (), gw.data_ptr (), gu.data_ptr ());\n}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv5 forward\");\n m.def(\"backward\", &backward, \"wkv5 backward\");\n}\nTORCH_LIBRARY(wkv5, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0b7b8700-c591-44cf-a1a1-8c1c6cae3da8.json b/docs/doc/0b7b8700-c591-44cf-a1a1-8c1c6cae3da8.json new file mode 100644 index 00000000..33c22834 --- /dev/null +++ b/docs/doc/0b7b8700-c591-44cf-a1a1-8c1c6cae3da8.json @@ -0,0 +1,25 @@ +{ + "summary": "This code defines a custom dataset class and tokenizer for RWKV language model, initializing objects with dictionaries and functions. It handles text generation using top-p sampling with temperature parameter and includes conversion functions for input values.", + "details": [ + { + "comment": "The code defines a custom dataset class for the RWKV language model, which reads in data and builds a token list. It then stores the unique tokens in a dictionary-like format and writes it to a 'vocab.json' file. 
The function prints the total number of tokens in the data and the number of unique tokens.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/utils.py\":0-33", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport json\nimport random\nimport time\nimport math\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset\nclass Dataset(Dataset):\n def __init__(self, data, ctx_len, epoch_length_fixed):\n print('building token list...', end=' ')\n unique = sorted(list(set(data)))\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n data_size, vocab_size = len(data), len(unique)\n print('data has %d tokens, %d unique.' % (data_size, vocab_size))" + }, + { + "comment": "The code above initializes an object for a tokenizer that converts text data into numerical representations. The object contains dictionaries mapping characters to indices (stoi) and indices to characters (itos), context length, fixed epoch length, vocabulary size, and the actual text data. It also provides functions for getting elements at specific index and calculating lengths of the tokenizer instance. 
The tokenizer class is initialized with a Word Name file and an optional Unknown Character.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/utils.py\":34-61", + "content": " self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n self.ctx_len = ctx_len\n self.epoch_length_fixed = epoch_length_fixed\n self.vocab_size = vocab_size\n self.data = data\n def __len__(self):\n return self.epoch_length_fixed\n def __getitem__(self, idx):\n # cheat: pick a random spot in dataset\n i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))\n chunk = self.data[i:i+self.ctx_len+1]\n dix = [self.stoi[s] for s in chunk]\n x = torch.tensor(dix[:-1], dtype=torch.long,\n device=torch.device('cuda'))\n y = torch.tensor(dix[1:], dtype=torch.long,\n device=torch.device('cuda'))\n return x, y\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)\n self.vocab_size = len(self.word_table)" + }, + { + "comment": "Function `refine_context` strips and filters context strings.\n\"sample\\_logits\" calculates softmax probs, applies top\\_p if last char is newline, sorts probs, then...", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/utils.py\":63-94", + "content": " self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n 
# out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(torch.tensor(out), dim=-1)\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n sorted_probs, s_index = torch.sort(probs, descending=True)\n # for j in range(30):\n # pp = sorted_probs[j].item()" + }, + { + "comment": "This code snippet is from the RWKV-LM project's \"utils.py\" file, and it appears to handle text generation using top-p sampling with a temperature parameter. The function generates a single token based on the given input and calculates cumulative probabilities. It then determines a cutoff value for the probabilities and sets any values below that cutoff to zero. If the temperature is not set to 1.0, it applies power normalization to the probabilities. Finally, it uses torch.multinomial to select one token based on the modified probabilities and returns it. The code also includes a to_float function for converting input values to floats and a set_seed function for setting random number generator seeds.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/utils.py\":95-121", + "content": " # if pp < 0.005:\n # break\n # ss = self.itos[int(s_index[j])].replace('\\n','_')\n # print(f'{math.floor(pp*100):>3.0f}{ss}', end='')\n # print('')\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n # print(\"[\" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + \"]\", end = \"\")\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n return torch.multinomial(probs, num_samples=1)[0]\ndef to_float(x):\n return x.cpu().detach().numpy().flatten()[0].astype(float)\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0dfaf7bd-0997-4b96-91e8-5e8d6c0d83ed.json 
b/docs/doc/0dfaf7bd-0997-4b96-91e8-5e8d6c0d83ed.json new file mode 100644 index 00000000..1a22490d --- /dev/null +++ b/docs/doc/0dfaf7bd-0997-4b96-91e8-5e8d6c0d83ed.json @@ -0,0 +1,35 @@ +{ + "summary": "This code imports libraries and defines a function ToBinary for converting images to binary format. It includes an R_ENCODER class with layers for image processing and backward operation, as well as forward pass and neural network class for image decoding. The code also defines a neural network model for image generation using convolution layers and Mish activation.", + "details": [ + { + "comment": "This code imports necessary libraries and defines a function called ToBinary for converting input images to binary format. It uses RWKV Language Model and provides model_prefix and input_img variables for further processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/img_demoAE.py\":0-24", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport torch, types, os\nimport numpy as np\nfrom PIL import Image\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport torchvision as vision\nimport torchvision.transforms as transforms\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nprint(f'loading...')\n########################################################################################################\nmodel_prefix = 'test/image_trained/out-v7c_d8_256-224-13bit-OB32x0.5-201'\ninput_img = 'test/img_ae_test/test0.png'\n########################################################################################################\nclass ToBinary(torch.autograd.Function):\n @staticmethod\n def forward(ctx, x):\n return torch.floor(x + 0.5) # no need for noise when we have plenty of data" + }, 
+ { + "comment": "This code defines a class named R_ENCODER inheriting from nn.Module, which includes several convolutional and batch normalization layers for image processing or feature extraction. The backward function is defined as a pass-through operation for gradient computation during backpropagation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/img_demoAE.py\":26-51", + "content": " @staticmethod\n def backward(ctx, grad_output):\n return grad_output.clone() # pass-through\nclass R_ENCODER(nn.Module):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.Bxx = nn.BatchNorm2d(dd*64)\n self.CIN = nn.Conv2d(3, dd, kernel_size=3, padding=1)\n self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*4)\n self.C00 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)" + }, + { + "comment": "This code defines a forward pass function for a neural network layer. It applies various convolutions and batch normalization to input image 'img' after passing it through several activation functions, including Mish. 
The final result is passed through a sigmoid function before being returned.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/img_demoAE.py\":53-81", + "content": " self.B20 = nn.BatchNorm2d(dd*64)\n self.C20 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C23 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd*64, args.my_img_bit, kernel_size=3, padding=1)\n def forward(self, img):\n ACT = F.mish\n x = self.CIN(img)\n xx = self.Bxx(F.pixel_unshuffle(x, 8))\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n x = self.COUT(x + xx)\n return torch.sigmoid(x)" + }, + { + "comment": "This code defines a class \"R_DECODER\" that inherits from the PyTorch's `nn.Module` and consists of multiple convolutional layers and batch normalization layers for image decoding. 
The class takes an argument \"args\", which contains information such as the number of image channels, and the kernel size of convolutions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/img_demoAE.py\":83-105", + "content": "class R_DECODER(nn.Module):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.CIN = nn.Conv2d(args.my_img_bit, dd*64, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*64)\n self.C00 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.B20 = nn.BatchNorm2d(dd*4)\n self.C20 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)" + }, + { + "comment": "This code defines a neural network model for image generation. It has multiple convolution layers and uses the Mish activation function. The model takes an input code, performs several operations with different convolution layers and pixel shuffling, and outputs a final tensor. 
The code also builds the model using given arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/img_demoAE.py\":106-138", + "content": " self.C23 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd, 3, kernel_size=3, padding=1)\n def forward(self, code):\n ACT = F.mish\n x = self.CIN(code)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = self.COUT(x)\n return torch.sigmoid(x)\n########################################################################################################\nprint(f'building model...')\nargs = types.SimpleNamespace()\nargs.my_img_bit = 13\nencoder = R_ENCODER(args).eval().cuda()" + }, + { + "comment": "The code is loading a pre-trained encoder and decoder model, applying image transformations, and generating an output image using the decoder. It then saves the output image in a specific format. 
The code also prints the shape and values of a tensor after performing some operations on it.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/img_demoAE.py\":139-164", + "content": "decoder = R_DECODER(args).eval().cuda()\nzpow = torch.tensor([2**i for i in range(0,13)]).reshape(13,1,1).cuda().long()\nencoder.load_state_dict(torch.load(f'{model_prefix}-E.pth'))\ndecoder.load_state_dict(torch.load(f'{model_prefix}-D.pth'))\n########################################################################################################\nprint(f'test image...')\nimg_transform = transforms.Compose([\n transforms.PILToTensor(),\n transforms.ConvertImageDtype(torch.float),\n transforms.Resize((224, 224))\n])\nwith torch.no_grad():\n img = img_transform(Image.open(input_img)).unsqueeze(0).cuda()\n z = encoder(img)\n z = ToBinary.apply(z)\n zz = torch.sum(z.squeeze().long() * zpow, dim=0)\n print(f'Code shape = {zz.shape}\\n{zz.cpu().numpy()}\\n')\n out = decoder(z)\n vision.utils.save_image(out, f\"{input_img.split('.')[0]}-out-13bit.jpg\")" + } + ] +} \ No newline at end of file diff --git a/docs/doc/0e73a558-94fe-4dc8-9b04-40098d068be6.json b/docs/doc/0e73a558-94fe-4dc8-9b04-40098d068be6.json new file mode 100644 index 00000000..b6595de7 --- /dev/null +++ b/docs/doc/0e73a558-94fe-4dc8-9b04-40098d068be6.json @@ -0,0 +1,220 @@ +{ + "summary": "This code installs and trains an enhanced RWKV language model with various features, including HuggingFace integration, optimized inference, bidirectional tests, decoder mixtures, multimodal tokens, and simplifies time-series prediction using ATT. It also showcases efficient sampling for image processing and includes time-mix layers, channel-mix layers, and attention mechanisms. 
RWKV v1 outperforms NER with the Head-QK trick, and proper initialization is crucial for convergence.", + "details": [ + { + "comment": "This code provides instructions for installing necessary packages and dependencies, preparing, training, and finetuning the RWKV language model. The author recommends using specific versions of Python and CUDA, as well as certain libraries. The code also includes an example loss curve and suggests using a specific package for running the model. The data should be in .jsonl format, and a provided script can be used to tokenize the data for training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":0-25", + "content": "# The RWKV Language Model (and my LM tricks)\n> RWKV homepage: https://www.rwkv.com/ https://wiki.rwkv.com/\n### HOW TO TEST TRAINING RWKV-5 on MiniPile (1.5G tokens) ##\nUse python 3.10 and cuda 11.7.1 / 11.7 (note torch2 + cuda12 has weird bugs and hurts model performance).\n```\npip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117\npip install pytorch-lightning==1.9.5 deepspeed==0.7.0 wandb ninja\ncd RWKV-v5/\n./demo-training-prepare.sh\n./demo-training-run.sh\n(you may want to log in to wandb first)\n```\nYour loss curve should look almost exactly the same as this, with the same ups and downs (if you use the same bsz & config):\n\nYou can run your model using https://pypi.org/project/rwkv/ (use \"rwkv_vocab_v20230424\" instead of \"20B_tokenizer.json\")\n### HOW TO FINETUNE RWKV-5 MODELS ##\nUse .jsonl format for your data (see https://huggingface.co/BlinkDL/rwkv-5-world for formats).\nUse https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v5/make_data.py to tokenizer it into binidx suitable for training." + }, + { + "comment": "RWKV is a parallelizable RNN with transformer-level LLM performance, combining the best of both. 
It offers great performance, fast inference, VRAM savings, quick training, infinite ctx_len, and free sentence embeddings using final hidden states. Check out the latest version, RWKV-6, for impressive results.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":27-37", + "content": "## RWKV: Parallelizable RNN with Transformer-level LLM Performance (pronounced as \"RwaKuv\", from 4 major params: R W K V)\nRWKV is an RNN with Transformer-level LLM performance, which can also be directly trained like a GPT transformer (parallelizable). And it's 100% attention-free. You only need the hidden state at position t to compute the state at position t+1. You can use the \"GPT\" mode to quickly compute the hidden state for the \"RNN\" mode.\nSo it's combining the best of RNN and transformer - **great performance, fast inference, saves VRAM, fast training, \"infinite\" ctx_len, and free sentence embedding** (using the final hidden state).\nOur latest version is **RWKV-6**, which is easily Mamba level, and simpler ;) https://twitter.com/BlinkDL_AI/status/1732791817073229881 https://twitter.com/BlinkDL_AI/status/1713967928209752128 (Preview models: https://huggingface.co/BlinkDL/temp )\n**RWKV-5 World v2 1.5B** Demo: https://huggingface.co/spaces/BlinkDL/RWKV-Gradio-1\n**RWKV-5 World v2 3B** Demo: https://huggingface.co/spaces/BlinkDL/RWKV-Gradio-2" + }, + { + "comment": "This code sets up an RWKV model with specific parameters and performs forward passes on different input sequences, printing the resulting logits. 
It also sets environment variables for JIT and CUDA usage, and links to various resources for the RWKV Runner GUI, raw weights, HF-compatible weights, and pip package.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":39-60", + "content": "![RWKV-v5-benchmark-1](RWKV-v5-benchmark-1.png)\n**RWKV Runner GUI** https://github.com/josStorer/RWKV-Runner with one-click install and API\n**Raw cutting-edge RWKV weights:** https://huggingface.co/BlinkDL\n**HF-compatible RWKV weights:** https://huggingface.co/RWKV\n**RWKV pip package**: https://pypi.org/project/rwkv/\n```python\nos.environ[\"RWKV_JIT_ON\"] = '1'\nos.environ[\"RWKV_CUDA_ON\"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)\nfrom rwkv.model import RWKV # pip install rwkv\nmodel = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16')\nout, state = model.forward([187, 510, 1563, 310, 247], None) # use 20B_tokenizer.json\nprint(out.detach().cpu().numpy()) # get logits\nout, state = model.forward([187, 510], None)\nout, state = model.forward([1563], state) # RNN has state (use deepcopy if you want to clone it)\nout, state = model.forward([310, 247], state)\nprint(out.detach().cpu().numpy()) # same result as above" + }, + { + "comment": "This code snippet provides links to various RWKV-related projects and resources. It highlights the official website, social media, discord community, and a list of 200+ projects using RWKV. 
The list includes inference APIs, trainers, LoRA finetuning, and an AI digital assistant built with RWKV.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":61-89", + "content": "```\n**nanoRWKV**: https://github.com/BlinkDL/nanoRWKV (does not require custom CUDA kernel to train, works for any GPU/CPU)\n## RWKV Discord: https://discord.gg/bDSBUMeFpc (7k+ members)\n**Twitter**: https://twitter.com/BlinkDL_AI\n**Homepage**: https://www.rwkv.com/\n**Cool Community RWKV Projects**:\nAll (200+) RWKV projects: https://github.com/search?o=desc&q=rwkv&s=updated&type=Repositories\nhttps://github.com/cgisky1980/ai00_rwkv_server Fastest GPU inference API with vulkan (good for nvidia/amd/intel), supports rwkv5\nhttps://github.com/cryscan/web-rwkv backend for ai00_rwkv_server, supports rwkv5\nhttps://github.com/saharNooby/rwkv.cpp Fast CPU/cuBLAS/CLBlast inference: int4/int8/fp16/fp32, supports rwkv5\nhttps://github.com/daquexian/faster-rwkv supports rwkv5\nhttps://github.com/mlc-ai/mlc-llm/pull/1275 supports rwkv5\nhttps://github.com/RWKV/RWKV-infctx-trainer Infctx trainer\nhttps://github.com/Blealtan/RWKV-LM-LoRA LoRA finetuning\nhttps://github.com/TheRamU/Fay/blob/main/README_EN.md Digital Assistant with RWKV" + }, + { + "comment": "This code contains links to various implementations and resources for the RWKV (named after its 4 major params: Receptance, Weight, Key, Value) language model, including Python scripts, preprint papers, and a Discord community. 
It also showcases a cool paper using RWKV for Spiking Neural Networks and highlights potential compute resources available for running new ideas related to the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":91-113", + "content": "https://github.com/harrisonvanderbyl/rwkv-cpp-cuda Fast GPU inference with cuda/amd/vulkan\n**RWKV v4 in 150 lines** (model, inference, text generation): https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_in_150_lines.py\n**\ud83d\udd25 RWKV v5 in 250 lines \ud83d\udd25** (with tokenizer too): https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_v5_demo.py\n**RWKV v4 preprint** https://arxiv.org/abs/2305.13048\n![RWKV-paper](RWKV-paper.png)\n**RWKV v4 introduction, and in 100 lines of numpy**: https://johanwind.github.io/2023/03/23/rwkv_overview.html https://johanwind.github.io/2023/03/23/rwkv_details.html\nRWKV v6 illustrated:\n![RWKV-v6](rwkv-x060.png)\nA cool paper (Spiking Neural Network) using RWKV: https://github.com/ridgerchu/SpikeGPT\nYou are welcome to join the RWKV discord https://discord.gg/bDSBUMeFpc to build upon it. We have plenty of potential compute (A100 40Gs) now (thanks to Stability and EleutherAI), so if you have interesting ideas I can run them.\n![RWKV-eval2](RWKV-eval2.png)\nRWKV [loss vs token position] for" + }, + { + "comment": "This code discusses the performance of RWKV language models with different context lengths, comparing them to GPT2-XL. It notes that RNNs are better for ASICs, RL, and resemble human writing, while the universe is like an RNN due to locality. It also mentions the training speed of a new RWKV model with BF16 context length 40.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":113-127", + "content": " 10000 ctx4k+ documents in Pile. RWKV 1B5-4k is mostly flat after ctx1500, but 3B-4k and 7B-4k and 14B-4k have some slopes, and they are getting better. This debunks the old view that RNNs cannot model long ctxlens. 
We can predict that RWKV 100B will be great, and RWKV 1T is probably all you need :)\n![RWKV-ctxlen](RWKV-ctxlen.png)\nChatRWKV with RWKV 14B ctx8192:\n![RWKV-chat](RWKV-chat.png)\nI believe RNN is a better candidate for fundamental models, because: (1) It's more friendly for ASICs (no kv cache). (2) It's more friendly for RL. (3) When we write, our brain is more similar to RNN. (4) The universe is like an RNN too (because of locality). Transformers are non-local models.\nRWKV-3 1.5B on A40 (tf32) = always 0.015 sec/token, tested using simple pytorch code (no CUDA), GPU utilization 45%, VRAM 7823M\nGPT2-XL 1.3B on A40 (tf32) = 0.032 sec/token (for ctxlen 1000), tested using HF, GPU utilization 45% too (interesting), VRAM 9655M\nTraining speed: (new training code) RWKV-4 14B BF16 ctxlen40" + }, + { + "comment": "This code is discussing the performance of RWKV language model on different hardware setups, mentioning its image experiments and the idea to use it for txt2img diffusion. It also highlights the smooth training process with no loss spikes, open-source availability, fast inference on CPUs as well, and a simple working mechanism based on channels and decay speeds.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":127-140", + "content": "96 = 114K tokens/s on 8x8 A100 80G (ZERO2+CP). (old training code) RWKV-4 1.5B BF16 ctxlen1024 = 106K tokens/s on 8xA100 40G.\nI am doing image experiments too (For example: https://huggingface.co/BlinkDL/clip-guided-binary-autoencoder) and RWKV will be able to do txt2img diffusion :) My idea: 256x256 rgb image -> 32x32x13bit latents -> apply RWKV to compute transition probability for each of the 32x32 grid -> pretend the grids are independent and \"diffuse\" using these probabilities.\nSmooth training - no loss spikes! (lr & bsz change around 15G tokens)\n![RWKV-loss](RWKV-loss.png)\n![RWKV-eval](RWKV-eval.png)\nAll of the trained models will be open-source. 
Inference is very fast (only matrix-vector multiplications, no matrix-matrix multiplications) even on CPUs, so you can even run a LLM on your phone.\nHow it works: RWKV gathers information to a number of channels, which are also decaying with different speeds as you move to the next token. It's very simple once you understand it.\n**RWKV is paral" + }, + { + "comment": "RWKV is an RNN model that allows adjustable time-decay for each channel. It can be fine-tuned into a non-parallelizable RNN and has potential for edge devices like phones due to its friendly nature. The code discusses future tasks like HuggingFace integration, optimized inference, bidirectional tests, and using image/audio/video tokens with decoder mixtures.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":140-149", + "content": "lelizable because the time-decay of each channel is data-independent (and trainable)**. For example, in usual RNN you can adjust the time-decay of a channel from say 0.8 to 0.5 (these are called \"gates\"), while in RWKV you simply move the information from a W-0.8-channel to a W-0.5-channel to achieve the same effect. Moreover, you can fine-tune RWKV into a non-parallelizable RNN (then you can use outputs of later layers of the previous token) if you want extra performance.\n![RWKV-formula](RWKV-formula.png)\nHere are some of my TODOs. Let's work together :)\n* HuggingFace integration (check https://github.com/huggingface/transformers/issues/17230\n), and optimized CPU & iOS & Android & WASM & WebGL inference. RWKV is a RNN and very friendly for edge devices. Let's make it possible to run a LLM on your phone. \n* Test it on bidirectional & MLM tasks, and image & audio & video tokens. 
I think RWKV can support Encoder-Decoder via this: for each decoder token, use a learned mixture of [decoder previ" + }, + { + "comment": "The code is describing the improvements made in RWKV-4a over RWKV-4, including one extra attention to enhance difficult zero-shot tasks. The user feedback highlights the fast training and impressive results of character-based models on smaller datasets. The tweet from Sepp Hochreiter acknowledges the author (BlinkDL) in the EleutherAI Discord for their work.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":149-160", + "content": "ous hidden state] & [encoder final hidden state]. Hence all decoder tokens will have access to the encoder output.\n* Now training RWKV-4a with one single tiny extra attention (just a few extra lines comparing with RWKV-4) to further improve some difficult zeroshot tasks (such as LAMBADA) for smaller models. See https://github.com/BlinkDL/RWKV-LM/commit/a268cd2e40351ee31c30c5f8a5d1266d35b41829\nUser feedback:\n> *I've so far toyed around the character-based model on our relatively small pre-training dataset (around 10GB of text), and the results are extremely good - similar ppl to models taking much, much longer to train.*\n> *dear god rwkv is fast. i switched to another tab after starting training it from scratch & when i returned it was emitting plausible english & maori words, i left to go microwave some coffee & when i came back it was producing fully grammatically correct sentences.*\nTweet from Sepp Hochreiter (thank you!): https://twitter.com/HochreiterSepp/status/1524270961314484227\nYou can find me (BlinkDL) in the EleutherAI Discord too: https://www.eleuther.ai/get-involved/" + }, + { + "comment": "This code provides a quick start guide for using the RWKV-LM language model, specifying important dependencies and compatibility details. 
It also gives instructions on how to use the model for inference, including running RWKV-4 Pile models, options for Colab usage, and links to web demos.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":162-183", + "content": "![RWKV-demo](RWKV-demo.png)\n## Quick start\n**IMPORTANT: Use deepspeed==0.7.0 pytorch-lightning==1.9.5 torch==1.13.1+cu117 and cuda 11.7.1 or 11.7 (note torch2 + deepspeed has weird bugs and hurts model performance)**\nUse https://github.com/BlinkDL/RWKV-LM/tree/main/RWKV-v4neo (latest code, compatible with v4).\nHere is a great prompt for testing Q&A of LLMs. Works for any model: (found by minimizing ChatGPT ppls for RWKV 1.5B)\n```python\nprompt = f'\\nQ & A\\n\\nQuestion:\\n{qq}\\n\\nDetailed Expert Answer:\\n' # let the model generate after this\n```\n### Inference\n**Run RWKV-4 Pile models:** Download models from https://huggingface.co/BlinkDL. Set TOKEN_MODE = 'pile' in run.py and run it. It's fast even on CPU (the default mode).\n**Colab for RWKV-4 Pile 1.5B**: https://colab.research.google.com/drive/1F7tZoPZaWJf1fsCmZ5tjw6sYHiFOYVWM\nRun RWKV-4 Pile models in your browser (and onnx version): see this issue https://github.com/BlinkDL/RWKV-LM/issues/7\nRWKV-4 Web Demo: https://josephrocca.github.io/rwkv-v4-web/demo/ (note: only greedy sampling for now)" + }, + { + "comment": "This code provides instructions on how to train or fine-tune RWKV-4 from scratch using the provided script train.py and dataset enwik8. It also advises on adding weight decay and dropout for training on small data amounts, and mentions that the GPT version is faster to train and can extrapolate.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":185-195", + "content": "For the old RWKV-2: see the release here for a 27M params model on enwik8 with 0.72 BPC(dev). Run run.py in https://github.com/BlinkDL/RWKV-LM/tree/main/RWKV-v2-RNN. 
You can even run it in your browser: https://github.com/BlinkDL/AI-Writer/tree/main/docs/eng https://blinkdl.github.io/AI-Writer/eng/ (this is using tf.js WASM single-thread mode).\n### Training / Fine-tuning\npip install deepspeed==0.7.0 // pip install pytorch-lightning==1.9.5 // torch 1.13.1+cu117\nNOTE: add weight decay (0.1 or 0.01) and dropout (0.1 or 0.01) when training on small amt of data. try x=x+dropout(att(x)) x=x+dropout(ffn(x)) x=dropout(x+att(x)) x=dropout(x+ffn(x)) etc.\n**Training RWKV-4 from scratch:** run train.py, which by default is using the enwik8 dataset (unzip https://data.deepai.org/enwik8.zip).\nYou will be training the \"GPT\" version because it's paralleziable and faster to train. RWKV-4 can extrapolate, so training with ctxLen 1024 can work for ctxLen of 2500+. You can fine-tune the model with longer ctxLen and it can quickly adapt to longer ctxLens." + }, + { + "comment": "This code provides instructions for fine-tuning RWKV-4 Pile models using provided scripts and resources. It mentions a Colab notebook for fine-tuning, how to convert large corpus to .bin and .idx format, and an ongoing work for infinite context length training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":197-217", + "content": "**Fine-tuning RWKV-4 Pile models:** use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into train.npy data. Then use https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/train.py to train it.\nRead the inference code in src/model.py and try using the final hidden state\uff08.xx .aa .bb) as a faithful sentence embedding for other tasks. 
Probably you should begin with .xx and .aa/.bb (.aa divided by .bb).\nColab for fine-tuning RWKV-4 Pile models: https://colab.research.google.com/github/resloved/RWKV-notebooks/blob/master/RWKV_v4_RNN_Pile_Fine_Tuning.ipynb\n**Large corpus:** Use https://github.com/Abel2076/json2binidx_tool to convert .jsonl into .bin and .idx\nThe jsonl format sample (one line for each document):\n```\n{\"text\": \"This is the first document.\"}\n{\"text\": \"Hello\\nWorld\"}\n{\"text\": \"1+1=2\\n1+2=3\\n2+2=4\"}\n```\ngenerated by code like this:\n```\nss = json.dumps({\"text\": text}, ensure_ascii=False)\nout.write(ss + \"\\n\")\n```\n**Infinite ctxlen training (WIP):** https://github.com/Blealtan/RWKV-LM-LoRA/tree/dev-infctx" + }, + { + "comment": "This code discusses the usage of RWKV hidden state as text embedding, suggesting to collect statistics and normalize channels before training a linear classifier. It also mentions the latest design for RWKV-5 which includes multi-head architecture and LayerNorm for each head.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":219-236", + "content": "### How to use RWKV hidden state as text embedding\nConsider RWKV 14B. The state has 200 vectors, that is, 5 vectors for each block: fp16 (xx), fp32 (aa), fp32 (bb), fp32 (pp), fp16 (xx).\nDo not avg pool because different vectors (xx aa bb pp xx) in the state have very different meanings and ranges. You can probably remove pp.\nI suggest firstly collect the mean+stdev statistics of each channel of each vector, and normalize all of them (note: the normalization should be data-indepedent and collected from various texts). Then train a linear classifer.\n## Towards RWKV-5 (just to record some new ideas)\n### Lastest Design\nRWKV-5 is multi-head and here shows one head. 
There is also a LayerNorm for each head (hence actually GroupNorm).\n$`\n\\begin{array}{|l|l|l|}\n\\hline & \\text { RWKV-4 with real-valued } k \\,\\&\\, v \\,\\&\\, u \\,\\&\\, w & \\text { RWKV-5 with matrix-valued } \\mathrm{k}^{\\dagger} \\mathrm{v} \\,\\&\\, \\mathrm{u} \\,\\&\\, \\mathrm{w} \\\\\n\\hline \\mathrm{y}_0 & \\mathrm{r}_0 \\frac{\\mathrm{uk}_0 \\mathrm{v}_0}{\\mathrm{uk}_0} & \\mathrm{r}_0\\left(\\mathrm{uk}_0^{\\dagger} \\mathrm{v}_0\\right) \\\\" + }, + { + "comment": "This code represents a matrix equation with three rows, each representing the linear combination of variables y1, y2, and y3. The coefficients r1, r2, and r3 determine the weightage for each variable in the linear combinations. The code also includes the addition of additional terms involving k0, v0, k1, v1, wk0, v2, uk2, v3, k2, wk1, and w^2k0.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":237-244", + "content": "\\hline \\mathrm{y}_1 & \\mathrm{r}_1 \\frac{\\mathrm{uk}_1 \\mathrm{v}_1+\\mathrm{k}_0 \\mathrm{v}_0}{\\mathrm{uk}_1+\\mathrm{k}_0} & \\mathrm{r}_1\\left(\\mathrm{uk}_1^{\\dagger} \\mathrm{v}_1+\\mathrm{k}_0^{\\dagger} \\mathrm{v}_0\\right) \\\\\n\\hline \\mathrm{y}_2 & \\mathrm{r}_2 \\frac{\\mathrm{uk}_2 \\mathrm{v}_2+\\mathrm{k}_1 \\mathrm{v}_1+\\mathrm{wk}_0 \\mathrm{v}_0}{\\mathrm{uk}_2+\\mathrm{k}_1+\\mathrm{wk}_0} & \\mathrm{r}_2\\left(\\mathrm{uk}_2^{\\dagger} \\mathrm{v}_2+\\mathrm{k}_1^{\\dagger} \\mathrm{v}_1+\\mathrm{wk}_0^{\\dagger} \\mathrm{v}_0\\right) \\\\\n\\hline \\mathrm{y}_3 & \\mathrm{r}_3 \\frac{\\mathrm{uk}_3 \\mathrm{v}_3+\\mathrm{k}_2 \\mathrm{v}_2+\\mathrm{wk}_1 \\mathrm{v}_1+\\mathrm{w}^2 \\mathrm{k}_0 \\mathrm{v}_0}{\\mathrm{uk}_3+\\mathrm{k}_2+\\mathrm{wk}_1+\\mathrm{w}^2 \\mathrm{k}_0} & \\mathrm{r}_3\\left(\\mathrm{uk}_3^{\\dagger} \\mathrm{v}_3+\\mathrm{k}_2^{\\dagger} \\mathrm{v}_2+\\mathrm{wk}_1^{\\dagger} \\mathrm{v}_1+\\mathrm{w}^2 \\mathrm{k}_0^{\\dagger} \\mathrm{v}_0\\right) 
\\\\\n\\hline\n\\end{array}`$\n$`\\left[\\begin{array}{ll}\n\\mathrm{y}_{20} & \\cdots \\mathrm{y}_{2 \\mathrm{c}}" + }, + { + "comment": "This code seems to involve a series of matrix multiplication and addition operations, where the resulting vector is derived from a combination of three matrices (u, v, w) with their respective coefficients (k10, k20). The code appears to be part of a larger algorithm or model that uses this calculated vector as input for further computations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":245-264", + "content": "\\end{array}\\right]=\\left[\\begin{array}{lll}\n\\mathrm{r}_{20} & \\cdots & \\mathrm{r}_{2 \\mathrm{c}}\n\\end{array}\\right]`$\n$`\\left(\\left[\\begin{array}{ccc}\n\\mathrm{u}_{00} & \\cdots & \\mathrm{u}_{0 \\mathrm{c}} \\\\\n\\vdots & \\ddots & \\vdots \\\\\n\\mathrm{u}_{\\mathrm{c} 0} & \\cdots & \\mathrm{u}_{\\mathrm{cc}}\n\\end{array}\\right]\\left[\\begin{array}{ccc}\n\\mathrm{k}_{20} \\mathrm{v}_{20} & \\cdots & \\mathrm{k}_{20} \\mathrm{v}_{2 \\mathrm{c}} \\\\\n\\vdots & \\ddots & \\vdots \\\\\n\\mathrm{k}_{2 \\mathrm{c}} \\mathrm{v}_{20} & \\cdots & \\mathrm{k}_{2 \\mathrm{c}} \\mathrm{v}_{2 \\mathrm{c}}\n\\end{array}\\right]+\\left[\\begin{array}{ccc}\n\\mathrm{k}_{10} \\mathrm{v}_{10} & \\cdots & \\mathrm{k}_{10} \\mathrm{v}_{1 \\mathrm{c}} \\\\\n\\vdots & \\ddots & \\vdots \\\\\n\\mathrm{k}_{1 \\mathrm{c}} \\mathrm{v}_{10} & \\cdots & \\mathrm{k}_{1 \\mathrm{c}} \\mathrm{v}_{1 \\mathrm{c}}\n\\end{array}\\right]+\\left[\\begin{array}{ccc}\n\\mathrm{w}_{00} & \\cdots & \\mathrm{w}_{0 \\mathrm{c}} \\\\\n\\vdots & \\ddots & \\vdots \\\\\n\\mathrm{w}_{\\mathrm{c} 0} & \\cdots & \\mathrm{w}_{\\mathrm{cc}}\n\\end{array}\\right]\\left[\\begin{array}{ccc}" + }, + { + "comment": "This code initializes several learnable parameters for the time mixing operation in the RWKV-6 model. 
The `nn.Parameter` calls create low-rank projection matrices for the dynamic mix and decay process: each `*_w1` matrix is initialized uniformly between -0.01 and 0.01, while each `*_w2` matrix is initialized to zero. These parameters are used for both TimeMix and ChannelMix operations in the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":265-283", + "content": "\\mathrm{k}_{00} \\mathrm{v}_{00} & \\cdots & \\mathrm{k}_{00} \\mathrm{v}_{0 c} \\\\\n\\vdots & \\ddots & \\vdots \\\\\n\\mathrm{k}_{0 \\mathrm{c}} \\mathrm{v}_{00} & \\cdots & \\mathrm{k}_{0 \\mathrm{c}} \\mathrm{v}_{0 c}\n\\end{array}\\right]\n\\right)`$\n### RWKV-6\nDynamic Mix & Dynamic Decay. Example (do this for both TimeMix & ChannelMix):\n```\nTIME_MIX_EXTRA_DIM = 32\nself.time_mix_k_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01))\nself.time_mix_k_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd))\nself.time_mix_v_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01))\nself.time_mix_v_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd))\nself.time_mix_r_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01))\nself.time_mix_r_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd))\nself.time_mix_g_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01))\nself.time_mix_g_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd))" + }, + { + "comment": "The code calculates the time-dependent mixing and shifting of input x for each mixing projection (k, v, r, g) using learned parameters. It then combines the shifted and mixed values to generate the final output. 
The RWKV-7 approach uses parallelized mode for fast state generation followed by a finetuned full RNN for sequential generation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":284-305", + "content": "...\ntime_mix_k = self.time_mix_k.view(1,1,-1) + (x @ self.time_mix_k_w1) @ self.time_mix_k_w2\ntime_mix_v = self.time_mix_v.view(1,1,-1) + (x @ self.time_mix_v_w1) @ self.time_mix_v_w2\ntime_mix_r = self.time_mix_r.view(1,1,-1) + (x @ self.time_mix_r_w1) @ self.time_mix_r_w2\ntime_mix_g = self.time_mix_g.view(1,1,-1) + (x @ self.time_mix_g_w1) @ self.time_mix_g_w2\nxx = self.time_shift(x)\nxk = x * time_mix_k + xx * (1 - time_mix_k)\nxv = x * time_mix_v + xx * (1 - time_mix_v)\nxr = x * time_mix_r + xx * (1 - time_mix_r)\nxg = x * time_mix_g + xx * (1 - time_mix_g)\n```\n![RWKV-v6](RWKV-v6.png)\n### RWKV-7\nUse parallelized mode to quickly generate the state, then use a finetuned full RNN (the layers of token n can use outputs of all layer of token n-1) for sequential generation.\n### Some old ideas\n1. Now time decay is like 0.999^T (0.999 is learnable). Change it to something like (0.999^T + 0.1) where 0.1 is learnable too. The 0.1 part will be kept forever. Or, A^T + B^T + C = fast-decay + slow-decay + constant" + }, + { + "comment": "1. Implement various formulas for decay, complex-valued decay, and positional encoding.\n2. Experiment with Lie groups like 3d rotation (SO(3)) for non-abelian RWKV.\n3. Consider analog or quantum computation optimization and photonic matrix-vector multiplication.\n4. Trainable initial hidden state and layerwise LR with Lion optimizer.\n5. Add 2D positional encoding for vision tasks.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":305-324", + "content": ". Can even use different formulas (for example, K^2 instead of e^K for a decay component, or, without normalization).\n2. Use complex-valued decay (so, rotation instead of decay) in some channels.\n3. 
Inject some trainable and extrapolatable positional encoding?\n4. Aside from 2d rotation, we can try other Lie groups such as 3d rotation ( SO(3) ). Non-abelian RWKV lol.\n5. RWKV might be great on analog devices (search for Analog Matrix-vector multiplication & Photonic Matrix-vector multiplication). The RNN mode is very hardware-friendly (processing-in-memory). Can be a SNN too (https://github.com/ridgerchu/SpikeGPT). I wonder if it can be optimized for quantum computation.\n6. Trainable initial hidden state (xx aa bb pp xx).\n7. Layerwise (or even row/column-wise, elementwise) LR, and test Lion optimizer.\n### Vision Tasks\n1. I find it's good to add a 2d pos encoding:\n```\nself.pos_emb_x = nn.Parameter(torch.zeros((1,args.my_pos_emb,args.n_embd)))\nself.pos_emb_y = nn.Parameter(torch.zeros((args.my_pos_emb,1,args.n_embd)))" + }, + { + "comment": "This code snippet is adding a position embedding to the variable 'x' using pos_emb_x and pos_emb_y. It seems to be part of a language model that works with images of size N x N. The tokenShift value can vary depending on the image size, and different tokenShift styles can be tried for \"ATT\" and \"FFN\". Additionally, repeating the context and implementing bytes-aware embedding are mentioned as possible ways to improve memorization in the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":325-341", + "content": "...\nx = x + pos_emb_x + pos_emb_y\n```\n2. In a BPE langauge model, it's the best to use [tokenShift of 1 token] (you can mix more tokens in a char-level English model). However you can try [tokenShift of N (or N-1) (or N+1) tokens] if the image size is N x N, because that will be like mixing [the token above the current positon (or the token above the to-be-predicted positon)] with [current token]. 
You can use try different tokenShift styles for \"ATT\" & \"FFN\", or mixing different tokenShift styles - such as mixing [token A] with [token A-1] and [token A-(N-1)] etc.\n### Misc\nMaybe we can improve memorization by simply repeating the context (I guess 2 times is enough). Example: Reference -> Reference(again) -> Question -> Answer\n#### Idea: Bytes-aware Embedding\nThe idea is to make sure each token in vocab understand its length and raw UTF-8 bytes.\nLet a = max(len(token)) for all token in vocab. Define AA : float[a][d_emb]\nLet b = max(len_in_utf8_bytes(token)) for all token in vocab. Define BB : float[b][256][d_emb]" + }, + { + "comment": "The code discusses an idea to improve tokenization by hardcoding channels with specific meanings. This allows for sharing of embeddings among similar tokens, such as \"abc\", \" abc\", and \" Abc\". However, the assumption made may not always hold true.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":343-374", + "content": "For each token X in vocab, let [x0, x1, ..., xn] be its raw UTF-8 bytes. We will add some extra values to its embedding EMB(X):\nEMB(X) += AA[len(X)] + BB[0][x0] + BB[1][x1] + ... + BB[n][xn] (note: AA BB are learnable weights)\n* We can do this for the final Linear(d_emb, n_vocab) projection too.\n* We can use some small networks to generate AA and BB, for some extra regularization (for example, BB[m][xi] and BB[n][xi] should be related).\n#### Old Idea\nI have an idea to improve tokenization. We can hardcode some channels to have meanings. Example:\nChannel 0 = \"space\"\nChannel 1 = \"capitalize first letter\"\nChannel 2 = \"capitalize all letters\"\nTherefore:\nEmbedding of \"abc\": [0, 0, 0, x0, x1, x2 , ..]\nEmbedding of \" abc\": [1, 0, 0, x0, x1, x2, ..]\nEmbedding of \" Abc\": [1, 1, 0, x0, x1, x2, ..]\nEmbedding of \"ABC\": [0, 0, 1, x0, x1, x2, ...]\n......\nso they will share most of the embedding. 
And we can rapidly compute the output probability of all variations of \"abc\".\nNote: the above method is assuming that p(\" xyz\") / p(\"xyz\") is the same for any \"xyz\", which can be wrong." + }, + { + "comment": "This code discusses an idea to improve initial states in RWKV for better representation and model discovery. It suggests generating final states of all wiki documents, using the best document's final state as a Q's initial state, or training a model to directly generate optimal initial states. The approach is inspired by Apple's AFT and uses tricks like SmallInitEmb.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":376-400", + "content": "Better: define emb_space emb_capitalize_first emb_capitalize_all to be a function of emb.\nMaybe the Best: let 'abc' ' abc' etc. to share the last 90% of their embeddings.\nAt this moment, all our tokenizers spend too many items to represent all variations of 'abc' ' abc' ' Abc' etc. Moreover the model cannot discover that these are actually similar if some of these variations are rare in the dataset. The method here can improve this. I plan to test this in a new version of RWKV.\n#### Idea: Better Initial States\nExample (single-round Q & A):\n1. Generate the final state of all wiki documents.\n2. For any user Q, find the best wiki document, and use its final state as the initial state.\n3. Train a model to directly generate the optimal initial state for any user Q.\nHowever this can be a bit more tricky for multi-round Q & A :)\n## How it works\nRWKV is inspired by Apple's AFT (https://arxiv.org/abs/2105.14103).\nMoreover it's using a number of my tricks, such as:\n* SmallInitEmb: https://github.com/BlinkDL" + }, + { + "comment": "This code snippet highlights various modifications and techniques used to improve the performance of transformer models. 
These include SmallInitEmb for embedding quality, token-shift for char-level models, Head-QK trick for learning to copy tokens, extra R-gate in FFN with reluSquared from Primer, better initialization using ZERO values, parameter transfer for faster convergence, and a custom CUDA kernel to speed up training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":400-412", + "content": "/SmallInitEmb (applicable to all transformers) which helps the embedding quality, and stabilizes Post-LN (which is what I am using).\n* Token-shift: https://github.com/BlinkDL/RWKV-LM#token-shift-time-shift-mixing (applicable to all transformers), especially helpful for char-level models.\n* Head-QK: https://github.com/BlinkDL/RWKV-LM#the-head-qk-trick-learning-to-copy-and-avoid-tokens (applicable to all transformers). Note: it's helpful, but I disabled it in the Pile model to keep it 100% RNN.\n* Extra R-gate in the FFN (applicable to all transformers). I am also using reluSquared from Primer.\n* Better initilization: I init most of the matrices to ZERO (see RWKV_Init in https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v2-RNN/src/model.py).\n* You can transfer some parameters from a small model to a large model (note: I sort & smooth them too), for faster and better convergence (see https://www.reddit.com/r/MachineLearning/comments/umq908/r_rwkvv2rnn_a_parallelizable_rnn_with/).\n* My CUDA kernel: https://github.com/BlinkDL/RWKV-CUDA to speedup training." + }, + { + "comment": "Pseudocode outlines the RWKV-v2-RNN architecture, with factors working together to create a time-decay curve. 
The code shows improvements for RWKV-3, such as using different TimeMix factors for R/K/V in SA and FF layers, and applying preLN instead of postLN for stability and faster convergence.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":414-440", + "content": "## The pseudocode (execution from top to bottom):\n![RWKV-v2-RNN](RWKV-v2-RNN.png)\nThe a b c d factors work together to build a time-decay curve: [X, 1, W, W^2, W^3, ...].\nWrite out the formulas for \"token at pos 2\" and \"token at pos 3\" and you will get the idea:\n* a and b: EMAs of kv and k.\n* c and d: these are a and b combined with \"self-attention\".\nkv / k is the memory mechanism. The token with high k can be remembered for a long duration, if W is close to 1 in the channel.\nThe R-gate is important for performance. k = info strength of this token (to be passed to future tokens). r = whether to apply the info to this token.\n## RWKV-3 improvements\nUse different trainable TimeMix factors for R / K / V in SA and FF layers. Example:\n```python\nxx = self.time_shift(x)\nxk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\nxv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\nxr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n```\nUse preLN instead of postLN (more stable & faster convergence):\n```python\nif self.layer_id == 0:" + }, + { + "comment": "This code snippet is part of the RWKV-3 GPT model implementation, which follows a similar structure to a usual preLN GPT. It uses an additional Layer Normalization (LN) after embedding. The input idx represents token indices, and the final output x is the logits. To achieve optimal results, emb should be initialized with small values using nn.init.uniform_(a=-1e-4, b=1e-4). 
Training uses Adam optimizer without weight decay or dropout, and it runs on 8 A100 GPUs with tf32 precision for better performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":441-468", + "content": "\tx = self.ln0(x)\nx = x + self.att(self.ln1(x))\nx = x + self.ffn(self.ln2(x))\n```\n## Explaining the code for RWKV-3 GPT mode\n### The GPT mode - overview\nThe building blocks of RWKV-3 GPT mode are similar to that of a usual preLN GPT.\nThe only difference is an extra LN after embedding. Note you can absorb this LN into the embedding after finishing the training.\n```python\nx = self.emb(idx) # input: idx = token indices\nx = self.ln_emb(x) # extra LN after embedding\nx = x + self.att_0(self.ln_att_0(x)) # preLN\nx = x + self.ffn_0(self.ln_ffn_0(x))\n...\nx = x + self.att_n(self.ln_att_n(x))\nx = x + self.ffn_n(self.ln_ffn_n(x))\nx = self.ln_head(x) # final LN before projection\nx = self.head(x) # output: x = logits\n```\nIt is important to initialize emb to tiny values, such as nn.init.uniform_(a=-1e-4, b=1e-4), to utilize my trick https://github.com/BlinkDL/SmallInitEmb.\nFor the 1.5B RWKV-3, I use Adam (no wd, no dropout) optimizer on 8 * A100 40G.\nbatchSz = 32 * 896, ctxLen = 896. I am using tf32 so the batchSz is a bit small. " + }, + { + "comment": "This code block is implementing the ATT (attention) mechanism in the RWKV-3 model. It mixes the input x with the previous timestep using time_mix values, and then applies separate transformations to k, v, and r before clamping and applying an exponential function to k. 
The code also creates a W-curve for later use in the attention calculation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":470-495", + "content": "For the first 15B tokens, LR is fixed at 3e-4, and beta=(0.9, 0.99).\nThen I set beta=(0.9, 0.999), and do an exponential decay of LR, reaching 1e-5 at 332B tokens.\n### The GPT mode - ATT block\nThe RWKV-3 does not have any attention in the usual sense, but we will call this block ATT anyway.\n```python\nB, T, C = x.size() # x = (Batch,Time,Channel)\n# Mix x with the previous timestep to produce xk, xv, xr\nxx = self.time_shift(x) # self.time_shift = nn.ZeroPad2d((0,0,1,-1))\nxk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\nxv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\nxr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n# Use xk, xv, xr to produce k, v, r\nk = self.key(xk).transpose(-1, -2)\nv = self.value(xv).transpose(-1, -2)\nr = self.receptance(xr)\nk = torch.clamp(k, max=60) # clamp k to avoid overflow\nk = torch.exp(k)\nkv = k * v\n# Compute the W-curve = [e^(-n * e^time_decay), e^(-(n-1) * e^time_decay), ..., 1, e^(time_first)]\nself.time_w = torch.cat([torch.exp(self.time_decay) * self.time_curve.to(x.device), self.time_first], dim=-1)" + }, + { + "comment": "This code implements a time-mixing operation, which involves mixing the key and query with weights calculated based on a time vector. This is done using either the `TimeX` class or convolution operations depending on the device being used. The result is then passed through a sigmoid function and divided by another set of weights calculated in the same manner to obtain the final output. Additionally, an extra receptance gate is added, similar to the ATT block.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":496-527", + "content": "w = torch.exp(self.time_w)\n# Use W to mix kv and k respectively. 
Add K_EPS to wk to avoid divide-by-zero\nif RUN_DEVICE == 'cuda':\n wkv = TimeX.apply(w, kv, B,C,T, 0)\n wk = TimeX.apply(w, k, B,C,T, K_EPS)\nelse:\n w = w[:,-T:].unsqueeze(1)\n wkv = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(kv), w, groups=C)\n wk = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w, groups=C) + K_EPS\n# The RWKV formula\nrwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\nrwkv = self.output(rwkv) # final output projection\n```\nThe self.key, self.receptance, self.output matrices are all initialized to zero.\nThe time_mix, time_decay, time_first vectors are transferred from a smaller trained model (note: I sort & smooth them too).\n### The GPT mode - FFN block\nThe FFN block has three tricks comparing with the usual GPT:\n1. My time_mix trick.\n2. The sqReLU from the Primer paper.\n3. An extra receptance-gate (similar to the receptance-gate in ATT block).\n```python\n# Mix x with the previous timestep to produce xk, xr\nxx = self.time_shift(x)" + }, + { + "comment": "This code performs a FFN operation with time-based mixing and receptance-gate on the input x. The self.value and self.receptance matrices are initialized to zero. It uses relu, sigmoid functions, and matrix multiplication. 
This improves upon GPT's formula by simplifying it and reducing complexity from O(T^2) to a more efficient operation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":528-556", + "content": "xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\nxr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n# The usual FFN operation\nk = self.key(xk)\nk = torch.square(torch.relu(k)) # from the Primer paper\nkv = self.value(k)\n# Apply an extra receptance-gate to kv\nrkv = torch.sigmoid(self.receptance(xr)) * kv\nreturn rkv\n```\nThe self.value, self.receptance matrices are all initialized to zero.\n## RWKV-4 improvements\n![RWKV-v3-plan](RWKV-v3-plan.png)\n## From GPT to RWKV (the formulas)\nLet F[t] be the system state at t.\nLet x[t] be the new external input at t.\nIn GPT, predicting F[t+1] requires considering F[0], F[1], .. F[t]. So it takes O(T^2) to generate a length T sequence.\nThe **simplified formula** for GPT:\n![F[\\mathrm{t}+1]=\\frac{\\sum_{\\mathrm{i}=0}^{\\mathrm{t}} \\exp (\\mathbf{Q}x[\\mathrm{t}] * \\mathbf{K}F[\\mathrm{i}]) \\cdot(\\mathbf{V}F[\\mathrm{i}])}{\\sum_{\\mathrm{i}=0}^{\\mathrm{t}} \\exp (\\mathbf{Q}x[\\mathrm{t}] * \\mathbf{K}F[\\mathrm{i}])}](https://render.githubusercontent.com/render/math?math=%5Ccol" + }, + { + "comment": "This code represents a formula for RWKV's (a language model) time-series prediction. The formula takes the previous timestep's output, as well as several other parameters into account to generate the next output in the series. However, it suggests that this capability may not be fully utilized due to complexities in the loss landscape and limitations of current optimizers. 
It also compares this formula with a simpler version used for RWKV's parallel mode.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":556-562", + "content": "or%7Bblack%7D%5Cdisplaystyle+F%5B%5Cmathrm%7Bt%7D%2B1%5D%3D%5Cfrac%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BQ%7Dx%5B%5Cmathrm%7Bt%7D%5D+%2A+%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+%5Ccdot%28%5Cmathbf%7BV%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BQ%7Dx%5B%5Cmathrm%7Bt%7D%5D+%2A+%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D)\nIt's very capable in theory, however that **does not mean we can fully utilize its capability with usual optimizers**. I suspect the loss landscape is too difficult for our current methods.\nCompare with the **simplified formula** for RWKV (the parallel mode, looks similar to Apple's AFT):\n![F[\\mathrm{t}+1]=\\sigma(\\mathbf{R}x[\\mathrm{t}]) \\cdot \\frac{\\sum_{\\mathrm{i}=0}^{\\mathrm{t}} \\exp (\\mathbf{W} \\cdot(\\mathrm{t}-\\mathrm{i})) \\cdot \\exp (\\mathbf{K}F[\\mathrm{i}]) \\cdot(\\mathbf{V}F[\\mathrm{i}])}{\\sum_{\\mathrm{i}=0}^{\\mathrm{t}} \\exp (\\mathbf{W} \\cdot(\\ma" + }, + { + "comment": "This code is calculating the weighted sum of F[i] for all i from 0 to t, where F[t+1] is the output. The contribution of each F[i] is determined by exp(Q * x[t] * K * F[i]), with R, K, V being trainable matrices and W being a trainable vector. 
This process takes place in GPT.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":562-566", + "content": "thrm{t}-\\mathrm{i})) \\cdot \\exp (\\mathbf{K }F[\\mathrm{i}])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B%5Cmathrm%7Bt%7D%2B1%5D%3D%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29+%5Ccdot+%5Cfrac%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+%5Ccdot%28%5Cmathbf%7BV%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D)\nThe R, K, V are trainable matrices, and W is a trainable vector (time-decay factor for each channel).\nIn GPT, the contribution of F[i] to F[t+1] is weighted by ![ \\exp (\\mathbf{Q}x[\\mathrm{t}] * \\mathbf{K}F[\\mathrm{i}]) ](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%" + }, + { + "comment": "The code represents the contribution of F[i] to F[t+1] in RWKV-2, where the contribution is weighted by a combination of sigmoid function and exponential terms involving time and input values. 
The sigmoid function is non-linear (sigmoid) and the receptance matrix affects the calculation with ![\\sigma(\\mathbf{R}x[\\mathrm{t}])](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":566-571", + "content": "7D%5Cdisplaystyle++%5Cexp+%28%5Cmathbf%7BQ%7Dx%5B%5Cmathrm%7Bt%7D%5D+%2A+%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+).\nIn RWKV-2, the contribution of F[i] to F[t+1] is weighted by ![\\sigma(\\mathbf{R}x[\\mathrm{t}]) \\cdot \\exp (\\mathbf{W} \\cdot(\\mathrm{t}-\\mathrm{i})) \\cdot \\exp (\\mathbf{K}F[\\mathrm{i}]) ](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+).\n* The ![\\sigma](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma) is a non-linearity and we can use sigmoid. \n* Note ![\\sigma(\\mathbf{R}x[\\mathrm{t}])](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29) is not in the denominator, and I call R the \"receptance\".\n* T" + }, + { + "comment": "This code presents a time-decay factor using the exponential function and proposes rewriting it into a Recurrent Neural Network (RNN) formula. 
The RNN formula is defined for F[1] and F[2], involving sigmoid functions, matrices K, R, V, and input x.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":571-577", + "content": "he ![\\exp (\\mathbf{W} \\cdot(\\mathrm{t}-\\mathrm{i}))](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29) is the time-decay factor. I proposed the same idea (scaling the attention by distance) in Aug 2020 and called it the \"time-weighting\" (check the commit history of https://github.com/BlinkDL/minGPT-tuned).\nHere comes the punchline: we can rewrite it into a RNN (recursive formula). Note:\n![F[1]=\\sigma(\\mathbf{R }x[0]) \\cdot \\frac{ \\exp (\\mathbf{K }F[0]) \\cdot(\\mathbf{V }F[0])}{\\exp (\\mathbf{K }F[0])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B1%5D%3D%5Csigma%28%5Cmathbf%7BR+%7Dx%5B0%5D%29+%5Ccdot+%5Cfrac%7B+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29+%5Ccdot%28%5Cmathbf%7BV+%7DF%5B0%5D%29%7D%7B%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29%7D)\n![F[2]=\\sigma(\\mathbf{R }x[1]) \\cdot \\frac{ \\exp (\\mathbf{K }F[1]) \\cdot(\\mathbf{V }F[1])+\\exp " + }, + { + "comment": "This code calculates F[t+1] using a combination of exponentiated matrices and sigmoid function, where A[t] and B[t] are additional terms involved in the computation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":577-581", + "content": "(\\mathbf{W} ) \\cdot \\exp (\\mathbf{K }F[0]) \\cdot(\\mathbf{V }F[0])}{ \\exp (\\mathbf{K }F[1])+\\exp (\\mathbf{W} ) \\cdot \\exp (\\mathbf{K 
}F[0])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B2%5D%3D%5Csigma%28%5Cmathbf%7BR+%7Dx%5B1%5D%29+%5Ccdot+%5Cfrac%7B+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B1%5D%29+%5Ccdot%28%5Cmathbf%7BV+%7DF%5B1%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D+%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29+%5Ccdot%28%5Cmathbf%7BV+%7DF%5B0%5D%29%7D%7B+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B1%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D+%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29%7D)\nTherefore it's straightforward to verify:\n![F[t+1]=\\sigma(\\mathbf{R }x[t]) \\cdot \\frac{\\exp (\\mathbf{K}F[\\mathrm{t}]) \\cdot(\\mathbf{V}F[\\mathrm{t}])+\\exp (\\mathbf{W}) \\cdot A[\\mathrm{t}]}{ \\exp (\\mathbf{K}F[\\mathrm{t}])+\\exp (\\mathbf{W}) \\cdot B[\\mathrm{t}]}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5Bt%2B1%5D%3D%5Csigma%28%" + }, + { + "comment": "This code represents a mathematical equation that involves matrix operations and exponential functions. It uses variables A[t] and B[t] as the numerator and denominator of the previous step, respectively. The code also mentions that RWKV is performant due to its similarity to repeatedly applying a general diagonalizable matrix.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":581-595", + "content": "5Cmathbf%7BR+%7Dx%5Bt%5D%29+%5Ccdot+%5Cfrac%7B%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bt%7D%5D%29+%5Ccdot%28%5Cmathbf%7BV%7DF%5B%5Cmathrm%7Bt%7D%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D%29+%5Ccdot+A%5B%5Cmathrm%7Bt%7D%5D%7D%7B+%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bt%7D%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D%29+%5Ccdot+B%5B%5Cmathrm%7Bt%7D%5D%7D)\nwhere A[t] and B[t] are the numerator and denominator of the previous step, respectively.\nI believe RWKV is performant because W is like repeatedly applying a diagonal matrix. 
Note (P^{-1} D P)^n = P^{-1} D^n P, so it is similar to repeatedly applying a general diagonalizable matrix.\nMoreover it's possible to turn it into a continuous ODE (a bit similar to State Space Models). I will write about it later.\n## Star History\n[![Star History Chart](https://api.star-history.com/svg?repos=BlinkDL/RWKV-LM&type=Date)](https://star-history.com/#BlinkDL/RWKV-LM&Date)\n## Multimodal ideas\nI have an idea for [text --> 32x32 RGB image] using a LM (transformer, RWKV, etc.). Will test it soon." + }, + { + "comment": "This code discusses using a language model for image processing tasks. It employs LM loss, color quantization to reduce vocabulary size, 2D positional embeddings, and random rounding in data loading for more efficient image processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":597-608", + "content": "Firstly, LM loss (instead of L2 loss), so the image will not be blurry.\nSecondly, color quantization. For example, only allowing 8 levels for R/G/B. Then the image vocab size is 8x8x8 = 512 (for each pixel), instead of 2^24.\nTherefore, a 32x32 RGB image = a len1024 sequence of vocab512 (image tokens), which is a typical input for usual LMs.\n(Later we can use diffusion models to upsample and generate RGB888 images. We might be able to use a LM for this too.)\nThirdly, 2D positional embeddings that are easy for the model to understand.\nFor example, add one-hot X & Y coords to the first 64(=32+32) channels. Say if the pixel is at x=8, y=20, then we will add 1 to channel 8 and channel 52 (=32+20).\nMoreover probably we can add the float X & Y coords (normalized to 0~1 range) to another 2 channels. And other periodic pos. encoding might help too (will test). \nFinally, RandRound when doing the color quantization in the DataLoader.\nFor example, if the float level is 4.578, then there is a 57.8% chance to use 5, and (1-57.8%) chance to use 4." 
+ }, + { + "comment": "This code snippet discusses a method for sampling a large dataset to train the model. It suggests picking a prime number p just less than x (total chunks), ensuring that p = 2 (mod 3). The author then recommends using (step * step * step) mod p for sampling and adds some randomness by adjusting the step value. This method aims to sample the dataset deterministically while maintaining randomness.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":609-626", + "content": "And we can allow both 4 and 5 in the prediction, but the loss will be higher if the prediction is 4.\nMulti-task training might help too. I will try this dataset format:\n[TxtFirst] [Desc of Img (txt tokens)] [Img] [img tokens]\nand sometimes\n[ImgFirst] [img tokens] [Txt] [Desc of Img (txt tokens)]\n... the order of the imgs should be randomized in the DataLoader, and [TxtFirst] [ImgFirst] [Img] [Txt] are special tokens\nand do random sampling of the full dataset. So sometimes the model will see the img tokens first and then the corresponding txt tokens, which is a [img -> txt] task. And the model will see some partial imgs and partial txts. I think a char-level LM might help the model to write correct text on images.\n## How to sample a large dataset (for training)\nI am using a trick to sample the Pile deterministically yet randomly enough.\nLet's say the pile has x chunks (a chunk = ctx_len tokens).\npick a prime number p just less than x, and make sure p = 2 (mod 3).\nUse (step * step * step) mod p to sample it. Add some bias to step for extra randomness." + }, + { + "comment": "This code discusses two methods: top-p-x sampling for inference and a new method to find better learning rate schedules using the variational method of loss curve. The top-p-x sampling is similar to top-p but keeps all tokens with probability > x, and suggests trying x=0.01. 
For learning rates, the code proposes an efficient method to predict loss curves and fit parameters, suggesting a fixed-then-decay LR schedule.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":628-644", + "content": "## The top-p-x sampling method (for inference)\nWe propose a new sampling method called top-p-x:\nit's like top-p, and the only difference is you also keep all tokens whose prob > x.\nTry x = 0.01 first.\n## Better Learning Rate Schedule via Variantional Method of Loss Curve\nI propose a simple new method to find better LR schedules. The method is cost-efficient and practical for large LMs. The takeaway is we can model the loss curve dynamics (phenomenology) w.r.t. the LR, and a nice closed-form LR curve can be directly computed from it using variantional method. Moreover we can predict the final loss with reasonable accuracy.\nUPDATE: In \"Conclusion 1.\", use the best-fitting regime (ignore the initial steps where our approximations break down) to fit the parameters.\nTry this: fixed lr for 1 hr, then exponential decay to 0.2 * lr in 12 hrs, and choose the t=[1hr, 13hr] segment.\nIn the last three plots, black = predicted loss curve of the new LR schedule, blue = original (unoptimized) real loss curve, orange = new LR schedule." + }, + { + "comment": "This code describes the architecture of RWKV v1 language model, which consists of alternating time-mix and channel-mix layers. The time-mix layer uses sigmoid activation function with a weighted sum of channel inputs, while the channel-mix layer employs sigmoid activation with a weighted sum of different dimensions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":646-655", + "content": "![better_lr_schedule](Research/better_lr_schedule.png)\n# RWKV v1\nWe propose the RWKV language model, with alternating time-mix and channel-mix layers:\n\n* The R, K, V are generated by linear transforms of input, and W is parameter. 
The idea of RWKV is to decompose attention into R(target) * W(src, target) * K(src). So we can call R \"receptance\", and sigmoid means it's in 0~1 range.\n* The Time-mix is similar to AFT (https://arxiv.org/abs/2105.14103). There are two differences.\n(1) We changed the normalization (denominator). For masked language models, we define:\n\n**(UPDATE: We are using the original AFT normalization in v2)**\nInitialize K and R matrices (and the output projection matrix) to ZERO for fast & stable convergence.\n(2) We decompose W_{t,u,c} and introduce multi-head W (here h is the corresponding head of c):\n\nMoreover we multiply the final output of Time-mix layer by \u03b3(t). The reason for the \u03b1 \u03b2 \u03b3 factors, is because the context size is smaller when t is small, and this can be compensated using the \u03b1 \u03b2 \u03b3 factors.\n**(UPDATE: We remove \u03b1 \u03b2 \u03b3 factors in v2-RNN and restrict W to be of a simple form and hence able to rewrite it as RNN)**\n* The Channel-mix is similar to GeGLU (https://arxiv.org/abs/2002.05202) with an extra R factor. Initialize R and W matrices to ZERO for fast & stable convergence." + }, + { + "comment": "This code adds token-shift (time-shift mixing) to the model as in minGPT-tuned, which divides channel sizes by 2 and shifts them. This improves performance for char-level English and Chinese LMs but requires larger embedding sizes (at least 1024) for BPE-level English LMs. 
The shifted channels help the model focus on collecting previous context information.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":686-710", + "content": "* Finally, we add extra token-shift (time-shift mixing) as in (https://github.com/BlinkDL/minGPT-tuned).\n# Token-shift (time-shift mixing)\nThe token-shift explicitly uses (half the channels of this token) & (half the channels of prev token) to generate all vectors (QKV, RWKV, ...).\n```\nself.time_shift = nn.ZeroPad2d((0,0,1,-1))\nx = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1)\n```\nDividing channels by 2 and shift-1 works great for char-level English and char-level Chinese LM.\nHowever for BPE-level English LM, it's only effective if your embedding is large enough (at least 1024 - so the usual small L12-D768 model is not enough).\nMy theory on the effectiveness of token-shift:\nWhen we train a GPT, the hidden representation of a token has to accomplish two different objects:\n1. Predict the next token. Sometimes this is easy (obvious next token).\n2. Collect all previous context info, so later tokens can use it. This is always hard.\nThe shifted channels can focus on (2), so we ha" + }, + { + "comment": "This code is implementing the Head-QK trick in a transformer model. It adds extra Q and K projections to the final output, allowing the model to directly copy or avoid tokens in the context. The learned weights can be used for named entity recognition. Token-shift is used in the self-attention mechanism, and less mixing may be required in higher layers. The MHA_pro model in this repo demonstrates strong performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":710-723", + "content": "ve good propagation of info. It's like some kind of residual connection, or a small RNN inside the transformer.\nYou can use token-shift in usual QKV self-attention too. 
I looked at the weights, and found V really likes the shifted channels, less so for Q. Makes sense if you think about it. I also found you may want to use less mixing in higher layers.\np.s. There is a MHA_pro model in this repo with strong performance. Give it a try :)\n# The Head-QK Trick: learning to copy and avoid tokens\nIn usual transformer, a small model has difficulty copying tokens (such as person names) in the context. We add extra Q & K to the final output such that the model can directly copy (or avoid) tokens in the context. Afterwards the model will teach itself NER (named entity recognition) if you look at the learned weights.\n```\nq = self.head_q(x)[:,:T,:] # projecting to 256-d\nk = self.head_k(x)[:,:T,:] # projecting to 256-d\nc = (q @ k.transpose(-2, -1)) * (1.0 / 256)\nc = c.masked_fill(self.copy_mask[:T,:T] == 0, 0)" + }, + { + "comment": "The code calculates the one-hot representation of an index and multiplies it with the previous value 'c'. It then adds this product to 'x', which is processed by the head module. The resulting value is stored in 'c' for further processing.\n\nThe top-a sampling method involves finding the maximum probability, removing probabilities lower than a threshold (0.2 * pow(max_prob, 2)), and allowing more or fewer possibilities depending on the max_prob value. This improves accuracy by focusing on the most likely tokens while still considering other alternatives.\n\nThe code removes tokens with probabilities below the limit from the logits, effectively ignoring them during the computation. 
The threshold used here is 0.02 times the squared maximum probability.\n\nThe performance results show that the character-level loss is improved on the simplebooks-92 dataset using this method.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":724-753", + "content": "c = c @ F.one_hot(idx, num_classes = self.config.vocab_size).float() \nx = self.head(x) + c\n```\nNote: when a token occurs multiple times in the context, it might be better to use max(prob) instead of sum(prob).\n# The top-a sampling method\nWe also propose a new sampling method called top-a (as in src/utils.py):\n(1) Find the max probability p_max after softmax.\n(2) Remove all entries whose probability is lower than 0.2 * pow(p_max, 2). So it's adaptive, hence \"top-a\".\n(3) Feel free to tune the 0.2 and 2 factor. Tune 0.2 first.\nThe idea of top-a:\n1. If max_prob=0.9, then remove all tokens with prob < 0.162 (so, removing all alternatives)\n2. If max_prob=0.5, then remove all tokens with prob < 0.05 (so, allowing more choices)\n3. If max_prob=0.1, then remove all tokens with prob < 0.002 (so, allowing lots of possibilities)\n```\nprobs = F.softmax(logits, dim=-1)\nlimit = torch.pow(torch.max(probs), 2) * 0.02\nlogits[probs < limit] = -float('Inf')\n```\n# Performance\nCharacter-level loss on simplebooks-92 dataset https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip" + }, + { + "comment": "The code shows a comparison of different attention mechanisms in a language model, with RWKV having better performance and VRAM efficiency. It references a specific software and discusses the importance of careful initialization for fast convergence.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/README.md\":755-784", + "content": "![RWKV-vs-MHA](RWKV-vs-MHA.png)\nGray: usual MHA+Rotary+GeGLU - performance not as good. 17.2M params.\nRed: RWKV (\"linear\" attention) - VRAM friendly - quite faster when ctx window is long - good performance. 
16.6M params.\nGreen: MHA+Rotary+GeGLU+Token_shift. 17.2M params.\nBlue: MHA_pro (MHA with various tweaks & RWKV-type-FFN) - slow - needs more VRAM - good performance. 16.6M params.\n```\n@software{peng_bo_2021_5196578,\n author = {PENG Bo},\n title = {BlinkDL/RWKV-LM: 0.01},\n month = aug,\n year = 2021,\n publisher = {Zenodo},\n version = {0.01},\n doi = {10.5281/zenodo.5196577},\n url = {https://doi.org/10.5281/zenodo.5196577}\n}\n```\n# Initialization\nWe use careful initialization for RWKV to get fast convergence - orthogonal matrices with proper scaling, and special time_w curves. Check model.py for details.\nSome learned time_w examples:\n![RWKV-time-w](RWKV-time-w.png)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/16bad05c-b021-452b-9980-04fe0cc0000c.json b/docs/doc/16bad05c-b021-452b-9980-04fe0cc0000c.json new file mode 100644 index 00000000..c289a23d --- /dev/null +++ b/docs/doc/16bad05c-b021-452b-9980-04fe0cc0000c.json @@ -0,0 +1,40 @@ +{ + "summary": "The Trainer class enables CUDA optimizations and handles data loading for training a language model. It iterates over the data, updates parameters, decays learning rate if necessary, logs progress, saves checkpoints, and manages learning rate decay with a tokens counter.", + "details": [ + { + "comment": "The code imports necessary libraries for training a language model, sets some default hyperparameters such as maximum epochs and batch size, and creates a TrainerConfig class to manage these settings. 
The code also enables CUDA optimizations and opens a log file for output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":0-35", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.data.dataloader import DataLoader\nfrom torch.optim.lr_scheduler import LambdaLR\nfrom torch.nn import functional as F\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch\nfrom tqdm.auto import tqdm\nimport numpy as np\nimport logging\nimport os\nimport datetime\nimport sys\nimport math\n# import wandb # comment this if you don't have wandb\n# print('logging to wandb... (comment it if you don\\'t have wandb)')\nlogger = logging.getLogger(__name__)\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nlog_file = open(\"mylog.txt\", \"a\")\nclass TrainerConfig:\n max_epochs = 10\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0" + }, + { + "comment": "This code initializes a Trainer class with parameters for model, train and test datasets, and config. It also includes options for learning rate decay, warmup and final tokens, epoch save frequency, number of data loader workers, and device usage. 
The code checks if the WandB module is available and sets up wandb initialization with combined configuration from model and user input.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":36-66", + "content": " lr_decay = True # linear warmup followed by cosine decay\n warmup_tokens = 0\n final_tokens = 0\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Trainer:\n def __init__(self, model, train_dataset, test_dataset, config):\n self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.steps = 0\n if 'wandb' in sys.modules:\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' +\n datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)\n self.device = 'cpu'\n if torch.cuda.is_available(): # take over whatever gpus are on the system" + }, + { + "comment": "The code initializes the device for CUDA operations, generates a unique run name based on model configuration, and defines a function to run an epoch. 
It also configures optimizers according to the model's parameters and handles data loading for training and testing with specified configurations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":67-91", + "content": " self.device = torch.cuda.current_device()\n def get_run_name(self):\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \\\n cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd)\n return run_name\n def train(self):\n model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n if config.num_workers > 0:\n loader = DataLoader(data, shuffle=False, pin_memory=True,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n else:" + }, + { + "comment": "This code initializes a DataLoader for loading data in batches, creating a progress bar (pbar) to track progress, and iterates over the data. 
It then places the data on the correct device, forwards the model, computes loss, backpropagates if training, updates parameters, clips gradients if necessary, and optionally decays learning rate based on progress.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":92-116", + "content": " loader = DataLoader(data, shuffle=False,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n pbar = tqdm(enumerate(loader), total=len(\n loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n for it, (x, y) in pbar:\n x = x.to(self.device) # place data on the correct device\n y = y.to(self.device)\n with torch.set_grad_enabled(is_train):\n _, loss = model(x, y) # forward the model\n if is_train: # backprop and update the parameters\n model.zero_grad()\n loss.backward()\n if config.grad_norm_clip > 0:\n torch.nn.utils.clip_grad_norm_(\n model.parameters(), config.grad_norm_clip)\n optimizer.step()\n if config.lr_decay: # decay the learning rate based on our progress" + }, + { + "comment": "This code segment is responsible for determining the learning rate multiplier during training. It first checks if the number of processed tokens is less than the warmup token count. If so, it performs linear warmup by interpolating between the final learning rate and the initial learning rate based on the proportion of processed tokens to warmup tokens. Otherwise, it calculates the exponential learning rate decay by using a progress variable representing the fraction of completed tokens beyond the warmup phase.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":117-132", + "content": " # number of tokens processed this step (i.e. 
label is not -100)\n self.tokens += (y >= 0).sum()\n lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + \\\n (1 - lr_final_factor) * float(self.tokens) / \\\n float(config.warmup_tokens)\n progress = 0\n else:\n # exponential learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))\n if progress >= 1:\n lr_mult = lr_final_factor\n else:\n lr_mult = math.exp(math.log(lr_final_factor) * pow(progress, 1))" + }, + { + "comment": "This code updates the learning rate (lr) based on a config file and adjusts the loss, average loss, and progress during training. It also logs the loss to WandB and updates the progress bar description with relevant information like loss, lr, etc.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":133-154", + "content": " lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n else:\n lr = config.learning_rate\n now_loss = loss.item() # report progress\n self.lr = lr\n if 'wandb' in sys.modules:\n wandb.log({\"loss\": now_loss},\n step=self.steps * self.config.batch_size)\n self.steps += 1\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * \\\n (1.0 - factor) + now_loss * factor\n pbar.set_description(\n f\"mini-epoch {epoch+1} prog {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}\")" + }, + { + "comment": "This code initializes a tokens counter for learning rate decay, trains the model for specified epochs, logs progress, and saves model checkpoints at user-specified intervals or at the end of training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/trainer.py\":156-170", + "content": " self.tokens = 0 # counter used for learning rate decay\n for 
epoch in range(config.max_epochs):\n run_epoch('train')\n log_file.write(\n f'{epoch+1} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} \\n')\n log_file.flush()\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n # DataParallel wrappers keep raw model object in .module\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n torch.save(raw_model.state_dict(),\n self.config.epoch_save_path + str(epoch+1) + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/180ef764-b7e6-4ef2-ae6b-1e1e4d1997a4.json b/docs/doc/180ef764-b7e6-4ef2-ae6b-1e1e4d1997a4.json new file mode 100644 index 00000000..979ca606 --- /dev/null +++ b/docs/doc/180ef764-b7e6-4ef2-ae6b-1e1e4d1997a4.json @@ -0,0 +1,75 @@ +{ + "summary": "This code utilizes PyTorch and CUDA to implement RWKV Language Model and GPT model with time-based mixing, layer normalization, attention heads, Adam optimizer, weight decay, and efficient attention retrieval for forward computations.", + "details": [ + { + "comment": "The code is for the RWKV Language Model, implemented in PyTorch with CUDA kernel. It defines constants such as `RWKV_K_CLAMP`, `RWKV_K_EPS`, and `RWKV_HEAD_QK_DIM`. 
The CUDA kernel limits the maximum context length to 1024, allows grouping for forward and backward passes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":0-24", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.cpp_extension import load\nimport math\nimport numpy as np\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nlogger = logging.getLogger(__name__)\nRWKV_K_CLAMP = 60 # e^60 = 1e26\nRWKV_K_EPS = 1e-8\nRWKV_HEAD_QK_DIM = 256\nprint(f'\\nRWKV_K_CLAMP {RWKV_K_CLAMP} RWKV_K_EPS {RWKV_K_EPS} RWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\\n')\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = 1024 # increase this if your ctx_len > 1024\nB_GROUP_FORWARD = 4 # set to 8 for best performance\nB_GROUP_BACKWARD = 2 # set to 2 for best performance (sometimes 8 is faster)" + }, + { + "comment": "This code defines a TimeX class that extends torch.autograd.Function for the TimeX operation. It includes a forward method to calculate the output and a backward method for gradient computation. The class initializes context variables (B, C, T) based on input arguments, performs assertions on T, B, and checks contiguity of input tensors. It then calls the timex_cuda.forward function with saved tensors, an empty tensor for wk output, and other input arguments. The backward method performs similar assertions and uses saved tensors and gradient input gwk to calculate gradients for w and k. 
T_MAX, B_GROUP_FORWARD, and B_GROUP_BACKWARD are constants used in assertions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":26-49", + "content": "timex_cuda = load(name=\"timex\", sources=[\"cuda/timex_op.cpp\", \"cuda/timex_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}', f'-DBF={B_GROUP_FORWARD}', f'-DBB={B_GROUP_BACKWARD}'])\nclass TimeX(torch.autograd.Function):\n @staticmethod\n def forward(ctx, w, k, B, C, T, eps):\n ctx.B = B\n ctx.C = C\n ctx.T = T\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w = w.contiguous()\n k = k.contiguous()\n ctx.save_for_backward(w, k)\n wk = torch.empty((B, C, T), device='cuda',\n memory_format=torch.contiguous_format)\n timex_cuda.forward(w, k, wk, eps, B, C, T)\n return wk\n @staticmethod\n def backward(ctx, gwk):\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w, k = ctx.saved_tensors\n gw = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda'," + }, + { + "comment": "This code snippet initializes a model's weights using RWKV-v3's time and channel mixing techniques. It creates empty tensors, performs backward calculations on the weight matrix (w) and key matrix (k), and returns gradients for further processing. 
The `RWKV_Init` function initializes the linear and embedding layers of a module using specific naming conventions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":50-68", + "content": " memory_format=torch.contiguous_format)\n gk = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda',\n memory_format=torch.contiguous_format)\n timex_cuda.backward(w, k, gwk.contiguous(), gw,\n gk, ctx.B, ctx.C, ctx.T)\n return (gw.sum(dim=0), gk, None, None, None, None)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\ndef RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module\n for m in module.modules():\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n with torch.no_grad():\n name = '[unknown weight]'\n for name, parameter in module.named_parameters(): # find the name of the weight\n if id(m.weight) == id(parameter):" + }, + { + "comment": "This code checks the type of a layer in the neural network model and applies different weight initialization depending on its type. If it is an Embedding or Linear layer, it adjusts the gain and scale accordingly. If it has a scale_init attribute, that value is used for initialization. 
If scale is -999, it initializes weights with identity matrix.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":69-97", + "content": " break\n shape = m.weight.data.shape\n gain = 1.0\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb?\n scale = 1e-4\n else:\n scale = 0\n if isinstance(m, nn.Linear):\n if m.bias is not None:\n m.bias.data.zero_()\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection?\n scale = 0.5\n if hasattr(m, 'scale_init'):\n scale = m.scale_init\n # print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name)\n gain *= scale\n if scale == -999:\n nn.init.eye_(m.weight)" + }, + { + "comment": "This code initializes the weights of matrices in the RWKV_TimeMix module based on a gain value. If the gain is zero, it sets the weights to zero. If the gain is positive, it uses orthogonal initialization with the specified gain. Otherwise, it uses normal initialization with a mean of 0 and a negative standard deviation. The class RWKV_TimeMix is a custom module that takes a configuration and layer ID as inputs and performs time-based curve operations for attention scores. 
It also initializes a time_curve tensor and calculates a time_decay based on the layer ID.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":98-126", + "content": " elif gain == 0:\n # zero init is great for some RWKV matrices\n nn.init.zeros_(m.weight)\n elif gain > 0:\n nn.init.orthogonal_(m.weight, gain=gain)\n else:\n nn.init.normal_(m.weight, mean=0.0, std=-scale)\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_embd = config.n_embd\n attn_sz = config.n_embd\n with torch.no_grad(): # fancy init\n self.time_curve = torch.tensor([-(config.ctx_len - 2 - i) for i in range(config.ctx_len-1)]).unsqueeze(0)\n self.time_curve = self.time_curve.to('cuda')\n ratio_0_to_1 = (layer_id / (config.n_layer - 1)) # 0 to 1\n ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n # fancy time_decay\n decay_speed = torch.ones(attn_sz, 1)\n for h in range(attn_sz):\n decay_speed[h][0] = -5 + 8 * (h / (attn_sz-1)) ** (0.7 + 1.3 * ratio_0_to_1)" + }, + { + "comment": "This code initializes and sets various parameters for an attention mechanism in a model. It includes time-related parameters such as `time_decay`, `time_first`, `time_mix_k`, `time_mix_v`, `time_mix_r` and `time_shift`. The `key`, `value`, and `receptance` layers are also defined, each with a specified number of input/output dimensions. 
These parameters will be used to calculate attention scores between queries and keys, allowing for more effective information retrieval from the input sequence.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":127-147", + "content": " self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n # fancy time_first\n zigzag = (torch.tensor([(i+1)%3 - 1 for i in range(attn_sz)]) * 0.5).unsqueeze(1)\n self.time_first = nn.Parameter(torch.ones(attn_sz, 1) * math.log(0.3) + zigzag)\n # fancy time_mix\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(x, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(x, 0.5 * ratio_1_to_almost0))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.key = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.value = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False)" + }, + { + "comment": "This code initializes the model and defines the forward function, which performs time-mixing operations on input data, then uses this mixed data to produce attention keys, values, and receptances. These are then clamped and transformed into exponential form before being combined in a weighted sum. 
A W-curve is also calculated for some future use that may eliminate the need for clamping.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":149-174", + "content": " self.output = nn.Linear(attn_sz, config.n_embd, bias=False)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n # Mix x with the previous timestep to produce xk, xv, xr\n xx = self.time_shift(x) # self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n # Use xk, xv, xr to produce k, v, r\n k = self.key(xk).transpose(-1, -2)\n v = self.value(xv).transpose(-1, -2)\n r = self.receptance(xr)\n # RWKV_K_CLAMP can be removed if the CUDA kernel substracts the correct k_max for each k (I will do this later)\n k = torch.clamp(k, max=RWKV_K_CLAMP) # clamp k to avoid overflow\n k = torch.exp(k)\n kv = k * v\n # Compute the W-curve = [e^(-n * e^time_decay), e^(-(n-1) * e^time_decay), ..., 1, e^(time_first)]" + }, + { + "comment": "This code defines a RWKV_ChannelMix module that performs time-based mixing of kv and k tensors using the TimeX function. It also calculates wkv and wk tensors, applies sigmoid function, and outputs the resulting rwkv tensor. The class inherits from nn.Module and initializes with configuration parameters and layer ID. It includes a time_shift operation and sets fancy init for time_mix_k using torch.pow.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":175-203", + "content": " self.time_w = torch.cat(\n [torch.exp(self.time_decay) * self.time_curve, self.time_first], dim=-1)\n w = torch.exp(self.time_w)\n # Use W to mix kv and k respectively. 
Add K_EPS to wk to avoid divide-by-zero\n wkv = TimeX.apply(w, kv, B, C, T, 0)\n # RWKV_K_EPS can be removed if the CUDA kernel sets 0/0 = 0 (I will do this later)\n wk = TimeX.apply(w, k, B, C, T, RWKV_K_EPS)\n rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\n rwkv = self.output(rwkv)\n return rwkv\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix\n ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))" + }, + { + "comment": "This code is for a GPT model implementation with time-based mixing. It initializes parameters, defines forward pass calculations, and contains a configuration class for the model. The model uses time shifting, mixing, key/value calculations, and sigmoid activation functions in its operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":204-232", + "content": " self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n hidden_sz = 4 * config.n_embd\n self.key = nn.Linear(config.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, config.n_embd, bias=False)\n self.value.scale_init = 0\n self.receptance.scale_init = 0\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv\n return rkv\n########################################################################################################\n# The GPT Model with our 
blocks\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):" + }, + { + "comment": "This code defines a class for the RWKV model's Block, which is an instance of nn.Module. It contains layer normalization layers and two modules: RWKV_TimeMix and RWKV_ChannelMix. The forward function applies these modules sequentially to input x after layer normalization. If this is the first block (layer_id == 0), it also includes an additional layer normalization and, if a specific model type is specified, applies the ffnPre module before the other modules.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":233-265", + "content": " self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(config, layer_id+1000)\n else:\n self.att = RWKV_TimeMix(config, layer_id)\n self.ffn = RWKV_ChannelMix(config, layer_id)\n def forward(self, x):\n if self.layer_id == 0:\n x = self.ln0(x) \n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x)) # better in some cases\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))" + }, + { + "comment": "The code defines a class called \"GPT\" which inherits from nn.Module and initializes its parameters based on the given configuration. It includes an embedding layer, multiple blocks, a normalization layer, and different linear layers for output. 
If RWKV_HEAD_QK_DIM is greater than 0, it also initializes extra head layers for Q and K. The code ends by printing the total number of parameters in the model and logging it.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":266-296", + "content": " return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.step = 0\n self.config = config\n self.emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i)\n for i in range(config.n_layer)])\n self.ln_out = nn.LayerNorm(config.n_embd)\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0\n self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n RWKV_Init(self, config)\n logger.info(\"number of parameters: %e\", sum(p.numel()\n for p in self.parameters()))" + }, + { + "comment": "This code defines a model with functions for getting the context length, initializing weights, and configuring optimizers. 
It separates parameters into those subject to weight decay and those not, and ensures no parameter is included in both sets.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":298-323", + "content": " def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, (nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=1e-5)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n # separate out all parameters to those that will and won't experience regularizing weight decay\n decay = set()\n no_decay = set()\n for mn, m in self.named_modules(): # here we disable weight_decay\n for pn, p in m.named_parameters():\n fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n no_decay.add(fpn)\n param_dict = {pn: p for pn, p in self.named_parameters()}\n inter_params = decay & no_decay\n union_params = decay | no_decay\n assert len(\n inter_params) == 0, \"parameters %s made it into both decay/no_decay sets!\" % (str(inter_params), )" + }, + { + "comment": "This code defines a model and an optimization function. The model has embedding layers, blocks, and a layer normalization layer. It also includes optional attention heads for query-key calculations. The forward function performs the model's computations based on input indexes and optionally produces output from attention heads. 
The optimizer sets up the Adam optimizer for training with specified learning rate and betas.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":324-353", + "content": " assert len(param_dict.keys() - union_params) == 0, \"parameters %s were not separated into either decay/no_decay set!\" \\\n % (str(param_dict.keys() - union_params), )\n optim_groups = [\n {\"params\": [param_dict[pn]\n for pn in sorted(list(no_decay))], \"weight_decay\": 0.0},\n ]\n optimizer = torch.optim.Adam(\n optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n self.step += 1\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).float()" + }, + { + "comment": "This code calculates the output of a model and optionally computes a cross-entropy loss if targets are provided. If targets are None, it just returns the output. 
The head layer is used to process the input 'x'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/src/model.py\":354-362", + "content": " x = self.head(x) + c\n else:\n x = self.head(x)\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1))\n return x, loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/18983925-68ef-49f4-8834-5412c211dd56.json b/docs/doc/18983925-68ef-49f4-8834-5412c211dd56.json new file mode 100644 index 00000000..532d74b2 --- /dev/null +++ b/docs/doc/18983925-68ef-49f4-8834-5412c211dd56.json @@ -0,0 +1,125 @@ +{ + "summary": "This code utilizes DeepSpeed and implements a transformer layer with CUDA operations. It creates RWKV model architecture, initializes an optimizer, and optimizes memory usage for transformer models.", + "details": [ + { + "comment": "Importing necessary libraries, setting profiling mode for torch.jit, defining custom modules and functions, integrating DeepSpeed library if available, and configuring environment variables for testing and JIT usage.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":0-30", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os, math, gc, importlib\nimport torch\n# torch._C._jit_set_profiling_executor(True)\n# torch._C._jit_set_profiling_mode(True)\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\nfrom pytorch_lightning.strategies import DeepSpeedStrategy\nif importlib.util.find_spec('deepspeed'):\n import deepspeed\n from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n# from deepspeed.runtime.fp16.onebit.zoadam 
import ZeroOneAdam\ntry:\n print('RWKV_MY_TESTING', os.environ[\"RWKV_MY_TESTING\"])\nexcept:\n os.environ[\"RWKV_MY_TESTING\"] = ''\ndef __nop(ob):\n return ob\nMyModule = nn.Module\nMyFunction = __nop\nif os.environ[\"RWKV_JIT_ON\"] == \"1\":" + }, + { + "comment": "This code imports a CUDA kernel and uses it in the WKV_5 class, which performs operations on tensors of specific data types and dimensions. It asserts the input tensor sizes to ensure they match the expected format and checks the value of HEAD_SIZE to be equal to C divided by H. The code then instantiates the wkv5 CUDA kernel from the \"wkv5\" module with specific sources, flags, and verbose settings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":31-54", + "content": " MyModule = torch.jit.ScriptModule\n MyFunction = torch.jit.script_method\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nfrom torch.utils.cpp_extension import load\nHEAD_SIZE = int(os.environ[\"RWKV_HEAD_SIZE_A\"])\nwkv5_cuda = load(name=\"wkv5\", sources=[\"cuda/wkv5_op.cpp\", f\"cuda/wkv5_cuda.cu\"],\n verbose=True, extra_cuda_cflags=[\"-res-usage\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-D_N_={HEAD_SIZE}\"])\nclass WKV_5(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, H, r, k, v, w, u):\n with torch.no_grad():\n assert r.dtype == torch.bfloat16\n assert k.dtype == torch.bfloat16\n assert v.dtype == torch.bfloat16\n assert w.dtype == torch.bfloat16\n assert u.dtype == torch.bfloat16\n assert HEAD_SIZE == C // H" + }, + { + "comment": "This code appears to be part of a model's forward and backward pass implementation. 
In the forward pass, it initializes variables for batch size (B), sequence length (T), feature dimensions (C), hidden state dimension (H), and tensors r, k, v, eew, ew, u, and y. It then calls a forward method to compute y using these variables and returns y. In the backward pass, it checks the gradient tensor gy's type and shape, retrieves saved tensors from the context, and initializes an empty tensor gr of the same shape as gy for gradients computation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":55-81", + "content": " ctx.B = B\n ctx.T = T\n ctx.C = C\n ctx.H = H\n assert r.is_contiguous()\n assert k.is_contiguous()\n assert v.is_contiguous()\n assert w.is_contiguous()\n assert u.is_contiguous()\n ew = (-torch.exp(w.float())).contiguous()\n eew = (torch.exp(ew)).contiguous()\n ctx.save_for_backward(r, k, v, eew, ew, u)\n y = torch.empty((B, T, C), device=r.device, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.forward(B, T, C, H, r, k, v, eew, u, y)\n return y\n @staticmethod\n def backward(ctx, gy):\n with torch.no_grad():\n assert gy.dtype == torch.bfloat16\n B = ctx.B\n T = ctx.T\n C = ctx.C\n H = ctx.H\n assert gy.is_contiguous()\n r, k, v, eew, ew, u = ctx.saved_tensors\n gr = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)" + }, + { + "comment": "This code is initializing various tensors for model parameters, and then calling a function to apply CUDA operations on these tensors. The function RUN_CUDA_RWKV5 calls the apply method of class WKV_5 with given input dimensions and parameters. 
It returns the gradients of the input parameters after applying the CUDA operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":82-94", + "content": " gk = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gv = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gw = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gu = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.backward(B, T, C, H, r, k, v, eew, ew, u, gy, gr, gk, gv, gw, gu)\n gw = torch.sum(gw, 0).view(H, C//H)\n gu = torch.sum(gu, 0).view(H, C//H)\n return (None, None, None, None, gr, gk, gv, gw, gu)\ndef RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w, u):\n return WKV_5.apply(B, T, C, H, r, k, v, w, u)\n########################################################################################################" + }, + { + "comment": "This code defines a class for RWKV_TimeMix_RWKV5, a type of MyModule. It initializes instance variables based on input arguments and asserts the head size consistency. 
The time_mix parameters are calculated using layer ID, number of layers, and embedding dimension.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":96-118", + "content": "class RWKV_TimeMix_RWKV5(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.head_size = args.head_size_a\n assert HEAD_SIZE == self.head_size # change HEAD_SIZE to match args.head_size_a\n self.n_head = args.dim_att // self.head_size\n assert args.dim_att % self.n_head == 0\n self.head_size_divisor = args.head_size_divisor\n with torch.no_grad():\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))" + }, + { + "comment": "This code is initializing parameters for a transformer layer in RWKV-v5. It defines time_mix_g, time_decay, and time_faaaa as learnable parameters. 
The time_decay and time_faaaa parameters control the time-based attention mechanism, while time_shift is a zero padding operator and receptance and key are linear layers for processing the input embeddings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":119-137", + "content": " self.time_mix_g = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n # fancy time_decay\n decay_speed = torch.ones(args.dim_att)\n for n in range(args.dim_att):\n decay_speed[n] = -6 + 5 * (n / (args.dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed.reshape(self.n_head, self.head_size))\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n tmp = torch.zeros(args.dim_att)\n for n in range(args.dim_att):\n zigzag = ((n + 1) % 3 - 1) * 0.1\n tmp[n] = ratio_0_to_1 * (1 - (n / (args.dim_att - 1))) + zigzag\n self.time_faaaa = nn.Parameter(tmp.reshape(self.n_head, self.head_size))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)" + }, + { + "comment": "This code defines a class with three linear layers for key, value, and gate calculations. It also includes a GroupNorm layer and two JIT functions for forward pass and normalization. 
The functions apply time shifting, calculate key, value, and receptance, and perform normalization using GroupNorm.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":139-168", + "content": " self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n self.gate = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.ln_x = nn.GroupNorm(self.n_head, args.dim_att)\n @MyFunction\n def jit_func(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xg = x * self.time_mix_g + xx * (1 - self.time_mix_g)\n r = self.receptance(xr)\n k = self.key(xk)\n v = self.value(xv)\n g = F.silu(self.gate(xg))\n return r, k, v, g\n @MyFunction\n def jit_func_2(self, x, g):\n B, T, C = x.size()\n x = x.view(B * T, C)\n x = self.ln_x(x / self.head_size_divisor).view(B, T, C)\n x = self.output(x * g)\n return x" + }, + { + "comment": "The code defines a forward pass for a model, with input size BTC and H being the number of heads. It uses jit_func to calculate r, k, v, and g. RUN_CUDA_RWKV5 is applied on these calculated values. RWKV_ChannelMix is a subclass of MyModule with an args parameter for arguments and layer_id for layer identification. It initializes time_shift with a zero pad 2D, performs fancy initialization of time_mix by setting ratio_1_to_almost0 and calculating ddd. 
It also initializes key as a linear layer without bias.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":170-197", + "content": " def forward(self, x):\n B, T, C = x.size()\n H = self.n_head\n r, k, v, g = self.jit_func(x)\n x = RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w=self.time_decay, u=self.time_faaaa)\n return self.jit_func_2(x, g)\n########################################################################################################\nclass RWKV_ChannelMix(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.key = nn.Linear(args.n_embd, args.dim_ffn, bias=False)" + }, + { + "comment": "This code initializes layers for a MishGLU module, including time shifting and mixing parameters. The time_shift layer pads the input tensor with a one-pixel shift, while time_mix_k and time_mix_r are learned parameters for mixing inputs. 
These operations are used to calculate key and receptance values in the forward pass before returning the final result through a sigmoid function multiplied by the value tensor.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":198-226", + "content": " self.receptance = nn.Linear(args.n_embd, args.n_embd, bias=False)\n self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)\n @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.relu(k) ** 2\n kv = self.value(k)\n return torch.sigmoid(self.receptance(xr)) * kv\nclass MishGLU(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad():\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer)\n x = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n x[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))" + }, + { + "comment": "This code defines the architecture of an RWKV model with layers for time-shifting, mixing, and applying a Mish activation function. The `Block` class represents each layer in the model, which includes layer normalization, feed forward network (FFN), and optional additional layer normalization for the first layer. 
It also defines functions to perform time shifting and linear transformations using fully connected layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":227-255", + "content": " self.aa = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.bb = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)\n @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xa = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xb = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n a = self.aa(xa)\n b = self.bb(xb)\n return self.value(a * F.mish(b))\n########################################################################################################\n# The RWKV Model with our blocks\n########################################################################################################\nclass Block(nn.Module):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(args.n_embd)\n self.ln2 = nn.LayerNorm(args.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(args.n_embd)" + }, + { + "comment": "The code initializes model components based on provided arguments and layer ID. If `my_pos_emb > 0`, it creates positional embedding parameters. If `pre_ffn > 0`, it creates an FFN layer (RWKV_ChannelMix) for the first layer. Depending on environment, it also initializes ffn layers (MishGLU or RWKV_ChannelMix). 
For tiny_att_dim and specific layer ID, it registers linear layers and buffer for tiny attention implementation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":256-275", + "content": " if args.my_pos_emb > 0:\n self.pos_emb_x = nn.Parameter(torch.zeros((1,args.my_pos_emb,args.n_embd)))\n self.pos_emb_y = nn.Parameter(torch.zeros((args.my_pos_emb,1,args.n_embd)))\n if self.layer_id == 0 and self.args.pre_ffn > 0:\n self.ffnPre = RWKV_ChannelMix(args, 0)\n else:\n self.att = RWKV_TimeMix_RWKV5(args, layer_id)\n if 'g' in os.environ[\"RWKV_MY_TESTING\"]:\n self.ffn = MishGLU(args, layer_id)\n else:\n self.ffn = RWKV_ChannelMix(args, layer_id)\n if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n self.tiny_ln = nn.LayerNorm(args.n_embd)\n self.tiny_q = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_k = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_v = nn.Linear(args.n_embd, args.n_embd, bias=False)\n self.register_buffer(\"tiny_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))" + }, + { + "comment": "The code implements a layer of the RWKV-v5 model, including dropout regularization, layers normalization, feed-forward network (FFN), and attention mechanism. The dropout rate is determined by the \"args\" parameter and can be zero. If the pre_ffn argument is greater than 0, it executes FFN before other operations. 
The code also includes a tiny attention layer at a specific layer specified by the \"tiny_att_dim\" argument.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":277-304", + "content": " if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)\n self.drop1 = nn.Dropout(p = args.dropout)\n def forward(self, x, x_emb=None):\n args = self.args\n B, T, C = x.size()\n if self.layer_id == 0:\n x = self.ln0(x)\n if args.my_pos_emb > 0:\n pos_emb = (self.pos_emb_x + self.pos_emb_y).reshape(T+1, -1)[:-1,:]\n x = x + pos_emb\n if self.args.dropout == 0:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n else:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = self.drop0(x + self.ffnPre(self.ln1(x)))\n else:\n x = self.drop0(x + self.att(self.ln1(x)))\n x = self.drop1(x + self.ffn(self.ln2(x)))\n if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n xx = self.tiny_ln(x)" + }, + { + "comment": "This code defines a class RWKV, which inherits from LightningModule. It includes an initialization method that takes arguments and creates instance variables. The class also has a forward method for the L2Wrap function, which computes a loss and applies a specific gradient calculation. 
This appears to be part of a deep learning model implementation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":305-337", + "content": " q = self.tiny_q(xx)[:, :T, :]\n k = self.tiny_k(xx)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (args.tiny_att_dim ** (-0.5))\n c = c.masked_fill(self.tiny_mask[:T, :T] == 0, 0)\n x = x + c @ self.tiny_v(x_emb)\n return x\nclass L2Wrap(torch.autograd.Function):\n @staticmethod\n def forward(ctx, loss, y):\n ctx.save_for_backward(y)\n return loss\n @staticmethod\n def backward(ctx, grad_output):\n y = ctx.saved_tensors[0]\n # to encourage the logits to be close to 0\n factor = 1e-4 / (y.shape[0] * y.shape[1])\n maxx, ids = torch.max(y, -1, keepdim=True)\n gy = torch.zeros_like(y)\n gy.scatter_(-1, ids, maxx * factor)\n return (grad_output, gy)\nclass RWKV(pl.LightningModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n if not hasattr(args, 'dim_att'):\n args.dim_att = args.n_embd\n if not hasattr(args, 'dim_ffn'):\n args.dim_ffn = args.n_embd * 4" + }, + { + "comment": "The code defines a model with multiple layers, embeddings, and optional components depending on the provided arguments. 
It also sets up optimizer configurations based on the specified arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":338-365", + "content": " if not hasattr(args, 'tiny_att_layer'):\n args.tiny_att_layer = -1\n if not hasattr(args, 'tiny_att_dim'):\n args.tiny_att_dim = -1\n assert args.n_embd % 32 == 0\n assert args.dim_att % 32 == 0\n assert args.dim_ffn % 32 == 0\n self.emb = nn.Embedding(args.vocab_size, args.n_embd)\n self.blocks = nn.ModuleList([Block(args, i) for i in range(args.n_layer)])\n self.ln_out = nn.LayerNorm(args.n_embd)\n self.head = nn.Linear(args.n_embd, args.vocab_size, bias=False)\n if args.head_qk > 0:\n self.head_q = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.head_k = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.register_buffer(\"copy_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)\n def configure_optimizers(self):\n args = self.args\n lr_decay = set()\n lr_1x = set()\n lr_2x = set()" + }, + { + "comment": "This code is assigning different learning rates based on parameter names and certain conditions. It first defines four sets (lr_1x, lr_2x, lr_3x, lr_decay) and then populates them according to specific criteria: if a parameter's name matches certain strings or has a layerwise learning rate greater than 0, it will be added to the corresponding set. 
Finally, the sets are sorted and stored.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":366-393", + "content": " lr_3x = set()\n for n, p in self.named_parameters():\n if (\"time_mix\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_decay\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_3x.add(n)\n else:\n lr_2x.add(n)\n elif (\"time_faaaa\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_first\" in n) and (args.layerwise_lr > 0):\n lr_3x.add(n)\n elif (len(p.squeeze().shape) >= 2) and (args.weight_decay > 0):\n lr_decay.add(n)\n else:\n lr_1x.add(n)\n lr_decay = sorted(list(lr_decay))\n lr_1x = sorted(list(lr_1x))\n lr_2x = sorted(list(lr_2x))\n lr_3x = sorted(list(lr_3x))" + }, + { + "comment": "The code defines different optimizer groups for layers based on the desired layer-wise learning rate. 
It separates parameters into three groups with scaling factors of 1x, 2x, and 3x (relative to initial learning rate), and sets their respective weights decay to zero.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":394-411", + "content": " # print('decay', lr_decay)\n # print('1x', lr_1x)\n # print('2x', lr_2x)\n # print('3x', lr_3x)\n param_dict = {n: p for n, p in self.named_parameters()}\n if args.layerwise_lr > 0:\n if args.my_pile_stage == 2:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 2e-3 / args.lr_init},\n {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 3e-3 / args.lr_init},\n ]\n else:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 2.0},\n {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 3.0}," + }, + { + "comment": "This code sets up an optimizer for the model. It creates a list of optimization groups, adding either a group with no weight decay or a group with weight decay based on arguments provided. 
Depending on the presence of a weight decay argument and whether DeepSpeed is being used, it returns a corresponding optimizer - either DeepSpeedCPUAdam or FusedAdam.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":412-424", + "content": " ]\n else:\n optim_groups = [{\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0}]\n if args.weight_decay > 0:\n optim_groups += [{\"params\": [param_dict[n] for n in lr_decay], \"weight_decay\": args.weight_decay, \"my_lr_scale\": 1.0}]\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=True, amsgrad=False)\n return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=True, amsgrad=False)\n else:\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=False, weight_decay=0, amsgrad=False)\n return FusedAdam(optim_groups, lr=self.args.lr_init, betas=" + }, + { + "comment": "Lines 424-448: Initialize Adam optimizer with specified arguments, including learning rate, betas, epsilon, bias correction, weight decay, and amsgrad flag.\nLine 451-456: Check if the current strategy is a DeepSpeedStrategy and return whether to offload the optimizer or parameters from this model.\nLine 458-470: Define the forward function for the model, applying embedding, dropout (if enabled), and potentially using DeepSpeed checkpointing with gradient clip (if enabled).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":424-448", + "content": "self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)\n # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, 
eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)\n @property\n def deepspeed_offload(self) -> bool:\n strategy = self.trainer.strategy\n if isinstance(strategy, DeepSpeedStrategy):\n cfg = strategy.config[\"zero_optimization\"]\n return cfg.get(\"offload_optimizer\") or cfg.get(\"offload_param\")\n return False\n def forward(self, idx):\n args = self.args\n B, T = idx.size()\n assert T <= args.ctx_len, \"Cannot forward, model ctx_len is exhausted.\"\n x = self.emb(idx)\n x_emb = x\n if args.dropout > 0:\n x = self.drop0(x)\n if args.tiny_att_dim > 0:\n for block in self.blocks:\n if args.grad_cp == 1:\n x = deepspeed.checkpointing.checkpoint(block, x, x_emb)" + }, + { + "comment": "This code is defining a multi-head attention mechanism in a transformer model, including options for head masking and different float modes (FP16 or BF16). It applies checkpointing for gradient accumulation and performs matrix operations to generate context vectors.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":449-477", + "content": " else:\n x = block(x, x_emb)\n else:\n for block in self.blocks:\n if args.grad_cp == 1:\n x = deepspeed.checkpointing.checkpoint(block, x)\n else:\n x = block(x)\n x = self.ln_out(x)\n if args.head_qk > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / args.head_qk)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n c = c @ F.one_hot(idx, num_classes=args.vocab_size)\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).bfloat16()\n x = self.head(x) + c\n else:\n x = self.head(x)\n return x" + }, + { + "comment": "The code defines a training step function for the model. 
It checks an environment variable and performs different operations based on its value. If the variable is not equal to 1, it computes logits from input idx, calculates cross-entropy loss, and prints logits and idx if certain conditions are met. If the variable is equal to 1, it handles batches with masks, checks for mask sum, and adjusts the loss calculation based on whether all values in the mask are 0 or not. It also prints the loss for specific ranks.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":479-502", + "content": " def training_step(self, batch, batch_idx):\n args = self.args\n if args.my_qa_mask != 1:\n idx, targets = batch\n logits = self(idx)\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # if '0' in os.environ[\"RWKV_MY_TESTING\"]:\n # print('logits', logits)\n # torch.set_printoptions(threshold=10000)\n # print('idx', idx)\n # exit(0)\n else:\n idx, targets, mask = batch\n mask = mask.view(-1)\n sum_mask = torch.sum(mask).item()\n # if sum_mask == 0:\n # return torch.tensor([0.0], requires_grad=True)\n logits = self(idx)\n if sum_mask == mask.shape[0]:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # print('rank', self.global_rank, 'loss', loss.item())\n else:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none')" + }, + { + "comment": "In this code snippet, the model calculates the loss for a batch of data. The loss is calculated by summing up the element-wise multiplication of the loss and a mask, then dividing by the sum of the mask. 
The code also includes additional logging functionality to gather statistics on the training process across multiple GPUs using PyTorch's all_gather function.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":503-529", + "content": " # loss_raw = loss\n loss = torch.sum(loss * mask) / sum_mask\n # torch.set_printoptions(threshold=10000)\n # if True: #self.global_rank == 1:\n # tmp = ''\n # sss = 0\n # ccc = 0\n # for i in range(mask.shape[0]):\n # if mask[i] > 0:\n # tmp += str(idx.view(-1)[i].item()) + ','\n # sss += loss_raw.view(-1)[i].float().item()\n # ccc += 1\n # print('rank', self.global_rank, 'loss', loss.item(), 'lavg', sss / ccc)#, 'tmp', tmp, 'input', idx)\n return L2Wrap.apply(loss, logits)\n def training_step_end(self, batch_parts):\n if pl.__version__[0]!='2':\n all = self.all_gather(batch_parts)\n if self.trainer.is_global_zero:\n self.trainer.my_loss_all = all\n def generate_init_weight(self):\n print(\n f\"\"\"\n############################################################################" + }, + { + "comment": "This code initializes the model's weights, handling specific layers and adjusting gains and scales accordingly. It also handles layer normalization (ln_x.weight), embedding scale initialization (-1 * lr_init), and certain zero-initialized layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":530-558", + "content": "#\n# Init model weight (slow for large models)...\n#\n############################################################################\n\"\"\"\n )\n m = {}\n for n in self.state_dict():\n p = self.state_dict()[n]\n shape = p.shape\n gain = 1.0\n scale = 1.0\n if \"ln_\" in n or \".ln\" in n or \"time_\" in n or \"_mask\" in n or \"pos_emb\" in n or '.mask.' 
in n:\n if 'ln_x.weight' in n:\n layer_scale = (1+int(n.split('.')[1])) / self.args.n_layer\n m[n] = (p * 0.0) + (layer_scale ** 0.7)\n else:\n m[n] = p\n else:\n if n == \"emb.weight\":\n scale = -1 * self.args.lr_init\n else:\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n zero = [\".att.output.\", \".ffn.value.\", \".ffn.receptance.\", \".ffnPre.value.\", \".ffnPre.receptance.\", \"head_q.\", '.oo.', '.rr.']\n for kk in zero:" + }, + { + "comment": "This code initializes the model's weights with different scales based on their names. If the name contains \"head_k.\" or \"head_q.\", the scale is set to 0. If the name is \"head.weight\", the scale is set to 0.5. Otherwise, the scale is set to 0. The weights are initialized using PyTorch's initialization functions depending on their scales. Finally, the model parameters are moved to CPU and potentially converted to BF16 or FP16 if environment variables RWKV_FLOAT_MODE is set to \"bf16\" or \"fp16\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":559-585", + "content": " if kk in n:\n scale = 0\n if n == \"head.weight\":\n scale = 0.5\n if \"head_k.\" in n:\n scale = 0.1\n if \"head_q.\" in n:\n scale = 0\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {str(scale).ljust(4)} {n}\")\n if self.args.accelerator.upper() == \"GPU\":\n m[n] = torch.empty((shape[0], shape[1]), device=\"cuda\")\n else:\n m[n] = torch.empty((shape[0], shape[1]))\n if scale == 0:\n nn.init.zeros_(m[n])\n elif scale < 0:\n nn.init.uniform_(m[n], a=scale, b=-scale)\n else:\n nn.init.orthogonal_(m[n], gain=gain * scale)\n m[n] = m[n].cpu()\n if os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n m[n] = m[n].half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":" + }, + { + "comment": "In this code snippet, the model's parameters are converted to bfloat16 and then garbage collected, followed by clearing CUDA cache. 
This improves memory usage and efficiency.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/model.py\":586-593", + "content": " m[n] = m[n].bfloat16()\n # if n == \"emb.weight\":\n # print(m[n])\n gc.collect()\n torch.cuda.empty_cache()\n return m" + } + ] +} \ No newline at end of file diff --git a/docs/doc/19a011e2-5593-49d0-b75b-4d0f47c0bb19.json b/docs/doc/19a011e2-5593-49d0-b75b-4d0f47c0bb19.json new file mode 100644 index 00000000..73a77dd1 --- /dev/null +++ b/docs/doc/19a011e2-5593-49d0-b75b-4d0f47c0bb19.json @@ -0,0 +1,75 @@ +{ + "summary": "The RWKV Language Model employs CUDA support, time-shifted operations, and forward passes for efficient execution. It trains a transformer model with head-multihead attention using RWKV-v4, organizes weights in namespaced structure, and performs layer normalization, feed-forward operations, and applies layers like LN and FFN.", + "details": [ + { + "comment": "This code snippet is from the RWKV Language Model. It begins with importing necessary libraries and defines some constants like RWKV_HEAD_QK_DIM, DEBUG_TIME, and T_MAX. The code checks if the execution environment is CUDA and imports a CUDA kernel if it is. 
This language model is designed to perform natural language processing tasks with potential VRAM usage limitations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":0-24", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport types\nimport copy\nimport torch\nimport math, os\nfrom torch.nn import functional as F\nimport torch.nn as nn\nRWKV_HEAD_QK_DIM = 0\nprint(f'\\nRWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\\n')\nDEBUG_TIME = False # True False - show trained time-coeffs\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nif os.environ['RWKV_RUN_DEVICE'] == 'cuda':\n T_MAX = 1024 # increase this if your ctx_len is long [NOTE: TAKES LOTS OF VRAM!]\n # it's possible to go beyond CUDA limitations if you slice the ctx and pass the hidden state in each slice\n from torch.utils.cpp_extension import load" + }, + { + "comment": "This code initializes a WKV object, loads the model from CUDA, and handles float mode conversion. It asserts certain conditions, such as T being less than or equal to a maximum value, and B*C being divisible by min(C, 1024). The code also checks the environment variable 'RWKV_FLOAT_MODE' and converts the float types accordingly for compatibility. 
The model parameters are stored in the ctx object for backward propagation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":25-47", + "content": " wkv_cuda = load(name=\"wkv\", sources=[\"cuda/wkv_op.cpp\", \"cuda/wkv_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['-res-usage', '--maxrregcount 60', '--use_fast_math', '-O3', '-Xptxas -O3', f'-DTmax={T_MAX}'])\n class WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C\n assert T <= T_MAX\n assert B * C % min(C, 1024) == 0\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n w = -torch.exp(w.contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()\n else:\n w = -torch.exp(w.float().contiguous())\n u = u.float().contiguous()\n k = k.float().contiguous()\n v = v.float().contiguous()\n ctx.save_for_backward(w, u, k, v)\n y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)" + }, + { + "comment": "This code defines a function `forward` for a model that performs operations on input tensors B, T, C, w, u, k, and v. It also includes a backward function for gradient calculations using saved tensors. The `forward` function returns the output y, which is modified based on the environment variable RWKV_FLOAT_MODE. 
The backward function performs gradient calculations based on the input tensor gy and saves gradients in gw, gu, gk, and gv.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":48-71", + "content": " wkv_cuda.forward(B, T, C, w, u, k, v, y)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n return y\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n return y.half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n return y.bfloat16()\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 1024) == 0\n w, u, k, v = ctx.saved_tensors\n gw = torch.zeros((B, C), device='cuda').contiguous()\n gu = torch.zeros((B, C), device='cuda').contiguous()\n gk = torch.zeros((B, T, C), device='cuda').contiguous()\n gv = torch.zeros((B, T, C), device='cuda').contiguous()\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)\n else:\n wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)" + }, + { + "comment": "This code snippet is part of a model training and inference process. It defines a function `RUN_CUDA` for running the model on CUDA devices, and initializes a module `RWKV_ChannelMix`. The code also sets up various tensor operations such as summations, and environment variable checks for floating point precision modes. 
The module is part of the RWKV language model framework.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":72-95", + "content": " gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n return (None, None, None, gw, gu, gk, gv)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())\n def RUN_CUDA(B, T, C, w, u, k, v):\n return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())\n############################################################################################################\nRWKV_CFG = types.SimpleNamespace()\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.time_mix_k = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))\n self.time_mix_r = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd))" + }, + { + "comment": "Class \"RWKV_TimeMix\" initializes with layer id, and contains parameters for time decay, first position correction, shift operation, key mix, and value mix. 
The forward function applies time shifting, mixes with key and value mix parameters, applies square and relu operations on keys, multiplies by sigmoid-transformed values, and returns the result.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":97-123", + "content": " hidden_sz = 4 * RWKV_CFG.n_embd\n self.key = nn.Linear(RWKV_CFG.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv\n return rkv\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_decay = nn.Parameter(torch.ones(RWKV_CFG.n_embd))\n self.time_first = nn.Parameter(torch.ones(RWKV_CFG.n_embd) * math.log(0.3))\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.time_mix_k = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.time_mix_v = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))" + }, + { + "comment": "The code initializes a module with time-based mixing parameters and applies a series of linear transformations to the input. The forward function performs time-shifted operations, then multiplies with learned coefficients for key, value, and receptance components. The results are passed through a sigmoid activation, another layer normalization, and a final linear transformation before returning the final output. 
This block is part of the RWKV model implementation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":124-154", + "content": " self.time_mix_r = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd))\n self.key = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.value = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n self.output = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False)\n def forward(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n rwkv = torch.sigmoid(r) * RUN_CUDA(B, T, C, self.time_decay, self.time_first, k, v)\n rwkv = self.output(rwkv)\n return rwkv\nclass Block(nn.Module):\n def __init__(self, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(RWKV_CFG.n_embd)" + }, + { + "comment": "This code initializes a RWKV-GPT model instance with specified configuration, and includes layer normalization and different forward pass depending on the layer ID and model type. 
The forward method performs layer normalization and adds the outputs of specific layers or modules, resulting in the final output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":155-185", + "content": " self.ln2 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(RWKV_CFG.n_embd)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(layer_id+1000)\n else:\n self.att = RWKV_TimeMix(layer_id)\n self.ffn = RWKV_ChannelMix(layer_id)\n def forward(self, x):\n if self.layer_id == 0:\n x = self.ln0(x)\n if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n return x\nclass RWKV_GPT(nn.Module):\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, ctx_len):\n global RWKV_CFG\n super().__init__()\n RWKV_CFG.RUN_DEVICE = RUN_DEVICE\n RWKV_CFG.model_type = model_type\n RWKV_CFG.vocab_size = vocab_size\n RWKV_CFG.n_layer = n_layer\n RWKV_CFG.n_embd = n_embd" + }, + { + "comment": "This code initializes an RWKV model, sets its context length, and loads the state dictionary from a saved file. It also includes optional head layers for QK vectors and a copy mechanism. 
The forward pass performs embedding, passes through blocks, applies layer normalization, and returns the output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":186-216", + "content": " RWKV_CFG.ctx_len = ctx_len\n print('\\nloading RWKV-GPT', MODEL_NAME)\n self.emb = nn.Embedding(vocab_size, n_embd)\n self.blocks = nn.Sequential(*[Block(i) for i in range(n_layer)])\n self.ln_out = nn.LayerNorm(n_embd)\n self.head = nn.Linear(n_embd, vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0\n self.head_k = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(ctx_len, ctx_len)))\n self.ctx_len = ctx_len\n self.eval()\n self.load_state_dict(torch.load(MODEL_NAME + '.pth'))\n self.eval()\n def forward(self, idx):\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)" + }, + { + "comment": "This code segment is part of a model training process. It performs a head-multihead attention mechanism and then applies the result to the input. If RWKV_HEAD_QK_DIM is greater than 0, it calculates and applies the attention matrix, otherwise it simply passes through the input. The floating point mode is set based on the environment variable, with options for FP32, FP16, or BF16 precision. 
The class RWKV_RNN initializes a model with specified parameters like device, model type, number of layers, embedding size, and context length.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":218-241", + "content": " if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).bfloat16()\n x = self.head(x) + c\n else:\n x = self.head(x) \n return x\n############################################################################################################\nclass RWKV_RNN(): # this is running in FP32 at this moment\n def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len):\n self.RUN_DEVICE = RUN_DEVICE" + }, + { + "comment": "This code loads a pre-trained RWKV-v4 model from a file and assigns the weights to the corresponding layers. It also performs some processing on time-related parameters, such as squeezing them and applying exponential decay. 
The code then organizes the loaded weights into a nested namespace structure based on their names, following a hierarchical pattern with layer numbers and layer types.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":242-271", + "content": " self.model_type = model_type\n self.n_layer = n_layer\n self.n_embd = n_embd\n self.ctx_len = ctx_len\n self.w = types.SimpleNamespace()\n w = torch.load(MODEL_NAME + '.pth',\n map_location=torch.device(RUN_DEVICE))\n for x in w.keys():\n w[x] = w[x].float()\n if '.time_' in x:\n w[x] = w[x].squeeze()\n if '.time_decay' in x:\n w[x] = -torch.exp(w[x])\n if DEBUG_TIME and '.time_' in x:\n print(x, w[x].squeeze().cpu().numpy())\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):\n if xx[i].isdigit():\n ii = int(xx[i])\n if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):" + }, + { + "comment": "This code represents a class that can load, clear, and save various variables (xx, aa, bb, pp). It also contains methods to perform layer normalization (LN) and feed-forward operations (FF). The code uses the `getattr` function to dynamically access attributes based on input, and it initializes certain attributes as SimpleNamespace or empty dictionaries. 
The `clear`, `save`, and `load` functions are used to manage the state of the class variables.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":272-305", + "content": " if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.clear()\n def clear(self):\n self.xx = {}\n self.aa = {}\n self.bb = {}\n self.pp = {}\n self.hk = None\n def save(self, target):\n target.xx = copy.deepcopy(self.xx)\n target.aa = copy.deepcopy(self.aa)\n target.bb = copy.deepcopy(self.bb)\n target.pp = copy.deepcopy(self.pp)\n target.hk = copy.deepcopy(self.hk)\n def load(self, target):\n self.xx = copy.deepcopy(target.xx)\n self.aa = copy.deepcopy(target.aa)\n self.bb = copy.deepcopy(target.bb)\n self.pp = copy.deepcopy(target.pp)\n self.hk = copy.deepcopy(target.hk)\n def LN(self, xx, w):\n return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias)\n def FF(self, xx, w, name):\n if name not in self.xx:" + }, + { + "comment": "Code is a part of an attention mechanism in a transformer model. It calculates the key, value and returns a weighted sum. 
The SA function initializes variables for each name.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":306-329", + "content": " self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)\n k = torch.square(torch.relu(w.key.weight @ xk))\n kv = w.value.weight @ k\n return r * kv\n def SA(self, xx, w, name):\n if name not in self.xx:\n self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE)\n self.pp[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) - 1e30\n xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k)\n xv = xx * w.time_mix_v + self.xx[name] * (1 - w.time_mix_v)\n xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r)\n self.xx[name] = xx\n r = torch.sigmoid(w.receptance.weight @ xr)" + }, + { + "comment": "Function defines the operation of a RWKV model. It applies layers like LN, FFN, and SA in a loop to transform input x. The function uses variables pp, aa, bb, and ww for intermediate calculations related to time-decaying weights and exponential operations. 
Output is the weighted sum of input x transformed by the applied layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":331-365", + "content": " k = w.key.weight @ xk\n v = w.value.weight @ xv\n pp = self.pp[name]\n aa = self.aa[name]\n bb = self.bb[name]\n ww = w.time_first + k\n p = torch.maximum(pp, ww)\n e1 = torch.exp(pp - p)\n e2 = torch.exp(ww - p)\n a = e1 * aa + e2 * v\n b = e1 * bb + e2\n ww = pp + w.time_decay\n p = torch.maximum(ww, k)\n e1 = torch.exp(ww - p)\n e2 = torch.exp(k - p)\n self.aa[name] = e1 * aa + e2 * v\n self.bb[name] = e1 * bb + e2\n self.pp[name] = p\n rwkv = r * a / b\n return w.output.weight @ rwkv\n def run(self, ctx):\n w = self.w\n x = w.emb.weight[ctx[-1]]\n for i in range(self.n_layer):\n if i == 0:\n x = self.LN(x, w.blocks[i].ln0)\n if i == 0 and self.model_type == 'RWKV-ffnPre':\n x = x + self.FF(self.LN(x, w.blocks[i].ln1), w.blocks[i].ffnPre, f'ffnPre.{i}')\n else:\n x = x + self.SA(self.LN(x, w.blocks[i].ln1), w.blocks[i].att, f'att.{i}')" + }, + { + "comment": "This code applies a feed-forward network (FFN) and layer normalization (LN) to the input 'x' and updates it based on the context length ('ctx_len'). It also handles the case when RWKV_HEAD_QK_DIM is greater than 0, calculating the head key matrix ('hk') and updating 'x' accordingly. 
Finally, it returns the updated 'x'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model_run.py\":366-391", + "content": " x = x + self.FF(self.LN(x, w.blocks[i].ln2), w.blocks[i].ffn, f'ffn.{i}')\n x = self.LN(x, w.ln_out)\n if RWKV_HEAD_QK_DIM > 0:\n if self.hk == None:\n self.hk = (w.head_k.weight @ x).unsqueeze(0)\n else:\n self.hk = torch.cat(\n [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0)\n if self.hk.shape[0] > self.ctx_len:\n self.hk = self.hk[-self.ctx_len:, :]\n q = w.head_q.weight @ x\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n c = (self.hk @ q) / RWKV_HEAD_QK_DIM\n for i in range(len(c)):\n x[ctx[i]] += c[i]\n else:\n x = w.head.weight @ x\n x = x.cpu().numpy().tolist()\n return x" + } + ] +} \ No newline at end of file diff --git a/docs/doc/1a4d54be-0f2b-4525-af34-d0b0a5581f78.json b/docs/doc/1a4d54be-0f2b-4525-af34-d0b0a5581f78.json new file mode 100644 index 00000000..f5305f67 --- /dev/null +++ b/docs/doc/1a4d54be-0f2b-4525-af34-d0b0a5581f78.json @@ -0,0 +1,70 @@ +{ + "summary": "The code initializes the RWKV v2-RNN language model, optimizes it for performance, and utilizes CUDA for efficient computation. It implements time-decay parameters, defines a GPT model with RWKV blocks, and sets up optimization methods.", + "details": [ + { + "comment": "The code imports necessary libraries and defines constants for the RWKV v2-RNN Language Model, which is a neural network language model. It loads a CUDA kernel called \"timex\" for efficient computation on GPUs using Torch's `torch.utils.cpp_extension.load` function. 
The constants T_MAX, B_GROUP_FORWARD, and B_GROUP_BACKWARD are set to optimize performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":0-22", + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.cpp_extension import load\nimport math\nimport numpy as np\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nlogger = logging.getLogger(__name__)\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = 1024 # increase this if your ctx_len > 1024\nB_GROUP_FORWARD = 4 # set to 8 for best performance\nB_GROUP_BACKWARD = 2 # set to 2 for best performance\ntimex_cuda = load(name=\"timex\", sources=[\"cuda/timex_op.cpp\", \"cuda/timex_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['--use_fast" + }, + { + "comment": "Defines a TimeX class that implements the Time-X function using CUDA for efficient computation. The class takes in weights (w), kernel (k), batch size (B), number of channels (C), sequence length (T), and epsilon (eps) as input, and returns the output tensor (wk). 
It also ensures all parameters meet certain conditions before forwarding and backward propagation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":22-46", + "content": "_math', '--extra-device-vectorization', f'-DTmax={T_MAX}', f'-DBF={B_GROUP_FORWARD}', f'-DBB={B_GROUP_BACKWARD}'])\nclass TimeX(torch.autograd.Function):\n @staticmethod\n def forward(ctx, w, k, B, C, T, eps):\n ctx.B = B\n ctx.C = C\n ctx.T = T\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w = w.contiguous()\n k = k.contiguous()\n ctx.save_for_backward(w, k)\n wk = torch.empty((B, C, T), device='cuda',\n memory_format=torch.contiguous_format)\n timex_cuda.forward(w, k, wk, eps, B, C, T)\n return wk\n @staticmethod\n def backward(ctx, gwk):\n assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0\n w, k = ctx.saved_tensors\n gw = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda',\n memory_format=torch.contiguous_format)\n gk = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda'," + }, + { + "comment": "This code initializes the RWKV model by setting specific parameters and configurations for each layer. It uses fancy initialization to set the weights of linear and embedding layers in the module. 
The code also defines constants like RWKV_K_CLAMP, RWKV_K_EPS, and RWKV_HEAD_QK_DIM for further calculations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":47-72", + "content": " memory_format=torch.contiguous_format)\n timex_cuda.backward(w, k, gwk.contiguous(), gw,\n gk, ctx.B, ctx.C, ctx.T)\n return (gw.sum(dim=0), gk, None, None, None, None)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\nRWKV_K_CLAMP = 60 # e^60 = 1e26\nRWKV_K_EPS = 1e-16\nRWKV_HEAD_QK_DIM = 256\ndef RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module\n for m in module.modules():\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n with torch.no_grad():\n name = '[unknown weight]'\n for name, parameter in module.named_parameters(): # find the name of the weight\n if id(m.weight) == id(parameter):\n break\n shape = m.weight.data.shape" + }, + { + "comment": "The code adjusts the gain and scale of layer weights in a neural network model, depending on the type and shape of the layer. It initializes embeddings with a small scale and linear layers with zero or identity matrices, based on specific conditions. 
The final projection has a different scale, while zero initialization is used if the scale is set to -999 and gain is 0.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":73-100", + "content": " gain = 1.0\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb?\n scale = 1e-4\n else:\n scale = 0\n if isinstance(m, nn.Linear):\n if m.bias is not None:\n m.bias.data.zero_()\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection?\n scale = 0.5\n if hasattr(m, 'scale_init'):\n scale = m.scale_init\n # print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name)\n gain *= scale\n if scale == -999:\n nn.init.eye_(m.weight)\n elif gain == 0:\n # zero init is great for some RWKV matrices" + }, + { + "comment": "This code initializes the time_w curves for a RWKV_TimeMix module in the model. It uses different initialization methods depending on the gain value. If the gain is zero, it initializes the weight as zeros. If the gain is positive, it initializes the weight with orthogonal values. If the gain is negative, it initializes the weight with normal distribution mean 0 and standard deviation of -scale. 
The time_w curves are initialized for better convergence using a decay speed variable and calculating f1 and f2 based on layer ID.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":101-128", + "content": " nn.init.zeros_(m.weight)\n elif gain > 0:\n nn.init.orthogonal_(m.weight, gain=gain)\n else:\n nn.init.normal_(m.weight, mean=0.0, std=-scale)\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_embd = config.n_embd\n attn_sz = config.n_embd\n ############# fancy init of time_w curves ###################################\n f1_begin = 3.0\n f1_end = 1.2\n f2_begin = 0.65\n f2_end = 0.4\n with torch.no_grad(): # initial time_w curves for better convergence\n decay_speed = torch.ones(attn_sz, 1)\n first_sa_layer_id = 1\n for h in range(attn_sz):\n f1 = f1_begin + (layer_id-first_sa_layer_id) / \\\n (config.n_layer-1-first_sa_layer_id) * (f1_end - f1_begin)\n f2 = f2_begin + (layer_id-first_sa_layer_id) / \\" + }, + { + "comment": "This code initializes the time-decay and related parameters for a transformer model. It sets `self.time_decay` as a learnable parameter, calculates `self.time_curve`, and initializes `self.time_first` and `self.time_shift`. 
The decay speed is adjusted based on the layer id to control how quickly time decays in the attention mechanism.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":129-146", + "content": " (config.n_layer-1-first_sa_layer_id) * (f2_end - f2_begin)\n if layer_id == first_sa_layer_id:\n f1 += 0.5\n if layer_id == config.n_layer-2:\n f2 = 0.4\n if layer_id == config.n_layer-1:\n f2 = 0.37\n decay_speed[h][0] = math.pow(f2, h / (attn_sz-1) * 7) * f1\n self.time_decay = nn.Parameter(torch.log(decay_speed)) # will use exp(self.time_decay) to ensure time_decay > 0\n self.time_curve = torch.tensor(\n [-(config.ctx_len - 2 - i) for i in range(config.ctx_len-1)]).unsqueeze(0)\n self.time_curve = self.time_curve.to('cuda')\n self.time_first = nn.Parameter(torch.ones(attn_sz, 1) * math.log(0.3))\n #############################################################################\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # init to \"shift half of the channels\"\n ww = torch.ones(1, 1, config.n_embd)" + }, + { + "comment": "This code initializes a model for the RWKV-v2 architecture. It defines the time_mix parameter, and several linear layers (key, value, receptance, output). 
The forward function applies these layers to input x, scales key and receptance to zero, clamps key within certain bounds, exponentials it, and performs a weighted sum with value before returning the result.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":147-176", + "content": " for i in range(config.n_embd // 2):\n ww[0, 0, i] = 0\n self.time_mix = nn.Parameter(ww)\n self.key = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.value = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.output = nn.Linear(attn_sz, config.n_embd, bias=False)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0\n def forward(self, x):\n B, T, C = x.size()\n x = x * self.time_mix + self.time_shift(x) * (1 - self.time_mix)\n k = self.key(x).transpose(-1, -2)\n v = self.value(x).transpose(-1, -2)\n r = self.receptance(x)\n # RWKV_K_CLAMP can be removed if the CUDA kernel substracts the correct k_max for each k (I will do this later)\n k = torch.clamp(k, max=RWKV_K_CLAMP)\n k = torch.exp(k)\n kv = k * v\n self.time_w = torch.cat(\n [torch.exp(self.time_decay) * self.time_curve, self.time_first], dim=-1)" + }, + { + "comment": "This code defines a RWKV Channel Mix module for a transformer model. It initializes channel mixing parameters, applies time shifting and mixing operations, and then performs a key-value attention mechanism to produce the final output. 
The TimeX class is applied to compute weighting factors based on time steps.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":177-206", + "content": " w = torch.exp(self.time_w)\n wkv = TimeX.apply(w, kv, B, C, T, 0)\n # RWKV_K_EPS can be removed if the CUDA kernel sets 0/0 = 0 (I will do this later)\n wk = TimeX.apply(w, k, B, C, T, RWKV_K_EPS)\n rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2)\n rwkv = self.output(rwkv)\n return rwkv\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # init to \"shift half of the channels\"\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd // 2):\n x[0, 0, i] = 0\n self.time_mix = nn.Parameter(x)\n hidden_sz = 4 * config.n_embd\n self.key = nn.Linear(config.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, config.n_embd, bias=False)\n self.value.scale_init = 0" + }, + { + "comment": "The code is defining a GPT model with RWKV blocks. It includes an initialization for the receptance scale, forward function to process input data, and a class for the GPTConfig and Block modules. 
The RWKV-ffnPre model type initializes additional layers in the first layer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":207-241", + "content": " self.receptance.scale_init = 0\n def forward(self, x):\n x = x * self.time_mix + self.time_shift(x) * (1 - self.time_mix)\n k = self.key(x)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(x)) * kv\n return rkv\n########################################################################################################\n# The GPT Model with our blocks\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):\n self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':" + }, + { + "comment": "The code defines a GPT model with layers and a forward function. It initializes an embedding layer, a sequence of blocks, and output and attention layers. The forward function applies layer normalization and feeds the input through the attention and feed-forward networks. 
If the layer_id is 0 and the config's model_type is 'RWKV-ffnPre', it adds the ffnPre to the input for better performance in some cases.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":242-274", + "content": " self.ffnPre = RWKV_ChannelMix(config, layer_id+1000)\n else:\n self.att = RWKV_TimeMix(config, layer_id)\n self.ffn = RWKV_ChannelMix(config, layer_id)\n def forward(self, x):\n x = self.ln1(x)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(x) # better in some cases\n else:\n x = x + self.att(x)\n x = self.ln2(x)\n x = x + self.ffn(x)\n return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.step = 0\n self.config = config\n self.emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i)\n for i in range(config.n_layer)])\n self.ln_out = nn.LayerNorm(config.n_embd)\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0" + }, + { + "comment": "This code defines a model class with initialization and optimization configuration methods. It initializes the linear layer, copy mask, and registers buffers for the model. It also sets the context length (ctx_len) and calls RWKV_Init function to initialize other parameters. The _init_weights method is used to set weights in the layers. 
Finally, the configure_optimizers method separates model parameters into those with and without weight decay for optimization.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":275-303", + "content": " self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n RWKV_Init(self, config)\n logger.info(\"number of parameters: %e\", sum(p.numel()\n for p in self.parameters()))\n def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, (nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=1e-5)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n # separate out all parameters to those that will and won't experience regularizing weight decay\n decay = set()\n no_decay = set()\n for mn, m in self.named_modules(): # here we disable weight_decay" + }, + { + "comment": "Looping through model parameters, separating them into decay and no_decay groups. 
Creating optimizer groups for no_decay params with weight_decay=0, then initializing an Adam optimizer for training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":304-328", + "content": " for pn, p in m.named_parameters():\n fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n no_decay.add(fpn)\n param_dict = {pn: p for pn, p in self.named_parameters()}\n inter_params = decay & no_decay\n union_params = decay | no_decay\n assert len(\n inter_params) == 0, \"parameters %s made it into both decay/no_decay sets!\" % (str(inter_params), )\n assert len(param_dict.keys() - union_params) == 0, \"parameters %s were not separated into either decay/no_decay set!\" \\\n % (str(param_dict.keys() - union_params), )\n optim_groups = [\n {\"params\": [param_dict[pn]\n for pn in sorted(list(no_decay))], \"weight_decay\": 0.0},\n ]\n optimizer = torch.optim.Adam(\n optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n self.step += 1\n B, T = idx.size()" + }, + { + "comment": "This code is part of a model's forward pass. It checks if the input length (T) is within the model's context length, embeds the input, passes it through multiple blocks, applies layer normalization, and performs attention calculations for query and key tensors. 
If targets are provided, it calculates the cross-entropy loss.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/model.py\":329-348", + "content": " assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).float()\n x = self.head(x) + c\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1))\n return x, loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/22f93f03-0c6b-48c4-9826-87c58efda7e1.json b/docs/doc/22f93f03-0c6b-48c4-9826-87c58efda7e1.json new file mode 100644 index 00000000..b5b89230 --- /dev/null +++ b/docs/doc/22f93f03-0c6b-48c4-9826-87c58efda7e1.json @@ -0,0 +1,30 @@ +{ + "summary": "This code initializes an RWKV model for generating text, evaluates it on a given dataset considering parameters like layers and temperature, and trains the model in a script. It checks for existing files, loads or runs the model, debugs output metrics, converts output to text using tokenizer, and displays time taken.", + "details": [ + { + "comment": "The code is initializing the RWKV v2-RNN Language Model, setting context length (ctx_len), number of layers (n_layer), embedding dimension (n_embd), and model type (either 'RWKV' or 'RWKV-ffnPre'). It also specifies the name of a pre-trained model (MODEL_NAME) and a vocabulary file (WORD_NAME). 
The code is written in Python, using TensorFlow and PyTorch libraries for deep learning tasks.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/run.py\":0-30", + "content": "# -*- coding:utf-8 -*-\n########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math\nimport time\nimport types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nfrom src.utils import TOKENIZER, Dataset\nfrom src.model_run import RWKV_RNN\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\n### Step 1: set model ##################################################################################\nctx_len = 1024\nn_layer = 6\nn_embd = 512\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'\n# your trained model\nMODEL_NAME = 'trained-31'\nWORD_NAME = 'vocab' # the .json vocab (generated by train.py\n# ########## Uncomment these to test my 27M params enwik8 model ##########" + }, + { + "comment": "The code initializes an RWKV model with the specified name, device (CPU or CUDA), model type, number of layers, embedding size, and context length. The context is set as a prompt, with a given number of trials and length per trial. Temperature and top-p values are also defined for generating text. 
Unknown characters in the context are denoted by a space character.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/run.py\":31-57", + "content": "# MODEL_NAME = 'enwik8-ppl1.65-6064-1024-RWKV-6-512-2022-03-25-21-05-13'\n# WORD_NAME = 'enwik8-vocab'\n# EVAL_DATA = 'enwik8' # uncomment this for EVAL MODE (no text generation)\n# ########################################################################\n# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <--\n# --> all unknown tokens in your context will be denoted by it <--\nUNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity\nRUN_DEVICE = 'cpu' # 'cpu' (already very fast) or 'cuda'\nDEBUG_DEBUG = False # True False - show softmax output\n### Step 2: set context ################################################################################\ncontext = \"\\nIn the\" # ==> this is your prompt\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 500\nTEMPERATURE = 1.0\ntop_p = 0.7\ntop_p_newline = 0.9\n########################################################################################################\nprint(f'Loading {MODEL_NAME}...')\nmodel = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)" + }, + { + "comment": "This code is evaluating the model on a given dataset (EVAL_DATA) by taking random samples, creating context sequences, and calculating average loss over ctx_len. 
The model's performance is assessed based on this average loss across sampled data.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/run.py\":58-88", + "content": "tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\n########################################################################################################\nif 'EVAL_DATA' in vars() or 'EVAL_DATA' in globals():\n print('Evaluating on ' + EVAL_DATA + ' ...')\n data = open(EVAL_DATA, \"r\", encoding='utf-8').read()\n loss_table = np.zeros(ctx_len)\n N_SAMPLE = 1000\n for iii in range(N_SAMPLE):\n pos = np.random.randint(0, len(data) - ctx_len-1)\n context = data[pos:pos+ctx_len+1]\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\n model.clear()\n for i in range(1, ctx_len+1):\n x = ctx[:i]\n out = model.run(x)\n prob = F.softmax(torch.tensor(out), dim=-1)\n loss_table[i-1] += -math.log(prob[ctx[i]])\n print(f'Tested {iii+1} samples: avg_loss over ctx_len =',\n np.mean(loss_table) / (iii+1))\n exit(0)\n########################################################################################################" + }, + { + "comment": "This code performs the first run of RWKV-v2 model on a prompt, with future versions expected to be faster. It uses an RNN for processing and saves/loads model states between trials. The input prompt is tokenized, and its length affects the number of trials performed. The DEBUG_DEBUG flag can skip the first run.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/run.py\":90-119", + "content": "context = tokenizer.refine_context(context)\nprint('\\nYour prompt has ' + str(len(context)) + ' tokens.')\nprint('\\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. This will be much faster in future versions. 
<--\\n')\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n t_begin = time.time_ns()\n src_len = len(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\n print(('-' * 30) + context, end='')\n model.clear()\n if TRIAL == 0:\n init_state = types.SimpleNamespace()\n for i in range(src_len):\n x = ctx[:i+1]\n if i == src_len - 1:\n init_state.out = model.run(x)\n else:\n model.run(x)\n model.save(init_state)\n else:\n model.load(init_state)\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[:i+1]\n x = x[-ctx_len:]\n if i == src_len:\n out = copy.deepcopy(init_state.out)" + }, + { + "comment": "This code is part of a model training script. It checks if there's an existing file for the same context and loads it if it exists, or runs the model on input data if not. The debugging option prints certain metrics about the model's output. The output is then converted to text using the tokenizer and printed. Finally, the time taken for this operation is displayed.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/run.py\":120-132", + "content": " else:\n out = model.run(x)\n if DEBUG_DEBUG:\n print('model', np.array(x), '==>', np.array(\n out), np.max(out), np.min(out))\n char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,\n top_p_usual=top_p, top_p_newline=top_p_newline)\n char = char.item()\n print(tokenizer.itos[int(char)], end='', flush=True)\n ctx += [char]\n t_end = time.time_ns()\n print(\"\\n----------\", round((t_end - t_begin) / (10 ** 9), 2), end='s ')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/24f742c5-f814-4f54-a04e-2ab7d5adfa08.json b/docs/doc/24f742c5-f814-4f54-a04e-2ab7d5adfa08.json new file mode 100644 index 00000000..0b1978d0 --- /dev/null +++ b/docs/doc/24f742c5-f814-4f54-a04e-2ab7d5adfa08.json @@ -0,0 +1,50 @@ +{ + "summary": "The code initializes and optimizes an RWKV Language Model in PyTorch, creates a model class with 
feed-forward network, applies deep learning processing techniques, checks if current layer is rescaling, adjusts input, and performs layer normalization/feed-forward operations before returning modified input and state.", + "details": [ + { + "comment": "This code is initializing the RWKV Language Model, which is implemented in PyTorch. It defines a module and function for optimizing the code using torchdynamo or torch jit depending on the environment variable RWKV_JIT_ON. The code also sets some variables like RWKV_HEAD_QK_DIM and DEBUG_TIME, which control certain aspects of the model's behavior.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":0-32", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport types\nimport torch\nimport math, os, gc\nfrom torch.nn import functional as F\nimport torch.nn as nn\nfrom typing import List, Dict\nMyModule = nn.Module\ndef __nop(ob):\n return ob\nMyFunction = __nop\n# # try torchdynamo\n# import torchdynamo\n# MyFunction = torchdynamo.optimize(os.environ[\"RWKV_RUN_BACKEND\"]) # !!!BUGGY!!! 
wrong output\n# try torch jit --> faster for fp32, slower for fp16 (why?)\nif os.environ[\"RWKV_JIT_ON\"] == \"1\":\n MyModule = torch.jit.ScriptModule\n MyFunction = torch.jit.script_method\nRWKV_HEAD_QK_DIM = 0\nprint(f'\\nRWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM} RWKV_JIT_ON {os.environ[\"RWKV_JIT_ON\"]}\\n')\nDEBUG_TIME = False # True False - show trained time-coeffs\nRWKV_RESCALE_LAYER = 6 # set x=x/2 every X layer\n############################################################################################################" + }, + { + "comment": "This code defines a class for the RWKV_RNN model, initializes its attributes based on provided arguments, loads and refines weights from a pre-trained model, adjusts certain layers' weights according to block ID, and handles loading and reshaping position embedding.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":34-60", + "content": "class RWKV_RNN(MyModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n self.FLOAT_MODE = args.FLOAT_MODE\n self.RUN_DEVICE = args.RUN_DEVICE\n with torch.no_grad():\n w = torch.load(args.MODEL_NAME + '.pth', map_location='cpu')\n # refine weights and send to correct device\n keys = list(w.keys())\n if 'pos_emb_x' in keys:\n w['pos_emb'] = (w['pos_emb_x'] + w['pos_emb_y']).reshape(args.ctx_len+1, -1)[:-1,:]\n keys = list(w.keys())\n print_need_newline = False\n for x in keys:\n block_id = 0\n if 'blocks.' in x:\n block_id = int(x.split('.')[1])\n if 'att.output.weight' in x:\n w[x] = w[x] / (2 ** int(block_id // RWKV_RESCALE_LAYER))\n if 'ffn.value.weight' in x:\n w[x] = w[x] / (2 ** int(block_id // RWKV_RESCALE_LAYER))\n if '.time_' in x:\n w[x] = w[x].squeeze()" + }, + { + "comment": "This code snippet is responsible for managing the data type and device of model parameters based on specified settings. It prints information about each parameter, including its name, data type, and device it's stored in. 
The DEBUG_TIME flag controls whether to print activation-time weights, while FLOAT_MODE determines the desired floating-point precision (fp32, fp16, or bf16). Parameters from specific groups are not modified unless they are in a specified group ('blocks.'). If RUN_DEVICE is 'cuda', parameters are moved to GPU if they're not the embedding layer weight.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":61-84", + "content": " if DEBUG_TIME:\n print(x, w[x].numpy())\n if '.time_decay' in x:\n w[x] = w[x].float()\n w[x] = -torch.exp(w[x])\n elif '.time_first' in x:\n w[x] = w[x].float()\n else:\n if self.FLOAT_MODE == \"fp32\":\n w[x] = w[x].float()\n elif self.FLOAT_MODE == \"bf16\":\n w[x] = w[x].bfloat16()\n elif self.FLOAT_MODE == \"fp16\":\n w[x] = w[x].half()\n w[x].requires_grad = False\n if args.RUN_DEVICE == 'cuda' and x != 'emb.weight':\n w[x] = w[x].cuda()\n if ('blocks.' not in x) or ('blocks.0.' in x):\n if print_need_newline:\n print('\\n', end = '')\n print_need_newline = False\n print(x.ljust(40), str(w[x].dtype).replace('torch.', '').ljust(10), w[x].device)" + }, + { + "comment": "This code is organizing and storing weights from a dictionary 'w' into an object 'self.w'. It uses string manipulation to split keys into components, then iteratively creates nested namespaces or dictionaries within self.w according to the key structure. Finally, it sets attributes in each namespace/dictionary using setattr(). 
The code also ensures garbage collection and empties CUDA cache for optimization.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":85-113", + "content": " else:\n print_need_newline = True\n print('.', end = '', flush = True)\n # store weights in self.w\n keys = list(w.keys())\n self.w = types.SimpleNamespace()\n for x in keys:\n xx = x.split('.')\n here = self.w\n for i in range(len(xx)):\n if xx[i].isdigit():\n ii = int(xx[i])\n if ii not in here:\n here[ii] = types.SimpleNamespace()\n here = here[ii]\n else:\n if i == len(xx) - 1:\n setattr(here, xx[i], w[x])\n elif not hasattr(here, xx[i]):\n if xx[i+1].isdigit():\n setattr(here, xx[i], {})\n else:\n setattr(here, xx[i], types.SimpleNamespace())\n here = getattr(here, xx[i])\n self.eval()\n gc.collect()\n torch.cuda.empty_cache()" + }, + { + "comment": "This function, \"FF\", applies a feed-forward network (FFN) to the input tensor 'x' using state information from a previous iteration. It also accounts for different floating point types ('bf16', 'fp16') and performs element-wise operations with learnable weights. 
The resulting output is a product of the input, kernel, and weight matrices, with elements multiplied by sigmoid and squared ReLU activation functions respectively.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":115-139", + "content": " def LN(self, x, w):\n return F.layer_norm(x, (self.args.n_embd,), weight=w.weight, bias=w.bias)\n # state[] 0=ffn_xx 1=att_xx 2=att_aa 3=att_bb 4=att_pp\n @MyFunction\n def FF(self, x, state, i:int, time_mix_k, time_mix_r, kw, vw, rw):\n if self.FLOAT_MODE == \"bf16\":\n xk = x * time_mix_k + state[5*i+0].type(torch.bfloat16) * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0].type(torch.bfloat16) * (1 - time_mix_r)\n state[5*i+0] = x.float()\n elif self.FLOAT_MODE == \"fp16\":\n xk = x * time_mix_k + state[5*i+0].half() * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0].half() * (1 - time_mix_r)\n state[5*i+0] = x.float() \n else:\n xk = x * time_mix_k + state[5*i+0] * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0] * (1 - time_mix_r)\n state[5*i+0] = x\n r = torch.sigmoid(rw @ xr)\n k = torch.square(torch.relu(kw @ xk))\n kv = vw @ k\n return r * kv" + }, + { + "comment": "This function calculates a weighted average of three inputs (x, xv, and xr) based on mixing factors and applies them to state[5*i+1]. Depending on FLOAT_MODE, it performs the calculation with different precision (bf16, fp16, or float32). 
The result is passed through a sigmoid function to obtain the final output r.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":141-159", + "content": " @MyFunction\n def SA(self, x, state, i:int, time_mix_k, time_mix_v, time_mix_r, time_first, time_decay, kw, vw, rw, ow):\n if self.FLOAT_MODE == \"bf16\":\n xk = x * time_mix_k + state[5*i+1].type(torch.bfloat16) * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1].type(torch.bfloat16) * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1].type(torch.bfloat16) * (1 - time_mix_r)\n state[5*i+1] = x.float()\n elif self.FLOAT_MODE == \"fp16\":\n xk = x * time_mix_k + state[5*i+1].half() * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1].half() * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1].half() * (1 - time_mix_r)\n state[5*i+1] = x.float() \n else:\n xk = x * time_mix_k + state[5*i+1] * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1] * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1] * (1 - time_mix_r)\n state[5*i+1] = x\n r = torch.sigmoid(rw @ xr)" + }, + { + "comment": "This code is performing a matrix multiplication operation and then applying exponential functions and maximum operations on the result. 
It also checks the FLOAT_MODE to handle different data types and returns the output of the operation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":160-199", + "content": " k = kw @ xk\n v = vw @ xv\n if '16' in self.FLOAT_MODE:\n kk = k.float()\n vv = v.float()\n else:\n kk = k\n vv = v\n aa = state[5*i+2]\n bb = state[5*i+3]\n pp = state[5*i+4]\n ww = time_first + kk\n p = torch.maximum(pp, ww)\n e1 = torch.exp(pp - p)\n e2 = torch.exp(ww - p)\n a = e1 * aa + e2 * vv\n b = e1 * bb + e2\n ww = pp + time_decay\n p = torch.maximum(ww, kk)\n e1 = torch.exp(ww - p)\n e2 = torch.exp(kk - p)\n state[5*i+2] = e1 * aa + e2 * vv\n state[5*i+3] = e1 * bb + e2\n state[5*i+4] = p\n if self.FLOAT_MODE == \"bf16\":\n wkv = (a / b).type(torch.bfloat16)\n elif self.FLOAT_MODE == \"fp16\":\n wkv = (a / b).half()\n else:\n wkv = a / b\n return ow @ (r * wkv)\n def forward(self, ctx, state, preprocess_only = False):\n with torch.no_grad():\n w = self.w\n args = self.args\n x = w.emb.weight[ctx[-1]]" + }, + { + "comment": "This code segment is responsible for handling the input data and processing it through a sequence of layers in a deep learning model. The RUN_DEVICE variable determines whether the computation should be done on CPU or GPU. Positional embedding is added to the input, and initial state values are set if necessary. 
Finally, the input goes through multiple layers, including attention and feed-forward networks, with appropriate time mixing and normalization.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":200-225", + "content": " if self.RUN_DEVICE == 'cuda':\n x = x.cuda()\n try:\n pos_emb = w.pos_emb[len(ctx)-1]\n x = x + pos_emb\n except:\n pass \n if state == None:\n state = torch.zeros(args.n_layer * 5, args.n_embd, device=self.RUN_DEVICE)\n for i in range(args.n_layer):\n state[5*i+4] -= 1e30\n for i in range(args.n_layer):\n if i == 0:\n x = self.LN(x, w.blocks[i].ln0)\n ww = w.blocks[i].att\n x = x + self.SA(self.LN(x, w.blocks[i].ln1), state, i, \n ww.time_mix_k, ww.time_mix_v, ww.time_mix_r, ww.time_first, ww.time_decay, \n ww.key.weight, ww.value.weight, ww.receptance.weight, ww.output.weight)\n ww = w.blocks[i].ffn\n x = x + self.FF(self.LN(x, w.blocks[i].ln2), state, i, \n ww.time_mix_k, ww.time_mix_r, \n ww.key.weight, ww.value.weight, ww.receptance.weight)" + }, + { + "comment": "This code segment checks if the current layer is a rescaling layer and adjusts the input accordingly. 
If preprocessing only is enabled, it returns the state; otherwise, it applies layer normalization and feed-forward operations before returning the modified input and state.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_run.py\":227-236", + "content": " if (i+1) % RWKV_RESCALE_LAYER == 0:\n x = x / 2\n if preprocess_only:\n return state\n x = self.LN(x, w.ln_out)\n x = w.head.weight @ x\n return x.float(), state" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2b8e46de-a77b-4003-844e-61d6d1d84a35.json b/docs/doc/2b8e46de-a77b-4003-844e-61d6d1d84a35.json new file mode 100644 index 00000000..30e4336c --- /dev/null +++ b/docs/doc/2b8e46de-a77b-4003-844e-61d6d1d84a35.json @@ -0,0 +1,25 @@ +{ + "summary": "This code initializes and trains a RWKV language model, sets layers and context length, creates tokenizer, loads model file, and prints select RWKV-RNN outputs.", + "details": [ + { + "comment": "The code imports necessary libraries and sets environment variables for running the RWKV Language Model, which verifies results from different models to ensure consistency. 
It also specifies the device (CPU or GPU) to run the model and tokenization method (Pile).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/verify.py\":0-26", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\n# this is for verifying the results of different models and make sure they agree with each other\nimport os, sys, types\nimport numpy as np\nimport torch\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\ntry:\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = sys.argv[1]\nexcept:\n pass\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = False\ntorch.backends.cuda.matmul.allow_tf32 = False\nos.environ['RWKV_FLOAT_MODE'] = 'bf16' # bf16 or fp32\nos.environ['RWKV_RUN_DEVICE'] = 'cuda' # currently model_train requires CUDA\nRUN_DEVICE = os.environ['RWKV_RUN_DEVICE']\nTOKEN_MODE = 'pile'\nif TOKEN_MODE == 'pile':\n WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']\n MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221003-6783'" + }, + { + "comment": "This code initializes a RWKV model for training. It sets the number of layers, embedding dimension, and context length. The tokenizer is created based on the given word name and unknown character. The environment variables are set to define the maximum context length and enable JIT compilation. 
The model is then loaded from the specified file using appropriate float mode.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/verify.py\":27-65", + "content": " n_layer = 32\n n_embd = 2560\n ctx_len = 1024\n UNKNOWN_CHAR = None\nfrom src.utils import TOKENIZER\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nif TOKEN_MODE == 'pile':\n tokenizer.vocab_size = 50277\n########################################################################################################\nos.environ[\"RWKV_JIT_ON\"] = \"1\"\nos.environ[\"RWKV_T_MAX\"] = str(ctx_len)\nfrom src.model_run import RWKV_RNN\nfrom src.model import RWKV\nargs = types.SimpleNamespace()\nargs.vocab_size = tokenizer.vocab_size\nargs.ctx_len = ctx_len\nargs.n_embd = n_embd\nargs.n_layer = n_layer\nargs.head_qk = 0\nargs.pre_ffn = 0\nargs.grad_cp = 0\nargs.my_pos_emb = 0\nmodel_train = RWKV(args).to(RUN_DEVICE)\nif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n model_train = model_train.half()\nelif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n model_train = model_train.bfloat16()\nprint('loading ' + MODEL_NAME)\nm2 = torch.load(MODEL_NAME + '.pth', map_location='cpu')\nmodel_train.load_state_dict(m2)\nif os.environ['RWKV_FLOAT_MODE'] == 'fp16':" + }, + { + "comment": "This code checks the RWKV_FLOAT_MODE environment variable and sets the appropriate float mode for the model_train. 
It then verifies the device and float mode, encodes a context string into tokens using the tokenizer, and generates output from the model_train in forward pass with no gradient calculation (torch.no_grad()).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/verify.py\":66-90", + "content": " model_train = model_train.half()\nelif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n model_train = model_train.bfloat16()\nargs.MODEL_NAME = MODEL_NAME\nargs.RUN_DEVICE = RUN_DEVICE\nargs.FLOAT_MODE = os.environ['RWKV_FLOAT_MODE']\nmodel_rnn = RWKV_RNN(args)\n########################################################################################################\nprint(f\"\\nVerifying {os.environ['RWKV_RUN_DEVICE']} {os.environ['RWKV_FLOAT_MODE']}\")\n# context = '\\nIn a'\ncontext = '\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'\nif TOKEN_MODE == 'pile':\n ctx = tokenizer.tokenizer.encode(context)\nprint(f'input len {len(ctx)} data {ctx}')\n########################################################################################################\nwith torch.no_grad():\n print('\\nRWKV-train output')\n out = model_train.forward(torch.tensor([ctx]).to(RUN_DEVICE))[0].detach().cpu().float().numpy()" + }, + { + "comment": "This code prints RWKV-RNN output at certain positions in the sequence. It uses a for loop to iterate through the context, calling the forward function of model_rnn. 
The first three outputs and the third one are printed using `print(out.detach().cpu().numpy())`, with ellipsis ('...') printed after the second output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/verify.py\":91-103", + "content": " print(out, '\\n')\n print('\\nRWKV-RNN output')\n state = None\n out = None\n src_len = len(ctx)\n for i in range(src_len):\n x = ctx[:i+1]\n out, state = model_rnn.forward(x, state)\n if i < 3 or i >= src_len - 3:\n print(out.detach().cpu().numpy())\n if i == 2:\n print('...')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/2d012723-4800-4e6e-958f-4303d88a1543.json b/docs/doc/2d012723-4800-4e6e-958f-4303d88a1543.json new file mode 100644 index 00000000..d0dc29cb --- /dev/null +++ b/docs/doc/2d012723-4800-4e6e-958f-4303d88a1543.json @@ -0,0 +1,65 @@ +{ + "summary": "The code sets up an RWKV Language Model environment for English/Chinese, creates a multilingual chatbot in Python with response generation commands and independent question prompts, handling user input and generating text from the model.", + "details": [ + { + "comment": "Loading RWKV Language Model and setting up environment with specified device, float mode, and tokenizer for English/Chinese language.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":0-30", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nprint('Loading...')\nfrom src.model_run import RWKV_RNN\nimport numpy as np\nimport os, copy, types, gc, sys\nimport torch\nfrom src.utils import TOKENIZER\ntry:\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = sys.argv[1]\nexcept:\n pass\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = 
True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nCHAT_LANG = 'English' # English Chinese\nWORD_NAME = [\n \"20B_tokenizer.json\",\n \"20B_tokenizer.json\",\n] # [vocab, vocab] for Pile model\nUNKNOWN_CHAR = None\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nargs = types.SimpleNamespace()\nargs.RUN_DEVICE = \"cuda\" # 'cpu' (already very fast) // 'cuda'\nargs.FLOAT_MODE = \"fp16\" # fp32 (good for CPU) // fp16 (recommended for GPU) // bf16 (less accurate)" + }, + { + "comment": "Code is setting hyperparameters for RWKV-v4neo model, including vocabulary size, dimensions, and layers. It also has multiple conditionals to change these values based on the chat language, and provides aliases for user and bot.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":31-58", + "content": "args.vocab_size = 50277\nargs.head_qk = 0\nargs.pre_ffn = 0\nargs.grad_cp = 0\nargs.my_pos_emb = 0\nargs.MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-14b/RWKV-4-Pile-14B-20230108-5170'\nargs.n_layer = 40\nargs.n_embd = 5120\nargs.ctx_len = 1024\n# args.MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-7b/RWKV-4-Pile-7B-20221115-8047'\n# args.n_layer = 32\n# args.n_embd = 4096\n# args.ctx_len = 1024\n# args.MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221008-8023'\n# args.n_layer = 32\n# args.n_embd = 2560\n# args.ctx_len = 1024\nif CHAT_LANG == 'English':\n user = \"User\"\n bot = \"Bot\"\n interface = \":\"\n # The following is a verbose and detailed conversation between an AI assistant called {bot}, and a human user called {user}. {bot} is intelligent, knowledgeable, wise and polite.\n # The following is a conversation between a highly knowledgeable and intelligent AI called {bot}, and a human called {user}. 
In the following interactions, {user} and {bot} converse in natural language, and {bot}" + }, + { + "comment": "This code contains a sample conversation between an AI assistant named {bot} and a user. The assistant provides answers to questions, is respectful and polite, and always tells the truth. The code also includes instructions for using the chat functionality and commands like 'say something' to initiate the conversation with the bot.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":58-85", + "content": " do its best to answer {user}'s questions. {bot} is respectful, polite and inclusive. {bot} knows a lot, and always tells the truth.\n init_prompt = f'''\nThe following is a verbose and detailed conversation between an AI assistant called {bot}, and a human user called {user}. {bot} is intelligent, knowledgeable, wise and polite.\n{user}{interface} french revolution what year\n{bot}{interface} The French Revolution started in 1789, and lasted 10 years until 1799.\n{user}{interface} 3+5=?\n{bot}{interface} The answer is 8.\n{user}{interface} guess i marry who ?\n{bot}{interface} Only if you tell me more about yourself - what are your interests?\n{user}{interface} solve for a: 9-a=2\n{bot}{interface} The answer is a = 7, because 9 - 7 = 2.\n{user}{interface} wat is lhc\n{bot}{interface} LHC is a high-energy particle collider, built by CERN, and completed in 2008. They used it to confirm the existence of the Higgs boson in 2012.\n'''\n HELP_MSG = '''Commands:\nsay something --> chat with bot. use \\\\n for new line." + }, + { + "comment": "This code is for a chatbot implemented in Python using the RWKV-v4neo language model. It supports Chinese and English languages, allowing users to ask questions or generate free text. 
The code provides specific parameters and prompts for Chinese and English interactions, with the ability to reset the bot's memory, and instructions on how to use the chatbot effectively.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":86-117", + "content": "+alt --> alternate chat reply\n+reset --> reset chat\n+gen YOUR PROMPT --> free generation with any prompt. use \\\\n for new line.\n+qa YOUR QUESTION --> free generation - ask any question (just ask the question). use \\\\n for new line.\n+more --> continue last free generation (only for +gen / +qa)\n+retry --> retry last free generation (only for +gen / +qa)\nNow talk with the bot and enjoy. Remember to +reset periodically to clean up the bot's memory. Use RWKV-4 14B for best results.\nThis is not instruct-tuned for conversation yet, so don't expect good quality. Better use +gen for free generation.\n'''\nelif CHAT_LANG == 'Chinese':\n args.MODEL_NAME = '/fsx/BlinkDL/CODE/_PUBLIC_/RWKV-LM/RWKV-v4neo/7-run3z/rwkv-293'\n args.n_layer = 32\n args.n_embd = 4096\n args.ctx_len = 1024\n user = \"Q\"\n bot = \"A\"\n interface = \":\"\n init_prompt = '''\nQ: \u4f01\u9e45\u4f1a\u98de\u5417\uff1f\nA: \u4f01\u9e45\u662f\u4e0d\u4f1a\u98de\u7684\u3002\u5b83\u4eec\u7684\u7fc5\u8180\u4e3b\u8981\u7528\u4e8e\u6e38\u6cf3\u548c\u5e73\u8861\uff0c\u800c\u4e0d\u662f\u98de\u884c\u3002\nQ: \u897f\u74dc\u662f\u4ec0\u4e48\nA: \u897f\u74dc\u662f\u4e00\u79cd\u5e38\u89c1\u7684\u6c34\u679c\uff0c\u662f\u4e00\u79cd\u591a\u5e74\u751f\u8513\u751f\u85e4\u672c\u690d\u7269\u3002\u897f\u74dc\u7684\u679c\u5b9e\u5448\u5706\u5f62\u6216\u5375\u5f62\uff0c\u901a\u5e38\u662f\u7eff\u8272\u7684\uff0c\u91cc\u9762\u6709\u7ea2\u8272\u6216\u9ec4\u8272\u7684\u8089\u548c\u5f88\u591a\u7684\u7c7d\u3002\u897f\u74dc\u5473\u751c\uff0c\u591a\u5403\u53ef\u4ee5\u589e\u52a0\u6c34\u5206\uff0c\u662f\u590f\u5b63\u975e\u5e38\u53d7\u6b22\u8fce\u7684\u6c34\u679c\u4e4b\u4e00\u3002\n'''\n HELP_MSG = '''\u6307\u4ee4:" + }, + { + "comment": "This 
code is part of an interactive chatbot that uses the RWKV model for natural language processing. It supports various commands to generate or reset responses, and prompts for independent questions. The code loads the model and defines a function run_rnn() which takes input tokens, processes them through the RWKV model, and generates output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":118-155", + "content": "\u76f4\u63a5\u8f93\u5165\u5185\u5bb9 --> \u548c\u673a\u5668\u4eba\u804a\u5929\uff0c\u7528\\\\n\u4ee3\u8868\u6362\u884c\n+alt --> \u8ba9\u673a\u5668\u4eba\u6362\u4e2a\u56de\u7b54\n+reset --> \u91cd\u7f6e\u5bf9\u8bdd\n+gen \u67d0\u67d0\u5185\u5bb9 --> \u7eed\u5199\u4efb\u4f55\u4e2d\u82f1\u6587\u5185\u5bb9\uff0c\u7528\\\\n\u4ee3\u8868\u6362\u884c\n+qa \u67d0\u67d0\u95ee\u9898 --> \u95ee\u72ec\u7acb\u7684\u95ee\u9898\uff08\u5ffd\u7565\u4e0a\u4e0b\u6587\uff09\uff0c\u7528\\\\n\u4ee3\u8868\u6362\u884c\n+more --> \u7ee7\u7eed +gen / +qa \u7684\u56de\u7b54\n+retry --> \u6362\u4e2a +gen / +qa \u7684\u56de\u7b54\n\u73b0\u5728\u53ef\u4ee5\u8f93\u5165\u5185\u5bb9\u548c\u673a\u5668\u4eba\u804a\u5929\uff08\u6ce8\u610f\u5b83\u4e0d\u600e\u4e48\u61c2\u4e2d\u6587\uff0c\u5b83\u53ef\u80fd\u66f4\u61c2\u82f1\u6587\uff09\u3002\u8bf7\u7ecf\u5e38\u4f7f\u7528 +reset \u91cd\u7f6e\u673a\u5668\u4eba\u8bb0\u5fc6\u3002\n'''\n# Load Model\nos.environ[\"RWKV_RUN_DEVICE\"] = args.RUN_DEVICE\nMODEL_NAME = args.MODEL_NAME\nprint(f'loading... 
{MODEL_NAME}')\nmodel = RWKV_RNN(args)\nmodel_tokens = []\ncurrent_state = None\n########################################################################################################\ndef run_rnn(tokens, newline_adj = 0):\n global model_tokens, current_state\n for i in range(len(tokens)):\n model_tokens += [int(tokens[i])]\n if i == len(tokens) - 1:\n out, current_state = model.forward(model_tokens, current_state)\n else:\n current_state = model.forward(model_tokens, current_state, preprocess_only = True)\n # print(f'### model ###\\n[{tokenizer.tokenizer.decode(model_tokens)}]')\n out[0] = -999999999 # disable <|endoftext|>" + }, + { + "comment": "This code is used for saving and loading all-state, running inference, and replying to messages. The all-state contains the RNN state, model tokens, and output. Inference is run on a prompt, and the output is saved and printed with decoded tokens. The reply_msg function can be used to reply to received messages.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":156-193", + "content": " out[187] += newline_adj\n # if newline_adj > 0:\n # out[15] += newline_adj / 2 # '.'\n return out\nall_state = {}\ndef save_all_stat(srv, name, last_out):\n n = f'{name}_{srv}'\n all_state[n] = {}\n all_state[n]['out'] = last_out\n all_state[n]['rnn'] = copy.deepcopy(current_state)\n all_state[n]['token'] = copy.deepcopy(model_tokens)\ndef load_all_stat(srv, name):\n global model_tokens, current_state\n n = f'{name}_{srv}'\n current_state = copy.deepcopy(all_state[n]['rnn'])\n model_tokens = copy.deepcopy(all_state[n]['token'])\n return all_state[n]['out']\n########################################################################################################\n# Run inference\nprint(f'\\nRun prompt...')\nout = run_rnn(tokenizer.tokenizer.encode(init_prompt))\ngc.collect()\ntorch.cuda.empty_cache()\nsave_all_stat('', 'chat_init', out)\nsrv_list = ['dummy_server']\nfor s in srv_list:\n save_all_stat(s, 
'chat', out)\nprint(f'### prompt ###\\n[{tokenizer.tokenizer.decode(model_tokens)}]\\n')\ndef reply_msg(msg):" + }, + { + "comment": "This code defines a function `on_message()` that processes incoming messages. It checks if the message is longer than 1000 tokens, applies temperature and top-p sampling parameters, handles reset requests, and possibly generates new text or asks questions based on message content. The processing includes loading and saving chat state, printing messages to console, and sending appropriate replies.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":194-229", + "content": " print(f'{bot}{interface} {msg}\\n')\ndef on_message(message):\n global model_tokens, current_state\n srv = 'dummy_server'\n msg = message.replace('\\\\n','\\n').strip()\n if len(msg) > 1000:\n reply_msg('your message is too long (max 1000 tokens)')\n return\n x_temp = 1.0\n x_top_p = 0.85\n if (\"-temp=\" in msg):\n x_temp = float(msg.split(\"-temp=\")[1].split(\" \")[0])\n msg = msg.replace(\"-temp=\"+f'{x_temp:g}', \"\")\n # print(f\"temp: {x_temp}\")\n if (\"-top_p=\" in msg):\n x_top_p = float(msg.split(\"-top_p=\")[1].split(\" \")[0])\n msg = msg.replace(\"-top_p=\"+f'{x_top_p:g}', \"\")\n # print(f\"top_p: {x_top_p}\")\n if x_temp <= 0.2:\n x_temp = 0.2\n if x_temp >= 5:\n x_temp = 5\n if x_top_p <= 0:\n x_top_p = 0\n if msg == '+reset':\n out = load_all_stat('', 'chat_init')\n save_all_stat(srv, 'chat', out)\n reply_msg(\"Chat reset.\")\n return\n elif msg[:5].lower() == '+gen ' or msg[:4].lower() == '+qa ' or msg.lower() == '+more' or msg.lower() == '+retry':" + }, + { + "comment": "This code checks if the message starts with \"+gen\", \"+qa\", or \"+more\" and performs corresponding actions. If \"+gen\" is found, it generates a response based on the given prompt. If \"+qa\" is found, it loads previous chat context and continues the conversation. If \"+more\" is found, it displays more content related to the current context. 
It utilizes tokenizer for encoding messages and run_rnn to generate responses. The generated responses are saved in 'gen_0' state for future reference.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":231-254", + "content": " if msg[:5].lower() == '+gen ':\n new = '\\n' + msg[5:].strip()\n # print(f'### prompt ###\\n[{new}]')\n current_state = None\n out = run_rnn(tokenizer.tokenizer.encode(new))\n save_all_stat(srv, 'gen_0', out)\n elif msg[:4].lower() == '+qa ':\n out = load_all_stat('', 'chat_init')\n real_msg = msg[4:].strip()\n new = f\"{user}{interface} {real_msg}\\n\\n{bot}{interface}\"\n # print(f'### qa ###\\n[{new}]')\n out = run_rnn(tokenizer.tokenizer.encode(new))\n save_all_stat(srv, 'gen_0', out)\n # new = f\"\\nThe following is an excellent Q&A session consists of detailed and factual information.\\n\\nQ: What is 3+5?\\nA: The answer is 8.\\n\\nQ: {msg[9:].strip()}\\nA:\"\n # print(f'### prompt ###\\n[{new}]')\n # current_state = None\n # out = run_rnn(tokenizer.tokenizer.encode(new))\n # save_all_stat(srv, 'gen_0', out)\n elif msg.lower() == '+more':" + }, + { + "comment": "The code is performing the following tasks:\n1. Loading statistics from a server and saving them, with error handling for any exceptions that might occur.\n2. Handling user input, specifically \"+retry\" command, by loading saved statistics from an alternative source if an exception occurs during the initial load.\n3. Generating tokens using tokenizer's sample_logits function, considering various parameters like temperature and top_p values.\n4. Running RNN model on generated tokens to process them, handling different cases based on user input.\n5. 
Printing generated output, handling special characters, and updating the tracking variables for the next iteration.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":255-288", + "content": " try:\n out = load_all_stat(srv, 'gen_1')\n save_all_stat(srv, 'gen_0', out)\n except:\n return\n elif msg.lower() == '+retry':\n try:\n out = load_all_stat(srv, 'gen_0')\n except:\n return\n begin = len(model_tokens)\n out_last = begin\n for i in range(150):\n token = tokenizer.sample_logits(\n out,\n model_tokens,\n args.ctx_len,\n temperature=x_temp,\n top_p_usual=x_top_p,\n top_p_newline=x_top_p,\n )\n if msg[:4].lower() == '+qa ':\n out = run_rnn([token], newline_adj=-1)\n else:\n out = run_rnn([token])\n xxx = tokenizer.tokenizer.decode(model_tokens[out_last:])\n if '\\ufffd' not in xxx:\n print(xxx, end='', flush=True)\n out_last = begin + i + 1\n print('\\n')\n # send_msg = tokenizer.tokenizer.decode(model_tokens[begin:]).strip()" + }, + { + "comment": "This code handles two cases: sending a message and adding a message to the chat history. If the message is \"+alt\", it loads the previous chat state. Otherwise, it loads the current chat state, generates a new message using an RNN model, saves the new message in the \"chat_pre\" file, and prints part of the output. The code also determines the appropriate newline adjustment for the generated text. 
It uses tokenizer.sample_logits to generate the next token based on the current state of the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":289-318", + "content": " # print(f'### send ###\\n[{send_msg}]')\n # reply_msg(send_msg)\n save_all_stat(srv, 'gen_1', out)\n else:\n if msg.lower() == '+alt':\n try:\n out = load_all_stat(srv, 'chat_pre')\n except:\n return\n else:\n out = load_all_stat(srv, 'chat')\n new = f\"{user}{interface} {msg}\\n\\n{bot}{interface}\"\n # print(f'### add ###\\n[{new}]')\n out = run_rnn(tokenizer.tokenizer.encode(new), newline_adj=-999999999)\n save_all_stat(srv, 'chat_pre', out)\n begin = len(model_tokens)\n out_last = begin\n print(f'{bot}{interface}', end='', flush=True)\n for i in range(999):\n if i <= 0:\n newline_adj = -999999999\n elif i <= 30:\n newline_adj = (i - 30) / 10\n elif i <= 130:\n newline_adj = 0\n else:\n newline_adj = (i - 130) * 0.25 # MUST END THE GENERATION\n token = tokenizer.sample_logits(" + }, + { + "comment": "This code is responsible for generating text from a model, tokenizing the output, and printing it until a newline or specified context length is reached. 
It also handles breaking the loop when encountering a double newline or specific user/bot messages.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":319-343", + "content": " out,\n model_tokens,\n args.ctx_len,\n temperature=x_temp,\n top_p_usual=x_top_p,\n top_p_newline=x_top_p,\n )\n out = run_rnn([token], newline_adj=newline_adj)\n xxx = tokenizer.tokenizer.decode(model_tokens[out_last:])\n if '\\ufffd' not in xxx:\n print(xxx, end='', flush=True)\n out_last = begin + i + 1\n send_msg = tokenizer.tokenizer.decode(model_tokens[begin:])\n if '\\n\\n' in send_msg:\n send_msg = send_msg.strip()\n break\n # send_msg = tokenizer.tokenizer.decode(model_tokens[begin:]).strip()\n # if send_msg.endswith(f'{user}{interface}'): # warning: needs to fix state too !!!\n # send_msg = send_msg[:-len(f'{user}{interface}')].strip()\n # break\n # if send_msg.endswith(f'{bot}{interface}'):\n # send_msg = send_msg[:-len(f'{bot}{interface}')].strip()" + }, + { + "comment": "The code handles user input, continuously prompts the user for messages, passes them to a function `on_message()`, and saves chat data if necessary. 
If no valid input is provided, it prints an error message.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/chat.py\":344-360", + "content": " # break\n # print(f'{model_tokens}')\n # print(f'[{tokenizer.tokenizer.decode(model_tokens)}]')\n # print(f'### send ###\\n[{send_msg}]')\n # reply_msg(send_msg)\n save_all_stat(srv, 'chat', out)\nprint(HELP_MSG)\nwhile True:\n msg = input(f'{user}{interface} ')\n if len(msg.strip()) > 0:\n on_message(msg)\n else:\n print('Erorr: please say something')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/32cc1444-57e1-48cb-a332-613072974754.json b/docs/doc/32cc1444-57e1-48cb-a332-613072974754.json new file mode 100644 index 00000000..80e91fc0 --- /dev/null +++ b/docs/doc/32cc1444-57e1-48cb-a332-613072974754.json @@ -0,0 +1,35 @@ +{ + "summary": "The code imports libraries, defines a Dataset class for data handling, sets vocabulary size, generates unique tokens, maps characters to integers, prints data and token sizes, initializes a tokenizer, calculates dataset length, samples logits, applies softmax with soft constraints on newlines, includes \"probs_sample\" function, converts tensor to float value, and sets random seeds for Python, numpy, and PyTorch.", + "details": [ + { + "comment": "The code is importing necessary libraries and defining a class Dataset for handling data. It checks the number of GPUs, reads input data, and determines the vocabulary size. 
It prints current vocabulary size and data token count for verification.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/utils.py\":0-28", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\ntry:\n NUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])\nexcept:\n NUM_GPUS = 1\nimport json\nimport random\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset\nclass Dataset(Dataset):\n def __init__(self, data, ctx_len, epoch_length_fixed):\n self.ctx_len = ctx_len\n self.epoch_length_fixed = epoch_length_fixed\n self.data = data\n if 'MMapIndexedDataset' in str(type(self.data)):\n self.vocab_size = int(os.environ['VOCAB_SIZE'])\n print('current vocab size =', self.vocab_size, \"(make sure it's correct)\")\n self.data_size = len(self.data._bin_buffer) // 2\n print(f'data has {self.data_size} tokens.')\n elif 'numpy' in str(type(self.data)):" + }, + { + "comment": "This code sets the vocabulary size based on environment variable 'VOCAB_SIZE'. If the size is not specified, it generates a unique token list from data and stores it in 'vocab.json', then calculates the vocabulary and data sizes. It also maps characters to unique integers and inverse mapping. 
Finally, it prints the data size and number of unique tokens.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/utils.py\":29-52", + "content": " self.vocab_size = int(os.environ['VOCAB_SIZE'])\n print('current vocab size =', self.vocab_size, \"(make sure it's correct)\")\n self.data_size = len(self.data)\n print(f'data has {self.data_size} tokens.')\n else:\n print('building token list...', end=' ')\n unique = sorted(list(set(data)))\n self.vocab_size = len(unique)\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n self.data_size = len(self.data)\n print('data has %d tokens, %d unique.' % (self.data_size, self.vocab_size))\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}" + }, + { + "comment": "This code defines a class that initializes a tokenizer using pre-trained word embeddings. It also includes methods to calculate the length and retrieve items from the dataset, handling different data types like MMapIndexedDataset or numpy arrays. 
The tokenizer is initialized with a list of words or a PreTrainedTokenizerFast instance based on the input argument.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/utils.py\":54-80", + "content": " def __len__(self):\n return self.epoch_length_fixed // NUM_GPUS\n def __getitem__(self, idx):\n #\n # we are cheating: pick a random spot in dataset\n #\n i = np.random.randint(0, self.data_size - (self.ctx_len + 1))\n if 'MMapIndexedDataset' in str(type(self.data)):\n dix = self.data.get(idx=0, offset=i, length=self.ctx_len + 1).astype(int)\n elif 'numpy' in str(type(self.data)):\n dix = self.data[i:i+self.ctx_len+1]\n else:\n dix = [self.stoi[s] for s in self.data[i:i+self.ctx_len+1]]\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n return x, y\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n if 'list' in str(type(WORD_NAME)):\n self.charMode = False\n if WORD_NAME[0] == WORD_NAME[1]:\n from transformers import PreTrainedTokenizerFast\n self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])" + }, + { + "comment": "This code checks if a tokenizer or word table is provided, initializes them accordingly and sets the vocabulary size. 
It also refines the context input by stripping unnecessary characters and returning an empty string if no content is found.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/utils.py\":81-105", + "content": " else:\n from transformers import GPT2TokenizerFast\n self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])\n self.vocab_size = len(self.tokenizer)\n else:\n self.charMode = True\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)\n self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context" + }, + { + "comment": "This function samples logits from the output of the model and applies softmax to obtain probabilities. It handles newlines by changing the top_p value when last character is a newline, otherwise it uses top_p_usual. 
It then sorts probabilities in descending order, finds the cutoff for sampling, and sets probabilities below this cutoff to 0.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/utils.py\":107-136", + "content": " def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(torch.tensor(out), dim=-1)\n if self.charMode:\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n else:\n top_p = top_p_usual\n sorted_probs, s_index = torch.sort(probs, descending=True)\n # for j in range(30):\n # pp = sorted_probs[j].item()\n # if pp < 0.005:\n # break\n # ss = self.itos[int(s_index[j])].replace('\\n','_')\n # print(f'{math.floor(pp*100):>3.0f}{ss}', end='')\n # print('')\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n # print(\"[\" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + \"]\", end = \"\")" + }, + { + "comment": "This code snippet contains three functions: \"probs_sample\" which samples one sample from the multinomial distribution if temperature is not 1, \"to_float\" that converts a tensor to a float value, and \"set_seed\" for setting random seeds in Python, numpy, and PyTorch.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/utils.py\":138-152", + "content": " if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n return torch.multinomial(probs, num_samples=1)[0]\ndef to_float(x):\n return x.cpu().detach().numpy().flatten()[0].astype(float)\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3575d866-9f08-4fbb-bc51-0eeebf5d60da.json b/docs/doc/3575d866-9f08-4fbb-bc51-0eeebf5d60da.json new 
file mode 100644 index 00000000..ab43972d --- /dev/null +++ b/docs/doc/3575d866-9f08-4fbb-bc51-0eeebf5d60da.json @@ -0,0 +1,30 @@ +{ + "summary": "The code performs matrix multiplication, computes gradients for neural network backward pass, and optimally initializes variables for CUDA implementation of RWKV model's forward and backward passes using efficient execution configuration.", + "details": [ + { + "comment": "Kernel function for matrix multiplication with accumulation of running sums and avoiding overflow by dividing the sums by exp(pp).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda.cu\":0-30", + "content": "#include \n#include \n#define MIN_VALUE (-1e38)\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,\n F *__restrict__ const _y) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n F *__restrict__ const y = _y + _offset;\n // aa and bb are running sums divided by exp(pp) (to avoid overflow)\n F aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const F kk = k[ii];\n const F vv = v[ii];\n F ww = u + kk;\n F p = max(pp, ww);\n F e1 = exp(pp - p);\n F e2 = exp(ww - p);" + }, + { + "comment": "This code performs a matrix multiplication and subsequent calculations to compute gradients for the backward pass in a neural network. The kernel function takes input weights, input activations, output activations, and their respective gradients as inputs, and computes gradients for the output weights and input weights. 
The comments should reflect this understanding of what the code is doing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda.cu\":31-57", + "content": " y[ii] = (e1 * aa + e2 * vv) / (e1 * bb + e2);\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,\n const F *__restrict__ const _y, const F *__restrict__ const _gy,\n F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, F *__restrict__ const _gv) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n const F *__restrict__ const y = _y + _offset;" + }, + { + "comment": "This code segment initializes variables and iterates over the data to calculate values for gw, gu, q, and r. It then assigns these calculated values to their respective locations in memory. 
The multiplication by w is because of the transformation from forward pass in python.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda.cu\":58-93", + "content": " const F *__restrict__ const gy = _gy + _offset;\n F *__restrict__ const gk = _gk + _offset;\n F *__restrict__ const gv = _gv + _offset;\n F q[Tmax], r[Tmax];\n F gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const F kk = k[ii];\n const F vv = v[ii];\n const F yy = y[ii];\n F ww = u + kk;\n F p = max(pp, ww);\n F e1 = exp(pp - p);\n F e2 = exp(ww - p);\n const F qq = gy[ii] / (e1 * bb + e2);\n gw += (ga - gb * yy) * e1 * qq;\n gu += (vv - yy) * e2 * qq;\n q[i] = qq;\n r[i] = ww - p;\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n ga = e1 * (aa + ga);\n gb = e1 * (bb + gb);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n const int _offsetBC = _b * C + _c;\n _gw[_offsetBC] = gw * _w[_c]; // multiply by w because of w -> -exp(w) in python forward()\n _gu[_offsetBC] = gu;" + }, + { + "comment": "The code initializes variables and performs a matrix operation for the CUDA implementation of RWKV model's forward and backward passes. The function `cuda_forward` launches a CUDA kernel that processes each block with 32 threads, while `cuda_backward` computes gradients for the same matrix operations as the forward pass, storing them in `gw`, `gu`, `gk`, `gv`. The computation involves element-wise multiplication and addition using `e1` and `e2`, respectively. 
The function `cuda_forward` requires `--maxrregcount 60` for optimal performance due to register usage.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda.cu\":95-127", + "content": " aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = T - 1; i >= 0; i--) {\n const int ii = i * C;\n const F kk = k[ii];\n const F vv = v[ii];\n const F yy = y[ii];\n const F qq = q[i];\n const F rr = r[i];\n F e1 = qq * exp(rr);\n F e2 = exp(kk + pp);\n gk[ii] = e1 * (vv - yy) + e2 * (aa * vv + bb);\n gv[ii] = e1 + e2 * aa;\n const F ww = w + pp;\n const F www = rr - u - kk;\n const F p = max(ww, www);\n e1 = exp(ww - p);\n e2 = qq * exp(www - p);\n aa = e1 * aa + e2;\n bb = e1 * bb - e2 * yy;\n pp = p;\n }\n}\nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_forward<< >>(B, T, C, w, u, k, v, y);\n}\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv) {" + }, + { + "comment": "The code sets up the execution configuration for a CUDA kernel function named 'kernel_backward'. It determines the optimal number of threads per block (min(C, 32)) and ensures that the number of blocks is proportional to B*C, ensuring even distribution of tasks. 
It then launches the kernel function on the specified number of blocks and threads.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda.cu\":128-132", + "content": " dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_backward<< >>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3c91a0f9-5c3b-4e44-87fc-f2ef742f9fe4.json b/docs/doc/3c91a0f9-5c3b-4e44-87fc-f2ef742f9fe4.json new file mode 100644 index 00000000..87f09195 --- /dev/null +++ b/docs/doc/3c91a0f9-5c3b-4e44-87fc-f2ef742f9fe4.json @@ -0,0 +1,15 @@ +{ + "summary": "The code defines CUDA functions for forward and backward torch Tensor computations, utilizing BF16 data type. Python bindings are created for these functions.", + "details": [ + { + "comment": "This code defines a forward and backward function for torch Tensors using CUDA. The forward function calls a CUDA kernel function to compute the output tensor y based on inputs w, u, k, v. The backward function also calls a CUDA kernel function to compute gradients for w, u, k, v, gw, gu, gk, gv. 
BF16 (BFloat16) is used as a data type for some tensor operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_op_bf16.cpp\":0-13", + "content": "#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\nvoid cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y);\nvoid cuda_backward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv);\nvoid forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {\n cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y,\n torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {\n cuda_backward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr (),\n gy.data_ptr (), gw.data_ptr (), gu.data_ptr (), gk.data_ptr (), gv.data_ptr ());" + }, + { + "comment": "Defining Python bindings for the forward and backward functions of the wkv extension module.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_op_bf16.cpp\":14-24", + "content": "}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv forward\");\n m.def(\"backward\", &backward, \"wkv backward\");\n}\nTORCH_LIBRARY(wkv, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/3caccbd2-eaf3-476e-99b4-018f8b124959.json b/docs/doc/3caccbd2-eaf3-476e-99b4-018f8b124959.json new file mode 100644 index 00000000..e1db0419 --- /dev/null +++ b/docs/doc/3caccbd2-eaf3-476e-99b4-018f8b124959.json @@ -0,0 +1,80 @@ +{ + "summary": "The code initializes RWKV Language Model, uses Pytorch Lightning for training and 
handles command line arguments. It optimizes performance by loading checkpoints, handling exceptions, setting trainer parameters and using Deepspeed optimization.", + "details": [ + { + "comment": "This code initializes the RWKV Language Model, sets up logging, and parses command-line arguments for loading a model, using Wandb, project directory, random seed, and data file. It uses Pytorch Lightning framework for training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":0-22", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport logging\nlogging.basicConfig(level=logging.INFO)\nif __name__ == \"__main__\":\n from argparse import ArgumentParser\n from pytorch_lightning import Trainer\n from pytorch_lightning.utilities import rank_zero_info, rank_zero_only\n import pytorch_lightning as pl\n rank_zero_info(\"########## work in progress ##########\")\n parser = ArgumentParser()\n parser.add_argument(\"--load_model\", default=\"\", type=str) # full path, with .pth\n parser.add_argument(\"--wandb\", default=\"\", type=str) # wandb project name. if \"\" then don't use wandb\n parser.add_argument(\"--proj_dir\", default=\"out\", type=str)\n parser.add_argument(\"--random_seed\", default=\"-1\", type=int)\n parser.add_argument(\"--data_file\", default=\"\", type=str)" + }, + { + "comment": "This code snippet is for argument parsing in the RWKV-LM/RWKV-v5/train.py file. 
It sets default values and types for various training parameters such as data type, vocab size, context length, epoch steps, number of epochs, initial epoch, epoch save frequency, micro batch size, number of layers, embedding dimension, and attention dimensionality.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":23-35", + "content": " parser.add_argument(\"--data_type\", default=\"utf-8\", type=str)\n parser.add_argument(\"--vocab_size\", default=0, type=int) # vocab_size = 0 means auto (for char-level LM and .txt data)\n parser.add_argument(\"--ctx_len\", default=1024, type=int)\n parser.add_argument(\"--epoch_steps\", default=1000, type=int) # a mini \"epoch\" has [epoch_steps] steps\n parser.add_argument(\"--epoch_count\", default=500, type=int) # train for this many \"epochs\". will continue afterwards with lr = lr_final\n parser.add_argument(\"--epoch_begin\", default=0, type=int) # if you load a model trained for x \"epochs\", set epoch_begin = x\n parser.add_argument(\"--epoch_save\", default=5, type=int) # save the model every [epoch_save] \"epochs\"\n parser.add_argument(\"--micro_bsz\", default=12, type=int) # micro batch size (batch size per GPU)\n parser.add_argument(\"--n_layer\", default=6, type=int)\n parser.add_argument(\"--n_embd\", default=512, type=int)\n parser.add_argument(\"--dim_att\", default=0, type=int)" + }, + { + "comment": "This code snippet is from the RWKV-LM's \"train.py\" file and it sets various arguments for model training, such as dimensionality of feedforward network layers (dim_ffn), replacing first attention layer by a feedforward network (pre_ffn), and tricks like my headQK trick (head_qk). The code also specifies parameters for the tiny attention dimension (tiny_att_dim) and layer (tiny_att_layer), learning rate initialization and final values (lr_init, lr_final), warm-up steps, optimizer parameters (beta1, beta2), and Adam epsilon (adam_eps). 
There's also an argument for gradient checkpointing to save VRAM at the cost of increased training time (grad_cp).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":36-48", + "content": " parser.add_argument(\"--dim_ffn\", default=0, type=int)\n parser.add_argument(\"--pre_ffn\", default=0, type=int) # replace first att layer by ffn (sometimes better)\n parser.add_argument(\"--head_qk\", default=0, type=int) # my headQK trick\n parser.add_argument(\"--tiny_att_dim\", default=0, type=int) # tiny attention dim\n parser.add_argument(\"--tiny_att_layer\", default=-999, type=int) # tiny attention @ which layer\n parser.add_argument(\"--lr_init\", default=6e-4, type=float) # 6e-4 for L12-D768, 4e-4 for L24-D1024, 3e-4 for L24-D2048\n parser.add_argument(\"--lr_final\", default=1e-5, type=float)\n parser.add_argument(\"--warmup_steps\", default=-1, type=int) # try 50 if you load a model\n parser.add_argument(\"--beta1\", default=0.9, type=float)\n parser.add_argument(\"--beta2\", default=0.99, type=float) # use 0.999 when your model is close to convergence\n parser.add_argument(\"--adam_eps\", default=1e-8, type=float)\n parser.add_argument(\"--grad_cp\", default=0, type=int) # gradient checkpt: saves VRAM, but slower" + }, + { + "comment": "This code snippet is using argparse in Python to define various command-line arguments for a machine learning model. These arguments control features such as dropout rate, weight decay, and additional customizable settings like text shift, sample length, and more. 
Different values can be tried to optimize the performance of the model during training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":49-62", + "content": " parser.add_argument(\"--dropout\", default=0, type=float) # try 0.01 / 0.02 / 0.05 / 0.1\n parser.add_argument(\"--weight_decay\", default=0, type=float) # try 0.1 / 0.01 / 0.001\n parser.add_argument(\"--weight_decay_final\", default=-1, type=float)\n parser.add_argument(\"--my_pile_version\", default=1, type=int) # my special pile version\n parser.add_argument(\"--my_pile_stage\", default=0, type=int) # my special pile mode\n parser.add_argument(\"--my_pile_shift\", default=-1, type=int) # my special pile mode - text shift\n parser.add_argument(\"--my_pile_edecay\", default=0, type=int)\n parser.add_argument(\"--layerwise_lr\", default=1, type=int) # layerwise lr for faster convergence (but slower it/s)\n parser.add_argument(\"--ds_bucket_mb\", default=200, type=int) # deepspeed bucket size in MB. 200 seems enough\n # parser.add_argument(\"--cuda_cleanup\", default=0, type=int) # extra cuda cleanup (sometimes helpful)\n parser.add_argument(\"--my_sample_len\", default=0, type=int)\n parser.add_argument(\"--my_ffn_shift\", default=1, type=int)" + }, + { + "comment": "This code snippet adds command line arguments to a parser for various settings and configurations in the RWKV-v5 model training. It includes options like my_att_shift, head_size_a, head_size_divisor, my_pos_emb, load_partial, magic_prime, my_qa_mask, my_random_steps, my_testing, my_exit, my_exit_tokens, accelerator, strategy, devices and num_nodes. 
The code also checks if the Python version is 2, in which case it adds arguments for accelerator, strategy, devices and num_nodes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":63-79", + "content": " parser.add_argument(\"--my_att_shift\", default=1, type=int)\n parser.add_argument(\"--head_size_a\", default=64, type=int) # can try larger values for larger models\n parser.add_argument(\"--head_size_divisor\", default=8, type=int)\n parser.add_argument(\"--my_pos_emb\", default=0, type=int)\n parser.add_argument(\"--load_partial\", default=0, type=int)\n parser.add_argument(\"--magic_prime\", default=0, type=int)\n parser.add_argument(\"--my_qa_mask\", default=0, type=int)\n parser.add_argument(\"--my_random_steps\", default=0, type=int)\n parser.add_argument(\"--my_testing\", default='', type=str)\n parser.add_argument(\"--my_exit\", default=99999999, type=int)\n parser.add_argument(\"--my_exit_tokens\", default=0, type=int)\n if pl.__version__[0]=='2':\n parser.add_argument(\"--accelerator\", default=\"gpu\", type=str)\n parser.add_argument(\"--strategy\", default=\"auto\", type=str)\n parser.add_argument(\"--devices\", default=1, type=int)\n parser.add_argument(\"--num_nodes\", default=1, type=int)" + }, + { + "comment": "This code is adding arguments to the argument parser, handling global seed, setting numpy print options, and filtering warnings. The \"--precision\" argument sets the precision type to \"fp16\", and \"--accumulate_grad_batches\" determines the number of gradient accumulation batches. 
The code also imports necessary libraries and handles warnings related to workers and metric tracking.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":80-102", + "content": " parser.add_argument(\"--precision\", default=\"fp16\", type=str)\n parser.add_argument(\"--accumulate_grad_batches\", default=1, type=int)\n else:\n parser = Trainer.add_argparse_args(parser)\n args = parser.parse_args()\n ########################################################################################################\n import os, warnings, math, datetime, sys, time\n import numpy as np\n import torch\n from torch.utils.data import DataLoader\n if \"deepspeed\" in args.strategy:\n import deepspeed\n from pytorch_lightning import seed_everything\n if args.random_seed >= 0:\n print(f\"########## WARNING: GLOBAL SEED {args.random_seed} THIS WILL AFFECT MULTIGPU SAMPLING ##########\\n\" * 3)\n seed_everything(args.random_seed)\n np.set_printoptions(precision=4, suppress=True, linewidth=200)\n warnings.filterwarnings(\"ignore\", \".*Consider increasing the value of the `num_workers` argument*\")\n warnings.filterwarnings(\"ignore\", \".*The progress bar already tracks a metric with the*\")" + }, + { + "comment": "This code sets various arguments for a training script. It enables continuous training (`args.max_epochs = -1`), disables checkpointing and logging, and adjusts the batch size based on the number of nodes and devices. 
It also configures the dimensions of certain layers and sets specific environment variables for the training process.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":103-125", + "content": " # os.environ[\"WDS_SHOW_SEED\"] = \"1\"\n args.my_timestamp = datetime.datetime.today().strftime(\"%Y-%m-%d-%H-%M-%S\")\n args.enable_checkpointing = False\n args.replace_sampler_ddp = False\n args.logger = False\n args.gradient_clip_val = 1.0\n args.num_sanity_val_steps = 0\n args.check_val_every_n_epoch = int(1e20)\n args.log_every_n_steps = int(1e20)\n args.max_epochs = -1 # continue forever\n args.betas = (args.beta1, args.beta2)\n args.real_bsz = int(args.num_nodes) * int(args.devices) * args.micro_bsz\n os.environ[\"RWKV_MY_TESTING\"] = args.my_testing\n os.environ[\"RWKV_HEAD_SIZE_A\"] = str(args.head_size_a)\n if args.dim_att <= 0:\n args.dim_att = args.n_embd\n if args.dim_ffn <= 0:\n args.dim_ffn = int((args.n_embd * 3.5) // 32 * 32) # default = 3.5x emb size\n if args.data_type == \"wds_img\":\n args.run_name = f\"v{args.my_img_version}-{args.my_img_size}-{args.my_img_bit}bit-{args.my_img_clip}x{args.my_img_clip_scale}\"\n args.proj_dir = f\"{args.proj_dir}-{args.run_name}\"" + }, + { + "comment": "This code sets the run name based on certain parameters, creates a project directory if it doesn't exist, and adjusts the magic prime and epoch count for specific stages. 
It also ensures that the number of epoch steps and batch size are correctly set, and finds the latest saved model in the specified project directory.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":126-153", + "content": " else:\n args.run_name = f\"{args.vocab_size} ctx{args.ctx_len} L{args.n_layer} D{args.n_embd}\"\n if not os.path.exists(args.proj_dir):\n os.makedirs(args.proj_dir)\n if args.my_pile_stage > 0:\n magic_prime_bak = args.magic_prime\n if args.my_pile_shift < 0:\n args.my_pile_shift = 0\n if magic_prime_bak > 0:\n args.magic_prime = magic_prime_bak\n if args.my_qa_mask == 2:\n args.epoch_count = 2 * args.magic_prime // 40320\n else:\n args.epoch_count = args.magic_prime // 40320\n args.epoch_steps = 40320 // args.real_bsz\n assert args.epoch_steps * args.real_bsz == 40320\n # if args.my_pile_stage == 2:\n # assert args.lr_final == args.lr_init\n if args.my_pile_stage >= 2: # find latest saved model\n list_p = []\n for p in os.listdir(args.proj_dir):\n if p.startswith(\"rwkv\") and p.endswith(\".pth\"):\n p = ((p.split(\"-\"))[1].split(\".\"))[0]\n if p != \"final\":" + }, + { + "comment": "The code retrieves the maximum value from a list of integers and uses it to determine which model checkpoint file (rwkv-{max_p}.pth or rwkv-init.pth) to load, depending on whether the max value is -1 or not. It also calculates the number of samples and tokens for an epoch based on the provided arguments. 
The code attempts to retrieve the DeepSpeed version but handles exceptions if it fails.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":154-181", + "content": " if p == \"init\":\n p = -1\n else:\n p = int(p)\n list_p += [p]\n list_p.sort()\n max_p = list_p[-1]\n if len(list_p) > 1:\n args.my_pile_prev_p = list_p[-2] # in case max_p is corrupted\n if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n if args.warmup_steps < 0:\n if args.my_pile_stage == 2:\n args.warmup_steps = 10\n else:\n args.warmup_steps = 30\n args.epoch_begin = max_p + 1\n samples_per_epoch = args.epoch_steps * args.real_bsz\n tokens_per_epoch = samples_per_epoch * args.ctx_len\n try:\n deepspeed_version = deepspeed.__version__\n except:\n deepspeed_version = None\n pass\n rank_zero_info(" + }, + { + "comment": "This code block is displaying various configuration details of the RWKV-5 model, including the precision, number of nodes and devices used, batch size, data file and project directory. It also mentions the epoch range, saving frequency, steps per epoch, model architecture, learning rate schedule, Adam optimizer settings, and version information for Torch, Deepspeed, and PyTorch Lightning. 
The recommendation section advises using specific versions of these libraries for optimal performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":182-199", + "content": " f\"\"\"\n############################################################################\n#\n# RWKV-5 {args.precision.upper()} on {args.num_nodes}x{args.devices} {args.accelerator.upper()}, bsz {args.num_nodes}x{args.devices}x{args.micro_bsz}={args.real_bsz}, {args.strategy} {'with grad_cp' if args.grad_cp > 0 else ''}\n#\n# Data = {args.data_file} ({args.data_type}), ProjDir = {args.proj_dir}\n#\n# Epoch = {args.epoch_begin} to {args.epoch_begin + args.epoch_count - 1} (will continue afterwards), save every {args.epoch_save} epoch\n#\n# Each \"epoch\" = {args.epoch_steps} steps, {samples_per_epoch} samples, {tokens_per_epoch} tokens\n#\n# Model = {args.n_layer} n_layer, {args.n_embd} n_embd, {args.ctx_len} ctx_len\n#\n# Adam = lr {args.lr_init} to {args.lr_final}, warmup {args.warmup_steps} steps, beta {args.betas}, eps {args.adam_eps}\n#\n# Found torch {torch.__version__}, recommend 1.13.1+cu117 or newer\n# Found deepspeed {deepspeed_version}, recommend 0.7.0 (faster than newer versions)\n# Found pytorch_lightning {pl.__version__}, recommend 1.9.5" + }, + { + "comment": "This code segment sets up the training environment for the RWKV-v5 model. It checks the arguments provided, ensures correct data type and precision, handles special learning rate cases, and sets up appropriate configurations for faster and stable training. It also provides informative notes if using potentially slower or less stable precisions. 
Additionally, it enables CUDA features for improved performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":200-225", + "content": "#\n############################################################################\n\"\"\"\n )\n rank_zero_info(str(vars(args)) + \"\\n\")\n assert args.data_type in [\"utf-8\", \"utf-16le\", \"numpy\", \"binidx\", \"dummy\", \"uint16\"]\n if args.lr_final == 0 or args.lr_init == 0:\n rank_zero_info(\"\\n\\nNote: lr_final = 0 or lr_init = 0. Using linear LR schedule instead.\\n\\n\")\n assert args.precision in [\"fp32\", \"tf32\", \"fp16\", \"bf16\"]\n os.environ[\"RWKV_FLOAT_MODE\"] = args.precision\n if args.precision == \"fp32\":\n for i in range(10):\n rank_zero_info(\"\\n\\nNote: you are using fp32 (very slow). Try bf16 / tf32 for faster training.\\n\\n\")\n if args.precision == \"fp16\":\n rank_zero_info(\"\\n\\nNote: you are using fp16 (might overflow). Try bf16 / tf32 for stable training.\\n\\n\")\n os.environ[\"RWKV_JIT_ON\"] = \"1\"\n if \"deepspeed_stage_3\" in args.strategy:\n os.environ[\"RWKV_JIT_ON\"] = \"0\"\n torch.backends.cudnn.benchmark = True\n torch.backends.cudnn.enabled = True\n if args.precision == \"fp32\":" + }, + { + "comment": "This code snippet is setting up the model training environment. It sets the CUDA backend allowances for TF32 and checks the precision argument (32, fp16, or bf16). It imports necessary modules like `train_callback`, `MyDataset` and `RWKV`. The code initializes a dataset instance, sets the vocab size based on it. Then it creates an RWKV model instance. If there's no pre-existing load model or if it's at the first stage of MyPile, it generates initial weights using `generate_init_weight`, saves them to a file and uses that file as the load model. 
Finally, it prints a status message about loading the specified model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":226-255", + "content": " torch.backends.cudnn.allow_tf32 = False\n torch.backends.cuda.matmul.allow_tf32 = False\n else:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\n if \"32\" in args.precision:\n args.precision = 32\n elif args.precision == \"fp16\":\n args.precision = 16\n else:\n args.precision = \"bf16\"\n ########################################################################################################\n from src.trainer import train_callback, generate_init_weight\n from src.dataset import MyDataset\n train_data = MyDataset(args)\n args.vocab_size = train_data.vocab_size\n from src.model import RWKV\n model = RWKV(args)\n if len(args.load_model) == 0 or args.my_pile_stage == 1: # shall we build the initial weights?\n init_weight_name = f\"{args.proj_dir}/rwkv-init.pth\"\n generate_init_weight(model, init_weight_name) # save initial weights\n args.load_model = init_weight_name\n rank_zero_info(f\"########## Loading {args.load_model}... ##########\")" + }, + { + "comment": "This code attempts to load a checkpoint model from the specified file. It handles exceptions if the checkpoint is invalid and allows for loading partial models. If an issue occurs, it provides information about the bad checkpoint and tries again with a different one. 
The code also removes unnecessary keys starting with \"_forward_module.\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":256-279", + "content": " try:\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n load_keys = list(load_dict.keys())\n for k in load_keys:\n if k.startswith('_forward_module.'):\n load_dict[k.replace('_forward_module.','')] = load_dict[k]\n del load_dict[k]\n except:\n rank_zero_info(f\"Bad checkpoint {args.load_model}\")\n if args.my_pile_stage >= 2: # try again using another checkpoint\n max_p = args.my_pile_prev_p\n if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n args.epoch_begin = max_p + 1\n rank_zero_info(f\"Trying {args.load_model}\")\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n if args.load_partial == 1:\n load_keys = load_dict.keys()\n for k in model.state_dict():\n if k not in load_keys:\n load_dict[k] = model.state_dict()[k]" + }, + { + "comment": "The code is creating a trainer object for PyTorch Lightning, depending on the version of PyTorch. It loads the state dictionary into the model and initializes the trainer with provided arguments like accelerator, strategy, devices, etc. 
The code also checks the shape of parameters in the model's state_dict and prints them if the shape has more than one element.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":280-298", + "content": " model.load_state_dict(load_dict)\n if pl.__version__[0]=='2':\n trainer = Trainer(accelerator=args.accelerator,strategy=args.strategy,devices=args.devices,num_nodes=args.num_nodes,precision=args.precision,\n logger=args.logger,callbacks=[train_callback(args)],max_epochs=args.max_epochs,check_val_every_n_epoch=args.check_val_every_n_epoch,num_sanity_val_steps=args.num_sanity_val_steps,\n log_every_n_steps=args.log_every_n_steps,enable_checkpointing=args.enable_checkpointing,accumulate_grad_batches=args.accumulate_grad_batches,gradient_clip_val=args.gradient_clip_val)\n else:\n trainer = Trainer.from_argparse_args(\n args,\n callbacks=[train_callback(args)],\n )\n if trainer.global_rank == 0:\n for n in model.state_dict():\n shape = model.state_dict()[n].shape\n shape = [i for i in shape if i != 1]\n if len(shape) > 1:\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {n}\")\n else:" + }, + { + "comment": "This code prints the shape and number of training samples, configures Deepspeed optimization settings, sets up a DataLoader with specified parameters, and trains the model using the Deepspeed trainer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/train.py\":299-308", + "content": " print(f\"{str(shape[0]).ljust(5)} {n}\")\n if \"deepspeed\" in args.strategy:\n trainer.strategy.config[\"zero_optimization\"][\"allgather_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n trainer.strategy.config[\"zero_optimization\"][\"reduce_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n # must set shuffle=False, persistent_workers=False (because worker is in another thread)\n data_loader = DataLoader(train_data, shuffle=False, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, 
drop_last=True)\n trainer.fit(model, data_loader)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4043d731-807a-41d0-b1a8-eec524a12d56.json b/docs/doc/4043d731-807a-41d0-b1a8-eec524a12d56.json new file mode 100644 index 00000000..d25f07bd --- /dev/null +++ b/docs/doc/4043d731-807a-41d0-b1a8-eec524a12d56.json @@ -0,0 +1,25 @@ +{ + "summary": "This code trains RWKV v2-RNN and GPT models, adjusting parameters such as batch size, learning rate, and optimizing Torch usage. It saves models at fixed intervals and provides configurations for both models.", + "details": [ + { + "comment": "Code for RWKV v2-RNN language model training. Sets up training data, model size, and provides options for encoding format. Imports necessary modules, optimizes Torch usage, and offers different model configurations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/train.py\":0-28", + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport logging\nimport datetime\nimport json\nfrom src.model import GPT, GPTConfig\nfrom src.trainer import Trainer, TrainerConfig\nfrom src.utils import Dataset\nimport torch\nimport numpy as np\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\n### Step 1: set training data ##########################################################################\ndatafile = \"enwik8\"\ndatafile_encoding = 'utf-8'\n# datafile_encoding = 'utf-16le'\n### Step 2: set model size #############################################################################\nctx_len = 1024 # ===> increase T_MAX in model.py if your ctx_len > 1024\nn_layer = 6\nn_embd = 512\n# 'RWKV' (better for char-level English) or 'RWKV-ffnPre' (better in some 
cases)" + }, + { + "comment": "This code sets the batch size, learning rate, training mini-epochs, and other parameters for the RWKV model. The batch size should be divisible by forward and backward group sizes in model.py. If encountering \"CUDA out of memory,\" reduce it within GPU limitations. Mini-epochs have a fixed length with saved models saved every 30 mini-epochs.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/train.py\":29-55", + "content": "model_type = 'RWKV'\n### Step 3: set batch size #############################################################################\n# ===> batch_size must be divisible by B_GROUP_FORWARD and B_GROUP_BACKWARD in model.py\n# For example, if your batch_size = 20, you can set B_GROUP_FORWARD = 4, B_GROUP_BACKWARD = 2\n# If you see \"CUDA out of memory\", reduce it. Use GPU-Z to find the highest value for your VRAM.\nbatch_size = 12\n### Step 4: set learning rate, training mini-epochs #######################################################\nlr_init = 6e-4\nlr_final = 1e-5\n# the mini-epoch is very short and of fixed length (ctx_len * epoch_length_fixed tokens)\nn_epoch = 500\n# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, etc.\nepoch_save_frequency = 30\nepoch_save_path = 'trained-'\nepoch_length_fixed = 10000\n########################################################################################################\n# import src.utils\n# src.utils.set_seed(42) # remember to change seed if you load a model\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)" + }, + { + "comment": "Loading data for training GPT model...\nTraining the GPT model with specified configuration and parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/train.py\":56-81", + "content": "logging.basicConfig(format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n datefmt=\"%Y-%m-%d %H:%M:%S\", level=logging.INFO,)\ngrad_norm_clip = 1.0\nwarmup_tokens = 0\nbetas = 
(0.9, 0.99)\neps = 4e-9\nnum_workers = 0\n########################################################################################################\n# Load data\n########################################################################################################\nprint('loading data... ' + datafile)\ntrain_dataset = Dataset(open(\n datafile, \"r\", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)\n########################################################################################################\n# Train model\n########################################################################################################\nif __name__ == '__main__':\n model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type,\n n_layer=n_layer, n_embd=n_embd)).cuda()" + }, + { + "comment": "This code loads a trained model, sets up the Trainer and trains it, then saves the final model. It also prints out various parameters for the training session such as epochs, batch size, etc.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/train.py\":83-97", + "content": " # # # load a trained model. 
remember to change random seed\n # m2 = torch.load('trained-61.pth')\n # model.load_state_dict(m2)\n print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas',\n betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, )\n tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size,\n learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, grad_norm_clip=grad_norm_clip,\n warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)\n trainer = Trainer(model, train_dataset, None, tconf)\n trainer.train()\n torch.save(model.state_dict(), 'trained-' + str(n_epoch) + '-' + trainer.get_run_name() +\n '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/44bdf2e4-3e44-466d-a3f6-4bd0894306c7.json b/docs/doc/44bdf2e4-3e44-466d-a3f6-4bd0894306c7.json new file mode 100644 index 00000000..2bba05f3 --- /dev/null +++ b/docs/doc/44bdf2e4-3e44-466d-a3f6-4bd0894306c7.json @@ -0,0 +1,35 @@ +{ + "summary": "This code trains an RWKV-LM and GPT language model, optimizes GPU memory usage, saves partial models, and trains with specified parameters. The trained model's state dictionary is saved with identifiers for file identification.", + "details": [ + { + "comment": "The code is importing necessary modules and setting up the environment for training a language model called RWKV-LM. It also includes a special debug mode option that can be enabled but is currently set to False, and it sets seed for deterministic training. The code imports classes from other files, including GPT model and TrainerConfig for configuration settings. 
The logging module is configured to display specific information in the console.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/train.py\":0-21", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\n# if False: # True False ---> Set to False if you don't understand it\n# print(\"\\n\\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\\n\\n\")\n# os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n# import src.utils\n# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)\nimport logging\nimport datetime\nfrom src.model import GPT, GPTConfig\nfrom src.trainer import Trainer, TrainerConfig\nfrom src.utils import Dataset\nimport torch\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nlogging.basicConfig(format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\"," + }, + { + "comment": "This code sets up training parameters for the RWKV-v3 language model. It specifies the data file, model size, batch size, and enables CUDA features for efficient GPU utilization. The model type can be either 'RWKV' or 'RWKV-ffnPre', and there is an optional headQK dimension to improve loss. 
This configuration allows for training deeper models with better performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/train.py\":22-47", + "content": " datefmt=\"%Y-%m-%d %H:%M:%S\", level=logging.INFO,)\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\n### Step 1: set training data ##########################################################################\ndatafile = \"../data/enwik8\" # your data\ndatafile_encoding = 'utf-8'\n# datafile_encoding = 'utf-16le'\n### Step 2: set model size #############################################################################\n# ----> test deeper models (n_layer at least 12) to see the advantage of RWKV-3 over RWKV-2\nctx_len = 1024 # increase T_MAX in model.py if your ctx_len > 1024\nn_layer = 6\nn_embd = 512\n# 'RWKV' (better for English) or 'RWKV-ffnPre' (better in some cases)\nmodel_type = 'RWKV'\n# ---> there is a RWKV_HEAD_QK_DIM in model.py and model_run.py\n# set it to 256, then it's using my headQK trick (similar to a tiny attention) to improve loss\n# set it to 0, then it's a pure RNN (attention-free)\n### Step 3: set batch size #############################################################################" + }, + { + "comment": "This code snippet provides guidance for setting batch_size, learning rate, and the number of mini-epochs in RWKV-v3's training script. It suggests adjusting batch_size to optimize GPU memory usage and offers suggestions on how to set learning rate values for different model sizes. 
The comments also advise saving a partially trained model for resuming later, with instructions on how to implement this.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/train.py\":49-64", + "content": "# ---> batch_size must be divisible by B_GROUP_FORWARD and B_GROUP_BACKWARD in model.py\n# for example, if your batch_size = 20, you can set B_GROUP_FORWARD = 4, B_GROUP_BACKWARD = 2\n# if you see \"CUDA out of memory\", reduce batch_size. Use nvidia-smi to find the highest value for your GPU.\nbatch_size = 12\n### Step 4: set learning rate, number of mini-epochs #######################################################\n# By default we are using exponential LR decay.\n#\n# Here are my suggestions for training a good model.\n# Let's say you will train a L6-D512 model.\n# 1) Set lr_init = lr_final = 8e-4. Let it run for some mini-epochs, until the improvement of loss become slow.\n# 2) Check epoch_save_frequency and make sure the partially-trained model is saved. Ctrl+C to stop the run.\n# 3) Set lr_init = 8e-4, lr_final = 1e-5, warmup_tokens = ctx_len * batch_size * 50, betas = (0.9, 0.999).\n# 4) Search for \"torch.load\" here and modify it to load the partially-trained model. Continue the training.\n# \n# For L12-D768, set lr_init = 6e-4. For L24-D1024, set lr_init = 4e-4. For L24-D2048, set lr_init = 3e-4." + }, + { + "comment": "This code initializes learning rate, defines training parameters, loads the dataset, and sets up data processing for training RWKV language model. 
It specifies learning rate initialization, final value, number of epochs, fixed batch length, save frequency, gradient norm clipping, warmup tokens, betas for Adam optimizer, epsilon for stability, and number of workers for data loading.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/train.py\":66-95", + "content": "lr_init = 8e-4 # we can use larger lr because of preLN\nlr_final = 1e-5\n# the mini-epoch is very short and of fixed length (length = ctx_len * epoch_length_fixed tokens)\nn_epoch = 500\nepoch_length_fixed = 10000\n# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...\nepoch_save_frequency = 10\nepoch_save_path = 'trained-'\n########################################################################################################\ngrad_norm_clip = 1.0\nwarmup_tokens = ctx_len * batch_size * 0\nbetas = (0.9, 0.99)\neps = 4e-9\nnum_workers = 0\n########################################################################################################\n# Load data\n########################################################################################################\nprint('loading data... ' + datafile)\ntrain_dataset = Dataset(open(\n datafile, \"r\", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)\n########################################################################################################" + }, + { + "comment": "This code snippet is responsible for training a model using the GPT architecture. 
It loads a previously trained model, specifies the trainer configuration, and then proceeds to train the model with the specified number of epochs, batch size, learning rate, and other hyperparameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/train.py\":96-111", + "content": "# Train model\n########################################################################################################\nif __name__ == '__main__':\n model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type,\n n_layer=n_layer, n_embd=n_embd)).cuda()\n ### ---> load a trained model <---\n # m2 = torch.load('trained-61.pth')\n # model.load_state_dict(m2)\n print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas',\n betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, )\n tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size,\n learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, grad_norm_clip=grad_norm_clip,\n warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)" + }, + { + "comment": "Saving the model's state dictionary after training, including epoch number and run name, with a timestamp for file identification.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/train.py\":112-117", + "content": " trainer = Trainer(model, train_dataset, None, tconf)\n trainer.train()\n torch.save(model.state_dict(), 'trained-' + str(n_epoch) + '-' + trainer.get_run_name() +\n '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4b316ed5-4fd6-4479-8c33-3f14fa94687b.json b/docs/doc/4b316ed5-4fd6-4479-8c33-3f14fa94687b.json new file mode 100644 index 00000000..7a5567ec --- /dev/null +++ b/docs/doc/4b316ed5-4fd6-4479-8c33-3f14fa94687b.json 
@@ -0,0 +1,50 @@ +{ + "summary": "The code defines 'MMapIndexedDataset' and 'Index' for handling indexed datasets, reading binary data, creating memory-mapped objects, and allows retrieval/manipulation of data. It doesn't support prefetch and checks if files exist for given path.", + "details": [ + { + "comment": "This code imports necessary libraries and defines functions for handling indexed datasets. The 'print_rank_0' function prints messages only on rank 0 if distributed is initialized, while '_warmup_mmap_file' warms up an mmap file by reading it in chunks. The 'dtypes' dictionary maps data types to their respective codes. 'index_file_path' and 'data_file_path' functions return the paths for index and data files respectively. The class 'MMapIndexedDataset' inherits from torch.utils.data.Dataset, suggesting it handles indexed datasets in a specific format.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/binidx.py\":0-47", + "content": "from lib2to3.pgen2 import token\nimport os\nimport torch\nimport numpy as np\nimport shutil\nimport struct\nfrom functools import lru_cache\nfrom itertools import accumulate\ndef print_rank_0(*message):\n pass\n # \"\"\"If distributed is initialized print only on rank 0.\"\"\"\n # if torch.distributed.is_initialized():\n # if torch.distributed.get_rank() == 0:\n # print(*message, flush=True)\n # else:\n # print(*message, flush=True)\ndef _warmup_mmap_file(path):\n pass\n # with open(path, \"rb\") as stream:\n # while stream.read(100 * 1024 * 1024):\n # pass\ndtypes = {\n 1: np.uint8,\n 2: np.int8,\n 3: np.int16,\n 4: np.int32,\n 5: np.int64,\n 6: float,\n 7: np.double,\n 8: np.uint16,\n}\ndef code(dtype):\n for k in dtypes.keys():\n if dtypes[k] == dtype:\n return k\n raise ValueError(dtype)\ndef index_file_path(prefix_path):\n return prefix_path + \".idx\"\ndef data_file_path(prefix_path):\n return prefix_path + \".bin\"\nclass MMapIndexedDataset(torch.utils.data.Dataset):" + }, + { + "comment": "The 
code defines a class called Index with a method writer(). This writer() method creates a subclass _Writer which is used to write the header of a binary file. It writes a magic string, version number (little endian), and data type information in the file.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/binidx.py\":48-75", + "content": " class Index(object):\n _HDR_MAGIC = b\"MMIDIDX\\x00\\x00\"\n @classmethod\n def writer(cls, path, dtype):\n class _Writer(object):\n def __enter__(self):\n self._file = open(path, \"wb\")\n # Write Magic string so we can check the file format then opening it again.\n self._file.write(cls._HDR_MAGIC)\n # Write version number\n # Little endian unsigned 64 Bit integer\n self._file.write(struct.pack(\" 600) else 0 # extra tiny attention dim, useful for long ctx char-level english\nrwkv_tiny_head = 1 # 1 is good enough. 8 is slow\n# n_side_proj = 512 # extra 'side projection', quite useful for BPE models \n########################################################################################################\n# Load data\n########################################################################################################\nprint('loading data... ' + datafile)\nclass Dataset(Dataset):\n def __init__(self, data, model_level, ctx_len):\n print('building token list...', end=' ')\n if model_level == 'word':\n import re\n data = re.sub(r'(\\n|\\.|\\,|\\?|\\!|\\:|\\;|\\-|\\\u2014|\\||\\'|\\\"|\\`|\\(|\\)|[0-9]|\\[|\\]|\\{|\\}|\\=|\\+|\\*|\\\\|\\/|\\~|\\&|\\$|\\#|\\%)', r' \\g<0> ', data)\n data = re.sub(' +',' ',data)\n print('splitting token...')\n data = data.lower().split(' ')" + }, + { + "comment": "The code trains the RWKV-LM model and writes the vocabulary to a JSON file named 'vocab.json'. It then defines two dictionaries, `stoi` and `itos`, which map characters to their unique indices and vice versa. 
The function `__len__` returns the epoch length and `__getitem__` retrieves a chunk of data with context length from the dataset given an index.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/train.py\":87-116", + "content": " unique = sorted(list(set(data)))\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n data_size, vocab_size = len(data), len(unique)\n print('data has %d %ss, %d unique.' % (data_size, model_level, vocab_size))\n self.stoi = { ch:i for i,ch in enumerate(unique) }\n self.itos = { i:ch for i,ch in enumerate(unique) }\n self.ctx_len = ctx_len\n self.vocab_size = vocab_size\n self.data = data\n def __len__(self):\n return epoch_length_fixed\n def __getitem__(self, idx):\n i = np.random.randint(0, len(self.data) - (self.ctx_len + 1)) # cheat: pick a random spot in dataset\n chunk = self.data[i:i+self.ctx_len+1]\n dix = [self.stoi[s] for s in chunk]\n x = torch.tensor(dix[:-1], dtype=torch.long)" + }, + { + "comment": "This code snippet initializes a GPT model with the specified configuration and train dataset. 
It then loads a pre-trained model's state dictionary or prints out some details about the model, trainer, and training process setup.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/train.py\":117-134", + "content": " y = torch.tensor(dix[1:], dtype=torch.long)\n return x, y\ntrain_dataset = Dataset(open(datafile, \"r\", encoding=datafile_encoding).read(), model_level, ctx_len)\n########################################################################################################\n# Train model\n########################################################################################################\nmodel = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type,\n rwkv_emb_scale=rwkv_emb_scale, rwkv_tiny_attn=rwkv_tiny_attn, rwkv_tiny_head=rwkv_tiny_head,\n n_layer=n_layer, n_head=n_head, n_embd=n_embd, n_attn=n_attn, n_ffn=n_ffn))\n# load a trained model\n# model.load_state_dict(torch.load('trained-xxx.pth').state_dict())\nprint('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas', betas, 'eps', eps, 'wd', weight_decay, 'ctx', ctx_len, 'layer', n_layer, 'head', n_head, 'embd', n_embd, 'attn', n_attn, 'ffn', n_ffn)\ntconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size, weight_decay=weight_decay," + }, + { + "comment": "This code initializes a trainer with learning rate, decay, and final values for training the model on the provided train dataset. 
The trainer is then used to train the model, and after training, the trained model is saved in a file named \"trained-[run_name]-[current_date_time].pth\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/train.py\":135-141", + "content": " learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps,\n warmup_tokens=0, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=0, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)\ntrainer = Trainer(model, train_dataset, None, tconf)\ntrainer.train()\ntorch.save(model, 'trained-' + trainer.get_run_name() + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4c589681-9bae-4165-805f-bc8706141d7b.json b/docs/doc/4c589681-9bae-4165-805f-bc8706141d7b.json new file mode 100644 index 00000000..0a9d4976 --- /dev/null +++ b/docs/doc/4c589681-9bae-4165-805f-bc8706141d7b.json @@ -0,0 +1,25 @@ +{ + "summary": "This code initializes and compares the outputs of RWKV-GPT and RWKV-RNN models, tokenizing input context and training model_train on GPU with tensor data. It sets environment variables for hardware mode and floating-point precision.", + "details": [ + { + "comment": "This code verifies the results of different models to ensure consistency. 
It imports necessary libraries, sets environment variables for hardware mode and floating-point precision, defines the model to run (RWKV_RNN or GPT), specifies token mode as either character or pile, and defines variables for model name, vocabulary file, context length, and number of layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/verify.py\":0-25", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\n# this is for verifying the results of different models and make sure they agree with each other\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nimport os\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\nos.environ['RWKV_FLOAT_MODE'] = 'bf16' # 'bf16' (stable) or 'fp16' (will overflow after training a large model for very long. can be solved in the future)\nos.environ['RWKV_RUN_DEVICE'] = 'cuda'\nRUN_DEVICE = os.environ['RWKV_RUN_DEVICE']\nimport torch\nfrom src.model_run import RWKV_RNN, RWKV_GPT\nfrom src.model import GPT, GPTConfig\nTOKEN_MODE = 'pile' # char / pile\nif TOKEN_MODE == 'char':\n MODEL_NAME = 'trained-1'\n WORD_NAME = 'vocab' # the .json vocab (generated by train.py)\n ctx_len = 1024\n n_layer = 6" + }, + { + "comment": "The code is initializing a model for the RWKV language model, specifically the \"RWKV-v4\" variant. It checks the TOKEN_MODE and sets up the tokenizer and model accordingly. The tokenizer's vocab_size is set to 50277 if in 'pile' mode, and the model is loaded from a specific .pth file. 
The model is also converted to fp16 or bf16 depending on the environment variable RWKV_FLOAT_MODE.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/verify.py\":26-56", + "content": " n_embd = 512\n UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity\nelif TOKEN_MODE == 'pile':\n WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']\n MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'\n ctx_len = 1024\n n_layer = 12\n n_embd = 768\n UNKNOWN_CHAR = None\nmodel_type = 'RWKV'\nfrom src.utils import TOKENIZER\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nif TOKEN_MODE == 'pile':\n tokenizer.vocab_size = 50277\n########################################################################################################\nmodel_train = GPT(GPTConfig(tokenizer.vocab_size, ctx_len, model_type=model_type, n_layer=n_layer, n_embd=n_embd)).cuda()\nif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n model_train = model_train.half()\nelif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n model_train = model_train.bfloat16()\nprint('loading ' + MODEL_NAME)\nm2 = torch.load(MODEL_NAME + '.pth', map_location=RUN_DEVICE)\nmodel_train.load_state_dict(m2)\nmodel_rnn = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)" + }, + { + "comment": "The code initializes an RWKV-GPT model, tokenizes input context in either character or pile mode, and prints the output of both RWKV-GPT and RWKV-RNN models for the given input.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/verify.py\":57-80", + "content": "model_gpt = RWKV_GPT(MODEL_NAME, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()\n########################################################################################################\n# context = '\\nIn a'\ncontext = '\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. 
Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'\nif TOKEN_MODE == 'char':\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nelif TOKEN_MODE == 'pile':\n ctx = tokenizer.tokenizer.encode(context)\nprint(f'input len {len(ctx)} data {ctx}')\n########################################################################################################\nprint('\\nRWKV-GPT output')\nout = model_gpt.forward(torch.tensor(ctx).unsqueeze(0).cuda())[0].detach().cpu().numpy()\nprint(out)\nprint('\\nRWKV-RNN output')\nmodel_rnn.clear()\nsrc_len = len(ctx)\nfor i in range(src_len):\n x = ctx[:i+1]" + }, + { + "comment": "This code snippet is checking the output of RWKV-v4 model at specific indices and then prints the RWKV-train output. It first runs the model_rnn on input x and checks if i (index) is less than 3 or greater than src\\_len - 3, printing the detached output to CPU numpy array. If i equals 2, it prints '...'. Then, it trains the model_train on cuda with ctx tensor, gets the forward output and prints it as float numpy array.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/verify.py\":81-89", + "content": " out = model_rnn.run(x)\n if i < 3 or i >= src_len - 3:\n print(torch.tensor(out).detach().cpu().numpy())\n if i == 2:\n print('...')\nprint('\\nRWKV-train output')\nout = model_train.forward(torch.tensor([ctx]).cuda())[0][0].detach().cpu().float().numpy()\nprint(out, '\\n')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4c68313e-e6ed-434f-8d62-3babe0c86b6f.json b/docs/doc/4c68313e-e6ed-434f-8d62-3babe0c86b6f.json new file mode 100644 index 00000000..a624825a --- /dev/null +++ b/docs/doc/4c68313e-e6ed-434f-8d62-3babe0c86b6f.json @@ -0,0 +1,30 @@ +{ + "summary": "This code performs a forward pass of a neural network on GPU with BFloat16 data type and softmax cross-entropy loss, calculating gradients using fixed-point arrays in CUDA. 
It defines two functions, cuda_forward and cuda_backward, which use CUDA to perform matrix operations on GPU.", + "details": [ + { + "comment": "This function implements the forward pass of a neural network operation using BFloat16 (bf16) data type on GPU. It takes input dimensions and pointers to weights, inputs, keys, and values arrays as parameters. The function then performs element-wise multiplications, accumulates results, and stores the result in an output array.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda_bf16.cu\":0-25", + "content": "#include\n#include \n#include \"ATen/ATen.h\"\n#define MIN_VALUE (-1e38)\ntypedef at::BFloat16 bf16;\n__global__ void kernel_forward(const int B, const int T, const int C,\n const float *__restrict__ const _w, const bf16 *__restrict__ const _u, const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,\n bf16 *__restrict__ const _y) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n float u = float(_u[_c]);\n float w = _w[_c];\n const bf16 *__restrict__ const k = _k + _offset;\n const bf16 *__restrict__ const v = _v + _offset;\n bf16 *__restrict__ const y = _y + _offset;\n // aa and bb are running sums divided by exp(pp) (to avoid overflow)\n float aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const float kk = float(k[ii]);\n const float vv = float(v[ii]);" + }, + { + "comment": "Calculates gradients for weight, input, and kernel arrays using backward pass with softmax cross-entropy loss. Batch size, sequence length, number of channels, weight, input, kernel arrays, output gradients, and gradients for each array are passed as arguments to the kernel function. 
Gradient computation is performed per element in the arrays.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda_bf16.cu\":27-52", + "content": " float ww = u + kk;\n float p = max(pp, ww);\n float e1 = exp(pp - p);\n float e2 = exp(ww - p);\n y[ii] = bf16((e1 * aa + e2 * vv) / (e1 * bb + e2));\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n}\n__global__ void kernel_backward(const int B, const int T, const int C,\n const float *__restrict__ const _w, const bf16 *__restrict__ const _u, const bf16 *__restrict__ const _k, const bf16 *__restrict__ const _v,\n const bf16 *__restrict__ const _y, const bf16 *__restrict__ const _gy,\n bf16 *__restrict__ const _gw, bf16 *__restrict__ const _gu, bf16 *__restrict__ const _gk, bf16 *__restrict__ const _gv) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n float u = float(_u[_c]);" + }, + { + "comment": "The code is calculating the softmax and element-wise multiplication of input vectors for matrix multiplication using bf16 data type in CUDA. 
It initializes variables, performs calculations using exp() and max(), stores results in q and r arrays, and updates gw and gu variables.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda_bf16.cu\":53-84", + "content": " float w = _w[_c];\n const bf16 *__restrict__ const k = _k + _offset;\n const bf16 *__restrict__ const v = _v + _offset;\n const bf16 *__restrict__ const y = _y + _offset;\n const bf16 *__restrict__ const gy = _gy + _offset;\n bf16 *__restrict__ const gk = _gk + _offset;\n bf16 *__restrict__ const gv = _gv + _offset;\n float q[Tmax], r[Tmax];\n float gw = 0, gu = 0, aa = 0, bb = 0, ga = 0, gb = 0, pp = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n const float kk = float(k[ii]);\n const float vv = float(v[ii]);\n const float yy = float(y[ii]);\n float ww = u + kk;\n float p = max(pp, ww);\n float e1 = exp(pp - p);\n float e2 = exp(ww - p);\n const float qq = float(gy[ii]) / (e1 * bb + e2);\n gw += (ga - gb * yy) * e1 * qq;\n gu += (vv - yy) * e2 * qq;\n q[i] = qq;\n r[i] = ww - p;\n ww = w + pp;\n p = max(ww, kk);\n e1 = exp(ww - p);\n e2 = exp(kk - p);\n ga = e1 * (aa + ga);" + }, + { + "comment": "This code computes the forward pass of a neural network using CUDA for efficient GPU computation. The input includes batch size B, time steps T, channels C, and floating-point w parameter, along with fixed-point u, k, v, and y arrays. 
It initializes gk and gv arrays and performs element-wise computations to calculate the gradients.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda_bf16.cu\":85-119", + "content": " gb = e1 * (bb + gb);\n aa = e1 * aa + e2 * vv;\n bb = e1 * bb + e2;\n pp = p;\n }\n const int _offsetBC = _b * C + _c;\n _gw[_offsetBC] = bf16(gw * _w[_c]); // multiply by w because of w -> -exp(w) in python forward()\n _gu[_offsetBC] = bf16(gu);\n aa = 0, bb = 0, pp = MIN_VALUE;\n for (int i = T - 1; i >= 0; i--) {\n const int ii = i * C;\n const float kk = float(k[ii]);\n const float vv = float(v[ii]);\n const float yy = float(y[ii]);\n const float qq = q[i];\n const float rr = r[i];\n float e1 = qq * exp(rr);\n float e2 = exp(kk + pp);\n gk[ii] = bf16(e1 * (vv - yy) + e2 * (aa * vv + bb));\n gv[ii] = bf16(e1 + e2 * aa);\n const float ww = w + pp;\n const float www = rr - u - kk;\n const float p = max(ww, www);\n e1 = exp(ww - p);\n e2 = qq * exp(www - p);\n aa = e1 * aa + e2;\n bb = e1 * bb - e2 * yy;\n pp = p;\n }\n}\nvoid cuda_forward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y) {" + }, + { + "comment": "This code defines two functions, cuda_forward and cuda_backward, which use CUDA to perform a matrix operation on a GPU. The function sets the number of threads per block based on the C dimension and ensures optimal performance by setting --maxrregcount 60. It also asserts that B * C is divisible by threadsPerBlock.x to ensure even distribution of tasks among blocks, then assigns the total number of blocks accordingly. 
Finally, it calls a kernel function with these parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_cuda_bf16.cu\":120-131", + "content": " dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_forward<< >>(B, T, C, w, u, k, v, y);\n}\nvoid cuda_backward(int B, int T, int C, float *w, bf16 *u, bf16 *k, bf16 *v, bf16 *y, bf16 *gy, bf16 *gw, bf16 *gu, bf16 *gk, bf16 *gv) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_backward<< >>(B, T, C, w, u, k, v, y, gy, gw, gu, gk, gv);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4cc9c8b5-b0bd-4cc9-a4ee-f264a5b2abeb.json b/docs/doc/4cc9c8b5-b0bd-4cc9-a4ee-f264a5b2abeb.json new file mode 100644 index 00000000..3125b06f --- /dev/null +++ b/docs/doc/4cc9c8b5-b0bd-4cc9-a4ee-f264a5b2abeb.json @@ -0,0 +1,30 @@ +{ + "summary": "RWKV-v4's CUDA kernel performs RNN operations, gradient calculation, and avoids overflow with matrix multiplication. Efficient parallel execution techniques are used in cuda_forward and cuda_backward functions, optimizing register usage and ensuring proper block and thread configurations for GPU execution.", + "details": [ + { + "comment": "This code snippet defines a CUDA kernel function for the forward pass of a recurrent neural network (RNN). The function takes input and weight tensors as arguments, performs matrix multiplications, and accumulates results using running sums to avoid overflows. 
The result is stored in output tensor y.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_cuda.cu\":0-28", + "content": "#include \n#include \n#define MIN_VALUE (-1e38)\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v,\n F *__restrict__ const _y) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n F *__restrict__ const y = _y + _offset;\n F p = 0, q = 0, o = MIN_VALUE;\n // p and q are running sums divided by exp(o) (to avoid overflows)\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n F no = max(o, u + k[ii]);\n F A = exp(o - no);\n F B = exp(u + k[ii] - no);\n y[ii] = (A * p + B * v[ii]) / (A * q + B);" + }, + { + "comment": "Kernel function for backward propagation in RWKV-v4. Calculates gradients for weight matrix w, input u and key matrix k. 
Uses blockIdx and threadIdx to calculate offsets for accessing data.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_cuda.cu\":30-55", + "content": " no = max(w + o, k[ii]);\n A = exp(w + o - no);\n B = exp(k[ii] - no);\n p = A * p + B * v[ii];\n q = A * q + B;\n o = no;\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C,\n const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _gy,\n F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, F *__restrict__ const _gv) {\n const int idx = blockIdx.x * blockDim.x + threadIdx.x;\n const int _b = idx / C;\n const int _c = idx % C;\n const int _offset = _b * T * C + _c;\n F u = _u[_c];\n F w = _w[_c];\n const F *__restrict__ const k = _k + _offset;\n const F *__restrict__ const v = _v + _offset;\n const F *__restrict__ const gy = _gy + _offset;\n F *__restrict__ const gk = _gk + _offset;\n F *__restrict__ const gv = _gv + _offset;" + }, + { + "comment": "This code calculates the gradients of model parameters by iterating through a dataset, updating intermediate variables, and storing gradients in gk and gv arrays. 
It uses matrix multiplication and exponential operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_cuda.cu\":57-96", + "content": " F y[Tmax], z[Tmax], zexp[Tmax];\n F gw = 0, gu = 0;\n F p = 0, q = 0;\n F dpdw = 0, dqdw = 0;\n F o = MIN_VALUE;\n for (int i = 0; i < T; i++) {\n const int ii = i * C;\n F no = max(o, k[ii] + u);\n F A = exp(o - no);\n F B = exp(k[ii] + u - no);\n F num = A * p + B * v[ii];\n F iden = 1 / (A * q + B);\n y[i] = num * iden;\n z[i] = iden;\n zexp[i] = k[ii] + u - no;\n gw += gy[ii] * (dpdw - dqdw * y[i]) * iden * A;\n gu += gy[ii] * (v[ii] - y[i]) * B * iden;\n no = max(w + o, k[ii]);\n A = exp(w + o - no);\n B = exp(k[ii] - no);\n dpdw = A * (p + dpdw);\n dqdw = A * (q + dqdw);\n p = A * p + B * v[ii];\n q = A * q + B;\n o = no;\n }\n F gp = 0, gq = 0;\n o = MIN_VALUE;\n for (int i = T - 1; i >= 0; i--) {\n const int ii = i * C;\n F A = gy[ii] * z[i] * exp(zexp[i]);\n F B = exp(k[ii] + o);\n gk[ii] = A * (v[ii] - y[i]) + B * (gp * v[ii] + gq);\n gv[ii] = A + B * gp;" + }, + { + "comment": "This code is a part of the RWKV-v4 library, specifically the cuda_forward and cuda_backward functions for CUDA implementation. It involves setting up blocks and threads based on the given dimensions B, T, and C. The kernels kernel_forward and kernel_backward are launched with appropriate block and thread configurations to perform matrix operations in parallel. 
This code utilizes an optimal number of registers and asserts that B * C is divisible by the chosen number of threads per block for efficient execution.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_cuda.cu\":98-120", + "content": " F no = max(w + o, zexp[i] - k[ii] - u);\n A = exp(w + o - no);\n B = gy[ii] * z[i] * exp(zexp[i] - k[ii] - u - no);\n gp = A * gp + B;\n gq = A * gq - B * y[i];\n o = no;\n }\n // Multiply by w because the w -> -exp(w) preprocessing is halfway in the backwards pass, even though it's not in the forward pass\n const int _offsetBC = _b * C + _c;\n _gw[_offsetBC] += gw * _w[_c];\n _gu[_offsetBC] += gu;\n}\nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance\n assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_forward<< >>(B, T, C, w, u, k, v, y);\n}\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *gy, float *gw, float *gu, float *gk, float *gv) {\n dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance" + }, + { + "comment": "This code ensures that the number of blocks is an integer multiple of threads per block, then launches a kernel function for backward computation. 
It uses B, C, and threadsPerBlock parameters to determine the number of blocks and threads per block for efficient GPU execution.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_cuda.cu\":121-124", + "content": " assert(B * C % threadsPerBlock.x == 0);\n dim3 numBlocks(B * C / threadsPerBlock.x);\n kernel_backward<< >>(B, T, C, w, u, k, v, gy, gw, gu, gk, gv);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/4ce3d4aa-74db-4c89-8951-fd12785ba16f.json b/docs/doc/4ce3d4aa-74db-4c89-8951-fd12785ba16f.json new file mode 100644 index 00000000..a8831401 --- /dev/null +++ b/docs/doc/4ce3d4aa-74db-4c89-8951-fd12785ba16f.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines a CUDA kernel for timex forward and backward operations in PyTorch, with the given snippet being the closing brace of a function or class definition.", + "details": [ + { + "comment": "This code defines a CUDA kernel for the timex forward and backward operations, which are then exposed to PyTorch through extension modules. 
The functions take in torch tensors, perform computations on them using the CUDA kernels, and return the results.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_op.cpp\":0-19", + "content": "#include \nvoid cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T);\nvoid cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T);\nvoid forward(torch::Tensor &w, const torch::Tensor &k, torch::Tensor &x, double eps, int64_t B, int64_t C, int64_t T) {\n cuda_forward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (float *)x.data_ptr(), eps, B, C, T);\n}\nvoid backward(torch::Tensor &w, const torch::Tensor &k, const torch::Tensor &gwk, torch::Tensor &gw, torch::Tensor &gk, int64_t B, int64_t C, int64_t T) {\n cuda_backward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (const float *)gwk.data_ptr(), (float *)gw.data_ptr(), (float *)gk.data_ptr(), B, C, T);\n}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"timex forward\");\n m.def(\"backward\", &backward, \"timex backward\");\n}\nTORCH_LIBRARY(timex, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);" + }, + { + "comment": "This code snippet is the closing brace (}) for a function or class definition, marking the end of a block.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_op.cpp\":20-20", + "content": "}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/57684d64-4432-47ad-a0a6-2f776620918d.json b/docs/doc/57684d64-4432-47ad-a0a6-2f776620918d.json new file mode 100644 index 00000000..241ac9e0 --- /dev/null +++ b/docs/doc/57684d64-4432-47ad-a0a6-2f776620918d.json @@ -0,0 +1,110 @@ +{ + "summary": "This code initializes neural network weights, sets up time curves for RWKV_TimeMix module, defines Transformer models with rotary position embeddings and applies rotary encoding. 
It also includes channel mix and tiny attn mechanisms.", + "details": [ + { + "comment": "The code defines a function called `RWKV_Init` that initializes the weights of all linear and embedding layers within a module. It uses fancy initialization, which is a method for assigning weight values to layers based on their names. The function iterates through each layer in the module and checks if it's an instance of either `nn.Linear` or `nn.Embedding`. If so, it proceeds with the weight assignment process.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":0-21", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport math\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nlogger = logging.getLogger(__name__)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\ndef RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module\n for m in module.modules():\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n with torch.no_grad():\n name = '[unknown weight]'\n for name, parameter in module.named_parameters(): # find the name of the weight" + }, + { + "comment": "The code initializes gain and scale for orthogonal and normal distributions in a neural network. It checks if the weight id is equal to the parameter id, breaks the loop if so. If it's a linear layer, it adjusts the gain based on the shape of the input/output dimensions, and if it matches specific values, scales accordingly. 
For embedding layers, it does a similar check and scaling. Finally, it checks for a scale_init attribute and assigns it to scale. Lastly, it prints the shape dimensions, scale value, and name.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":22-45", + "content": " if id(m.weight) == id(parameter):\n break\n shape = m.weight.data.shape\n gain = 1.0 # positive: gain for orthogonal, negative: std for normal\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Linear):\n if m.bias is not None:\n m.bias.data.zero_()\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection?\n scale = config.rwkv_emb_scale\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb?\n scale = config.rwkv_emb_scale\n if hasattr(m, 'scale_init'):\n scale = m.scale_init\n print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name)" + }, + { + "comment": "This code initializes the RWKV_TimeMix module, which is a part of the RWKV model. It sets up the time curves for better convergence using orthogonal or normal initialization based on gain values. 
The code also ensures that the number of attention heads divides evenly into the total number of attention heads specified in the config file.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":47-69", + "content": " gain *= scale\n if gain == 0:\n nn.init.zeros_(m.weight) # zero init is great for some RWKV matrices\n elif gain > 0:\n nn.init.orthogonal_(m.weight, gain=gain)\n else:\n nn.init.normal_(m.weight, mean=0, std=-gain)\nclass RWKV_TimeMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n assert config.n_attn % config.n_head == 0\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_head = config.n_head\n self.head_size = config.n_attn // config.n_head\n with torch.no_grad(): # initial time_w curves for better convergence\n ww = torch.ones(config.n_head, config.ctx_len)\n curve = torch.tensor([-(config.ctx_len - 1 - i) for i in range(config.ctx_len)]) # the distance\n for h in range(config.n_head):\n if h < config.n_head - 1:\n decay_speed = math.pow(config.ctx_len, -(h+1)/(config.n_head-1))" + }, + { + "comment": "This code initializes a multi-head attention layer. It determines the decay speed for each head, sets parameters for time-based operations, creates linear layers for keys and values, and optionally initializes a tiny_att layer if configured. 
Finally, it sets scale init values for the key, receptance, and output layers to 0.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":70-93", + "content": " else:\n decay_speed = 0.0\n ww[h] = torch.exp(curve * decay_speed)\n # print('layer', layer_id, 'head', h, 'decay_speed', round(decay_speed, 4), ww[h][:5].numpy(), '...', ww[h][-5:].numpy())\n self.time_w = nn.Parameter(ww)\n self.time_alpha = nn.Parameter(torch.ones(self.n_head, 1, config.ctx_len))\n self.time_beta = nn.Parameter(torch.ones(self.n_head, config.ctx_len, 1))\n self.time_gamma = nn.Parameter(torch.ones(config.ctx_len, 1))\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.key = nn.Linear(config.n_embd, config.n_attn)\n self.value = nn.Linear(config.n_embd, config.n_attn)\n self.receptance = nn.Linear(config.n_embd, config.n_attn)\n # if config.rwkv_tiny_attn > 0:\n # self.tiny_att = RWKV_TinyAttn(config)\n self.output = nn.Linear(config.n_attn, config.n_embd)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0" + }, + { + "comment": "This code defines a forward pass for a model. It performs element-wise operations on input 'x' using learnable parameters 'w', 'time_alpha', and 'time_beta'. It also applies 'self.receptance', 'self.key', and 'self.value' operations to the input, clamps extreme values of 'k', calculates cumulative sums, and performs element-wise multiplication. 
Finally, it applies a sigmoid function and layer normalization to the results.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":95-124", + "content": " def forward(self, x):\n B, T, C = x.size()\n TT = self.ctx_len\n w = F.pad(self.time_w, (0, TT))\n w = torch.tile(w, [TT])\n w = w[:, :-TT].reshape(-1, TT, 2 * TT - 1)\n w = w[:, :, TT-1:] # w is now a circulant matrix\n w = w[:, :T, :T] * self.time_alpha[:, :, :T] * self.time_beta[:, :T, :]\n x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1)\n # if hasattr(self, 'tiny_att'):\n # tiny_att = self.tiny_att(x, self.mask)\n k = self.key(x)\n v = self.value(x)\n r = self.receptance(x)\n k = torch.clamp(k, max=30, min=-60) # clamp extreme values. e^30 = 10^13\n k = torch.exp(k)\n sum_k = torch.cumsum(k, dim=1)\n kv = (k * v).view(B, T, self.n_head, self.head_size)\n wkv = (torch.einsum('htu,buhc->bthc', w, kv)).contiguous().view(B, T, -1)\n rwkv = torch.sigmoid(r) * wkv / sum_k\n rwkv = self.output(rwkv)\n # if hasattr(self, 'tiny_att'):\n # rwkv += tiny_att" + }, + { + "comment": "This code defines three classes for attention mechanisms in the RWKV model. The RWKV_ChannelMix class represents a channel-wise attention mechanism, while RWKV_TinyAttn is an extra tiny version of this attention mechanism. Both use linear layers and apply Mish activation before performing element-wise multiplication with a sigmoid gated weight. 
This helps in controlling the contribution of each input to the output during attention computation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":126-157", + "content": " return rwkv * self.time_gamma[:T, :]\nclass RWKV_ChannelMix(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n hidden_sz = 5 * config.n_ffn // 2 # can use smaller hidden_sz because of receptance gating\n self.key = nn.Linear(config.n_embd, hidden_sz)\n self.value = nn.Linear(config.n_embd, hidden_sz)\n self.weight = nn.Linear(hidden_sz, config.n_embd)\n self.receptance = nn.Linear(config.n_embd, config.n_embd)\n self.receptance.scale_init = 0\n self.weight.scale_init = 0\n def forward(self, x):\n B, T, C = x.size()\n x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1)\n k = self.key(x)\n v = self.value(x)\n r = self.receptance(x)\n wkv = self.weight(F.mish(k) * v) # i find mish is a bit better than gelu\n rwkv = torch.sigmoid(r) * wkv\n return rwkv\nclass RWKV_TinyAttn(nn.Module): # extra tiny attention" + }, + { + "comment": "This code defines a model class for attention mechanism in RWKV-v1. It initializes the model's attributes and then applies attention to input using multi-head self-attention. 
It transposes and masks the results with a mask before applying softmax and returning the output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":158-179", + "content": " def __init__(self, config):\n super().__init__()\n self.d_attn = config.rwkv_tiny_attn\n self.n_head = config.rwkv_tiny_head\n self.head_size = self.d_attn // self.n_head\n self.qkv = nn.Linear(config.n_embd, self.d_attn * 3)\n self.out = nn.Linear(self.d_attn, config.n_embd)\n def forward(self, x, mask):\n B, T, C = x.size()\n qkv = self.qkv(x)\n q, k, v = qkv.chunk(3, dim = -1)\n if self.n_head > 1:\n q = q.view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n k = k.view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n v = v.view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n qk = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_size)) # (B, nh, T, hs) * (B, nh, hs, T) -> (B, nh, T, T)\n qk = qk.masked_fill(mask == 0, float('-inf'))\n qk = F.softmax(qk, dim = -1)" + }, + { + "comment": "The code is defining a module for multi-head attention with rotary embedding and GEGLU FFN. The model computes the query, key, value matrices (QKV) using dot product attention. If there are multiple heads, it transposes and reshapes the QKV matrix to output a single sequence of size (B, T, C). It then applies a linear transformation from the output layer. 
The code also includes a RotaryEmbedding class for applying rotary positional encoding to the input sequence.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":180-202", + "content": " qkv = qk @ v # (B, nh, T, T) * (B, nh, T, hs) -> (B, nh, T, hs)\n if self.n_head > 1:\n qkv = qkv.transpose(1, 2).contiguous().view(B, T, -1) # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)\n return self.out(qkv)\n########################################################################################################\n# MHA_rotary: Multi-head Attention + Rotary Encoding + GeGLU FFN\n########################################################################################################\nclass RotaryEmbedding(torch.nn.Module):\n def __init__(self, dim, base=10000):\n super().__init__()\n inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim))\n self.register_buffer('inv_freq', inv_freq)\n self.seq_len_cached = None\n self.cos_cached = None\n self.sin_cached = None\n def forward(self, x, seq_len=None):\n if seq_len != self.seq_len_cached:\n self.seq_len_cached = seq_len" + }, + { + "comment": "Code snippet is from the RWKV-LM's model.py file and it defines a class `MHA_rotary` which applies multi-head attention with rotary position embedding. The function `apply_rotary_pos_emb` performs rotation of half embeddings and multiplies the queries (q) and keys (k) with cosine and sine of corresponding rotary position embeddings. 
It returns the transformed q and k for further computation in the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":203-229", + "content": " t = torch.arange(seq_len, device=x.device)\n freqs = torch.einsum('i,j->ij', t, self.inv_freq)\n emb = torch.cat((freqs, freqs), dim=-1).to(x.device)\n self.cos_cached = emb.cos()\n self.sin_cached = emb.sin()\n return self.cos_cached, self.sin_cached\ndef rotate_half(x):\n x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]\n return torch.cat((-x2, x1), -1)\n@torch.jit.script\ndef apply_rotary_pos_emb(q, k, cos, sin):\n cos, sin = cos[...,:q.shape[-2],:], sin[...,:q.shape[-2],:]\n return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)\nclass MHA_rotary(nn.Module):\n def __init__(self, config, layer_id, time_shift = False):\n super().__init__()\n self.layer_id = layer_id\n assert config.n_attn % config.n_head == 0\n self.n_head = config.n_head\n self.ctx_len = config.ctx_len\n self.head_size = config.n_attn // config.n_head\n if time_shift:\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))" + }, + { + "comment": "This code is initializing the model for the Transformer architecture. It defines the query, key, value, and output layers, as well as a rotary embedding layer. 
The forward function then reshapes the input and splits it into query, key, and value matrices, which are used in subsequent computations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":231-252", + "content": " self.query = nn.Linear(config.n_embd, config.n_attn)\n self.key = nn.Linear(config.n_embd, config.n_attn)\n self.value = nn.Linear(config.n_embd, config.n_attn)\n self.register_buffer(\"mask\", torch.tril(torch.ones(config.ctx_len, config.ctx_len)))\n self.rotary_ndims = int(self.head_size * 0.5)\n self.rotary_emb = RotaryEmbedding(self.rotary_ndims)\n self.output = nn.Linear(config.n_attn, config.n_embd)\n def forward(self, x):\n B, T, C = x.size()\n if hasattr(self, 'time_shift'):\n x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1)\n q = self.query(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n k = self.key(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n v = self.value(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n q, query_pass = q[..., :self.rotary_ndims], q[..., self.rotary_ndims:]" + }, + { + "comment": "This code applies rotary encoding to query and key tensors, computes self-attention weights using dot product between query and key tensors, applies a causal mask for sequence generation, normalizes the attention weights using softmax, multiplies the weighted keys with corresponding values, transposes and reshapes the result, and finally passes it through an output layer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":253-269", + "content": " k, key_pass = k[..., :self.rotary_ndims], k[..., self.rotary_ndims:]\n cos, sin = self.rotary_emb(q, seq_len=T)\n q, k = apply_rotary_pos_emb(q, k, cos, sin) # rotary encoding\n q = torch.cat((q, query_pass), dim=-1)\n k = torch.cat((k, key_pass), dim=-1)\n att = (q @ k.transpose(-2, -1)) * 
(1.0 / math.sqrt(k.size(-1))) # self-attention: (B, nh, T, hs) * (B, nh, hs, T) -> (B, nh, T, T)\n att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf')) # causal mask\n att = F.softmax(att, dim = -1) # softmax\n x = att @ v # (B, nh, T, T) * (B, nh, T, hs) -> (B, nh, T, hs)\n x = x.transpose(1, 2).contiguous().view(B, T, -1) # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)\n x = self.output(x)\n return x\nclass GeGLU(torch.nn.Module):" + }, + { + "comment": "In the given code, a Multi-Head Attention (MHA) module is being defined. It takes in an embedding dimension and creates multiple attention heads. Each head performs scaled dot product attention before concatenating and applying linear transformations to obtain final outputs.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":270-298", + "content": " def __init__(self, config, layer_id, time_shift = False):\n super().__init__()\n self.layer_id = layer_id\n if time_shift:\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n hidden_sz = 3 * config.n_ffn\n self.key = nn.Linear(config.n_embd, hidden_sz)\n self.value = nn.Linear(config.n_embd, hidden_sz)\n self.weight = nn.Linear(hidden_sz, config.n_embd)\n def forward(self, x):\n B, T, C = x.size()\n if hasattr(self, 'time_shift'):\n x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1)\n k = self.key(x)\n v = self.value(x) \n y = self.weight(F.gelu(k) * v)\n return y\n########################################################################################################\n# MHA_pro: with more tricks\n########################################################################################################\nclass MHA_pro(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()" + }, + { + "comment": "This code initializes the necessary parameters and layers for an attention mechanism in a transformer model. 
It sets layer-specific attributes, creates learnable parameters, registers a buffer for masking, and defines convolutional and embedding layers for processing input embeddings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":299-319", + "content": " self.layer_id = layer_id\n assert config.n_attn % config.n_head == 0\n self.n_head = config.n_head\n self.ctx_len = config.ctx_len\n self.head_size = config.n_attn // config.n_head\n self.time_w = nn.Parameter(torch.ones(self.n_head, config.ctx_len))\n self.time_alpha = nn.Parameter(torch.ones(self.n_head, 1, config.ctx_len))\n self.time_beta = nn.Parameter(torch.ones(self.n_head, config.ctx_len, 1))\n self.time_gamma = nn.Parameter(torch.ones(config.ctx_len, 1))\n self.register_buffer(\"mask\", torch.tril(torch.ones(config.ctx_len, config.ctx_len)))\n self.time_shift = nn.ZeroPad2d((0,0,1,-1))\n self.query = nn.Linear(config.n_embd, config.n_attn)\n self.key = nn.Linear(config.n_embd, config.n_attn)\n self.value = nn.Linear(config.n_embd, config.n_attn)\n self.rotary_ndims = int(self.head_size * 0.5)\n self.rotary_emb = RotaryEmbedding(self.rotary_ndims)\n self.head_mix = nn.Conv2d(self.n_head, self.n_head, kernel_size=1, bias=False) # talking heads" + }, + { + "comment": "This code defines a model for attention in transformer architecture. It includes the linear layer, time-shift mixing operation, and query/key/value projections. 
The rotary embedding is used for positional encoding, and the forward function performs matrix multiplications and element-wise operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":321-338", + "content": " self.output = nn.Linear(config.n_attn, config.n_embd)\n def forward(self, x):\n B, T, C = x.size()\n TT = self.ctx_len\n w = F.pad(self.time_w, (0, TT))\n w = torch.tile(w, [TT])\n w = w[:, :-TT].reshape(-1, TT, 2 * TT - 1)\n w = w[:, :, TT-1:] # w is now a circulant matrix\n w = w[:, :T, :T] * self.time_alpha[:, :, :T] * self.time_beta[:, :T, :]\n x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) # time-shift mixing\n q = self.query(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n k = self.key(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n v = self.value(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs)\n q, query_pass = q[..., :self.rotary_ndims], q[..., self.rotary_ndims:]\n k, key_pass = k[..., :self.rotary_ndims], k[..., self.rotary_ndims:]" + }, + { + "comment": "This code applies rotary positional encoding to the queries and keys, concatenates them with additional vectors, performs self-attention by multiplying and dividing query-key matrix by weights, applies a causal mask, calculates attention scores using softmax, multiplies attention scores by values, and finally reshapes the output", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":339-351", + "content": " cos, sin = self.rotary_emb(q, seq_len=T)\n q, k = apply_rotary_pos_emb(q, k, cos, sin) # rotary encoding\n q = torch.cat((q, query_pass), dim=-1)\n k = torch.cat((k, key_pass), dim=-1) \n att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) # self-attention: (B, nh, T, hs) * (B, nh, hs, T) -> (B, nh, T, T)\n att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf')) # 
causal mask\n att = F.softmax(att, dim = -1) # softmax\n att = att * w # time-weighting\n att = self.head_mix(att) # talking heads\n x = att @ v # (B, nh, T, T) * (B, nh, T, hs) -> (B, nh, T, hs)\n x = x.transpose(1, 2).contiguous().view(B, T, -1) # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C)" + }, + { + "comment": "This code defines two classes, RMSNorm and FixedNorm, both extending the nn.Module class in PyTorch. These classes are used as normalization layers for a GPT (Generative Pretrained Transformer) model. The RMSNorm class applies root mean square normalization to the input tensor x, while the FixedNorm class performs fixed normalization. Both classes return normalized inputs after applying a weight parameter to the output. The code also includes an initialization for the GPTConfig class which takes parameters such as vocab_size and ctx_len.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":353-384", + "content": " x = self.output(x) * self.time_gamma[:T, :]\n return x\n########################################################################################################\n# The GPT Model with our blocks\n########################################################################################################\nclass RMSNorm(nn.Module):\n def __init__(self, d):\n super().__init__()\n self.dd = d ** (-1. / 2)\n self.weight = nn.Parameter(torch.ones(d))\n def forward(self, x):\n norm_x = x.norm(2, dim=-1, keepdim=True)\n x_normed = x / (norm_x * self.dd + 1e-12)\n return self.weight * x_normed\nclass FixedNorm(nn.Module):\n def __init__(self, d):\n super().__init__()\n self.dd = d ** (-1. 
/ 2)\n def forward(self, x):\n norm_x = x.norm(2, dim=-1, keepdim=True)\n x_normed = x / (norm_x * self.dd + 1e-12)\n return x_normed\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):" + }, + { + "comment": "This code initializes a `Block` object with a specified configuration and layer ID. It includes multiple layers of normalization (LayerNorm) and different attention mechanisms depending on the model type specified in the configuration. The attention mechanisms can be RWKV_TimeMix, MHA_rotary, MHA_shift, or MHA_pro. These blocks are used to create a transformer model with adaptive layers for different tasks.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":385-413", + "content": " self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k,v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if config.model_type == 'RWKV':\n # self.ln1 = FixedNorm(config.n_embd)\n # self.ln2 = FixedNorm(config.n_embd)\n self.attn = RWKV_TimeMix(config, layer_id)\n self.mlp = RWKV_ChannelMix(config, layer_id)\n elif config.model_type == 'MHA_rotary':\n self.attn = MHA_rotary(config, layer_id)\n self.mlp = GeGLU(config, layer_id)\n elif config.model_type == 'MHA_shift':\n self.attn = MHA_rotary(config, layer_id, time_shift=True)\n self.mlp = GeGLU(config, layer_id, time_shift=True)\n elif config.model_type == 'MHA_pro':\n self.attn = MHA_pro(config, layer_id)" + }, + { + "comment": "The code initializes a GPT model with the given configuration. It includes an embedding layer, multiple blocks, final layer normalization, and attention parameters for context generation. 
The model type is checked to be 'RWKV' specifically.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":414-444", + "content": " self.mlp = RWKV_ChannelMix(config, layer_id)\n def forward(self, x):\n x = x + self.attn(self.ln1(x))\n x = x + self.mlp(self.ln2(x))\n return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.config = config\n self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i) for i in range(config.n_layer)])\n self.ln_f = nn.LayerNorm(config.n_embd)\n self.time_out = nn.Parameter(torch.ones(1,config.ctx_len,1)) # reduce confidence of early tokens\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n self.head_q = nn.Linear(config.n_embd, 256)\n self.head_q.scale_init = 0.01\n self.head_k = nn.Linear(config.n_embd, 256)\n self.head_k.scale_init = 0.01\n self.register_buffer(\"copy_mask\", torch.tril(torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n if self.config.model_type == 'RWKV':" + }, + { + "comment": "This code initializes and configures the RWKV model. It initializes the weights using normal distribution with mean 0.0 and standard deviation 0.01. 
It also separates out parameters to be regularized by weight decay or not, whitelisting Linear layers while blacklisting RMSNorm, LayerNorm, and Embedding layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":445-471", + "content": " RWKV_Init(self, config)\n else:\n self.apply(self._init_weights)\n logger.info(\"number of parameters: %e\", sum(p.numel() for p in self.parameters()))\n def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear, nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n # separate out all parameters to those that will and won't experience regularizing weight decay\n decay = set()\n no_decay = set()\n whitelist_weight_modules = (nn.Linear, )\n blacklist_weight_modules = (RMSNorm, nn.LayerNorm, nn.Embedding)\n for mn, m in self.named_modules():\n for pn, p in m.named_parameters():\n fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n if pn.endswith('bias') or ('time' in fpn) or ('head' in fpn):" + }, + { + "comment": "This code is organizing the model's parameters into two categories: decay and no_decay. It uses sets to ensure every parameter is considered, then creates optimizer groups for each category with different weight decay values. 
This helps in training by applying different learning rates to different parameters during backpropagation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":472-488", + "content": " no_decay.add(fpn)\n elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):\n decay.add(fpn)\n elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):\n no_decay.add(fpn)\n # validate that we considered every parameter\n param_dict = {pn: p for pn, p in self.named_parameters()}\n inter_params = decay & no_decay\n union_params = decay | no_decay\n assert len(inter_params) == 0, \"parameters %s made it into both decay/no_decay sets!\" % (str(inter_params), )\n assert len(param_dict.keys() - union_params) == 0, \"parameters %s were not separated into either decay/no_decay set!\" \\\n % (str(param_dict.keys() - union_params), )\n optim_groups = [\n {\"params\": [param_dict[pn] for pn in sorted(list(decay))], \"weight_decay\": train_config.weight_decay},\n {\"params\": [param_dict[pn] for pn in sorted(list(no_decay))], \"weight_decay\": 0.0}," + }, + { + "comment": "This code defines a model and its forward pass. It takes an input index, performs token embedding and blocks transformations, then calculates query and key matrices for attention. It applies the attention mechanism by multiplying queries and keys, scales the result, fills zeros, and multiplies with one-hot encoded indices. It also reduces the confidence of early tokens and adds the attention result to the model output. 
Finally, it calculates the cross-entropy loss if targets are provided.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/model.py\":489-516", + "content": " ]\n optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"\n x = self.tok_emb(idx)\n x = self.blocks(x)\n x = self.ln_f(x)\n q = self.head_q(x)[:,:T,:]\n k = self.head_k(x)[:,:T,:]\n c = (q @ k.transpose(-2, -1)) * (1.0 / 256)\n c = c.masked_fill(self.copy_mask[:T,:T] == 0, 0)\n c = c @ F.one_hot(idx, num_classes = self.config.vocab_size).float()\n x = x * self.time_out[:, :T, :] # reduce confidence of early tokens\n x = self.head(x) + c\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1))\n return x, loss" + } + ] +} \ No newline at end of file diff --git a/docs/doc/58f8ce2e-8500-43af-922b-ce7dcbd79acd.json b/docs/doc/58f8ce2e-8500-43af-922b-ce7dcbd79acd.json new file mode 100644 index 00000000..f5348760 --- /dev/null +++ b/docs/doc/58f8ce2e-8500-43af-922b-ce7dcbd79acd.json @@ -0,0 +1,15 @@ +{ + "summary": "This code uses top-k and top-p sampling techniques for model logits, combining them with temperature scaling and min_p_ratio control for text generation. It also allows setting a seed for reproducibility.", + "details": [ + { + "comment": "This code defines functions for top-k, top-p sampling, and a combination of these techniques. Top-k samples from the k highest logits while setting lower logits to negative infinity. Top-p keeps the p percent of highest probabilities while setting others to zero. 
The sample_logits function combines these methods, applies temperature scaling, and a min_p_ratio for control over sampling behavior.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/utils.py\":0-31", + "content": "import random\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\ndef top_k_logits(logits, k):\n v, ix = torch.topk(logits, k)\n out = logits.clone()\n out[out < v[:, [-1]]] = -float('Inf')\n return out\ndef top_p_probs(probs, p):\n out = probs.clone()\n sorted_probs, sorted_indices = torch.sort(out, descending=True)\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1)\n sorted_indices_to_remove = cumulative_probs > p\n sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()\n sorted_indices_to_remove[..., 0] = 0 \n indices_to_remove = sorted_indices[sorted_indices_to_remove]\n out[indices_to_remove] = 0\n return out\n# top-p + top-k + pow&ratio sampling\ndef sample_logits(logits, pos, temperature=1.0, top_k=None, top_p=None, min_p_pow=None, min_p_ratio=None):\n logits = logits[:, pos, :] / temperature\n probs = F.softmax(logits, dim=-1)\n if min_p_ratio is not None:\n limit = torch.pow(torch.max(probs), min_p_pow) * min_p_ratio" + }, + { + "comment": "This code snippet is performing model sampling using logits, handling top-k and top-p techniques for text generation. 
It also includes a seed setting function to ensure reproducibility.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/utils.py\":32-49", + "content": " logits[probs < limit] = -float('Inf')\n if top_k is not None:\n logits = top_k_logits(logits, top_k)\n probs = F.softmax(logits, dim=-1)\n if top_p is not None:\n probs[0] = top_p_probs(probs[0], top_p)\n ix = torch.multinomial(probs, num_samples=1)\n return ix[0][0].cpu()\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5d023d65-636b-49a9-b1bc-2ba12da3cc88.json b/docs/doc/5d023d65-636b-49a9-b1bc-2ba12da3cc88.json new file mode 100644 index 00000000..52595c45 --- /dev/null +++ b/docs/doc/5d023d65-636b-49a9-b1bc-2ba12da3cc88.json @@ -0,0 +1,35 @@ +{ + "summary": "The Trainer class trains a model, handles datasets and configurations, performs backpropagation, adjusts learning rate, logs loss, initializes progress bar, runs epochs, saves the model, and resets tokens counter for decay.", + "details": [ + { + "comment": "TrainerConfig class sets the maximum number of epochs, batch size, learning rate, optimizer settings, and other training parameters. The Trainer class initializes the model, train and test datasets, and a TrainerConfig object. It allows for customization by passing keyword arguments to the config instance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/trainer.py\":0-34", + "content": "import math, sys, datetime\nimport logging\nimport numpy as np\nfrom tqdm.auto import tqdm\nimport torch\nimport torch.optim as optim\nfrom torch.optim.lr_scheduler import LambdaLR\nfrom torch.utils.data.dataloader import DataLoader\nlogger = logging.getLogger(__name__)\n# print('logging to wandb... 
(comment it if you don\\'t have wandb)')\n# import wandb # comment this if you don't have wandb\nclass TrainerConfig:\n max_epochs = 10\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0\n weight_decay = 0.01\n lr_decay = False # linear warmup followed by cosine decay\n warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper\n final_tokens = 260e9 # at which point do we reach lr_final\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k,v in kwargs.items():\n setattr(self, k, v)\nclass Trainer:\n def __init__(self, model, train_dataset, test_dataset, config):" + }, + { + "comment": "This code initializes the trainer class, setting its model, train and test datasets, config, average loss, and steps. It also checks for availability of CUDA and moves the model to GPU if available. The get_run_name method returns a string based on the model's configuration parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/trainer.py\":35-56", + "content": " self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.steps = 0\n if 'wandb' in sys.modules:\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)\n self.device = 'cpu'\n if torch.cuda.is_available(): # take over whatever gpus are on the system\n self.device = torch.cuda.current_device()\n self.model = torch.nn.DataParallel(self.model).to(self.device)\n def get_run_name(self):\n raw_model = self.model.module if hasattr(self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + cfg.model_type + '-' + 
str(cfg.n_layer) + '-' + str(cfg.n_embd)" + }, + { + "comment": "Trains the model based on specified dataset, applies optimization for the configured optimizer and calculates average loss across all GPUs.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/trainer.py\":57-80", + "content": " return run_name\n def train(self):\n model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n loader = DataLoader(data, shuffle=True, pin_memory=True,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n pbar = tqdm(enumerate(loader), total=len(loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n for it, (x, y) in pbar:\n x = x.to(self.device) # place data on the correct device\n y = y.to(self.device)\n with torch.set_grad_enabled(is_train):\n _, loss = model(x, y) # forward the model\n loss = loss.mean() # collapse all losses if they are scattered on multiple gpus" + }, + { + "comment": "This code snippet checks if training is ongoing. If it is, it performs backpropagation, updates parameters, clips gradients, and handles learning rate decay based on the number of tokens processed. It uses linear warmup for the first config.warmup_tokens and cosine decay afterwards.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/trainer.py\":82-98", + "content": " if is_train: # backprop and update the parameters \n model.zero_grad()\n loss.backward()\n torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)\n optimizer.step()\n if config.lr_decay: # decay the learning rate based on our progress\n self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. 
label is not -100)\n lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + (1 - lr_final_factor) * float(self.tokens) / float(config.warmup_tokens)\n progress = 0\n else:\n # cosine learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))" + }, + { + "comment": "This code adjusts the learning rate based on progress and fine-tunes it. It then updates the optimizer's learning rate, logs the loss, and calculates a moving average of the loss for tracking.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/trainer.py\":99-118", + "content": " # progress = min(progress * 1.1, 1.0) # more fine-tuning with low LR\n lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor / 2) * math.cos(math.pi * progress) # better 1.0 ~ 0.1\n lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n else:\n lr = config.learning_rate\n now_loss = loss.item() # report progress\n if 'wandb' in sys.modules:\n wandb.log({\"loss\": now_loss}, step = self.steps * self.config.batch_size)\n self.steps += 1\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n # factor = max(1.0 / 300, 1.0 / math.sqrt(it + 1))\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * (1.0 - factor) + now_loss * factor" + }, + { + "comment": "This code is initializing a progress bar and updating it with epoch, progress percentage, iteration, perplexity, loss value, and learning rate. It also resets the tokens counter for learning rate decay and runs the training epochs. 
If the current epoch meets save frequency or is the last epoch, it saves the model to a file.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v1/src/trainer.py\":119-129", + "content": " pbar.set_description(f\"epoch {epoch+1} progress {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}\")\n while True:\n self.tokens = 0 # counter used for learning rate decay\n for epoch in range(config.max_epochs):\n run_epoch('train')\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n raw_model = self.model.module if hasattr(self.model, \"module\") else self.model # DataParallel wrappers keep raw model object in .module\n torch.save(raw_model, self.config.epoch_save_path + str(epoch+1) + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/5f8882f4-7e43-418c-9690-383ff0cc9e42.json b/docs/doc/5f8882f4-7e43-418c-9690-383ff0cc9e42.json new file mode 100644 index 00000000..560d719b --- /dev/null +++ b/docs/doc/5f8882f4-7e43-418c-9690-383ff0cc9e42.json @@ -0,0 +1,20 @@ +{ + "summary": "This code defines a tokenizer class using a trie data structure, with methods for encoding and decoding bytes. 
The TRIE_TOKENIZER initializes the trie by reading lines from a file and includes a 'find_longest' method for finding the longest token within a key string.", + "details": [ + { + "comment": "Class \"TRIE\" for RWKV Language Model - represents a Trie data structure used for tokenizing input strings, storing values associated with each character path in the trie.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/tokenizer/rwkv_tokenizer.py\":0-31", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nclass TRIE:\n __slots__ = tuple(\"ch,to,values,front\".split(\",\"))\n to:list\n values:set\n def __init__(self, front=None, ch=None):\n self.ch = ch\n self.to = [None for ch in range(256)]\n self.values = set()\n self.front = front\n def __repr__(self):\n fr = self\n ret = []\n while(fr!=None):\n if(fr.ch!=None):\n ret.append(fr.ch)\n fr = fr.front\n return \" \"%(ret[::-1], self.values)\n def add(self, key:bytes, idx:int=0, val=None):\n if(idx == len(key)):\n if(val is None):\n val = key\n self.values.add(val)\n return self\n ch = key[idx]\n if(self.to[ch] is None):\n self.to[ch] = TRIE(front=self, ch=ch)" + }, + { + "comment": "Code is defining a TRIE data structure for tokenizing and mapping indices to tokens in a specified file. The `TRIE_TOKENIZER` class initializes the data structure by reading lines from a file, converting values to bytes, storing them in the TRIE, and creating a reverse index. 
The `find_longest` method is used for finding the longest token within a given key string.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/tokenizer/rwkv_tokenizer.py\":32-65", + "content": " return self.to[ch].add(key, idx=idx+1, val=val)\n def find_longest(self, key:bytes, idx:int=0):\n u:TRIE = self\n ch:int = key[idx]\n while(u.to[ch] is not None):\n u = u.to[ch]\n idx += 1\n if(u.values):\n ret = idx, u, u.values\n if(idx==len(key)):\n break\n ch = key[idx]\n return ret\nclass TRIE_TOKENIZER():\n def __init__(self, file_name):\n self.idx2token = {}\n sorted = [] # must be already sorted\n with open(file_name, \"r\", encoding=\"utf-8\") as f:\n lines = f.readlines()\n for l in lines:\n idx = int(l[:l.index(' ')])\n x = eval(l[l.index(' '):l.rindex(' ')])\n x = x.encode(\"utf-8\") if isinstance(x, str) else x\n assert isinstance(x, bytes)\n assert len(x) == int(l[l.rindex(' '):])\n sorted += [x]\n self.idx2token[idx] = x\n self.token2idx = {}\n for k,v in self.idx2token.items():\n self.token2idx[v] = int(k)" + }, + { + "comment": "This code defines a tokenizer class that can encode and decode bytes using a trie data structure. The encodeBytes method converts input bytes to tokens, while the decodeBytes method reconverts tokens back into bytes. The encode and decode methods handle Unicode strings. 
The printTokens method prints the tokens along with their indices.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/tokenizer/rwkv_tokenizer.py\":67-102", + "content": " self.root = TRIE()\n for t, i in self.token2idx.items():\n _ = self.root.add(t, val=(t, i))\n def encodeBytes(self, src:bytes):\n idx:int = 0\n tokens = []\n while (idx < len(src)):\n _idx:int = idx\n idx, _, values = self.root.find_longest(src, idx)\n assert(idx != _idx)\n _, token = next(iter(values)) \n tokens.append(token)\n return tokens\n def decodeBytes(self, tokens):\n return b''.join(map(lambda i: self.idx2token[i], tokens))\n def encode(self, src):\n return self.encodeBytes(src.encode(\"utf-8\"))\n def decode(self, tokens):\n try:\n return self.decodeBytes(tokens).decode('utf-8')\n except:\n return '\\ufffd' # bad utf-8\n def printTokens(self, tokens):\n for i in tokens:\n s = self.idx2token[i]\n try:\n s = s.decode('utf-8')\n except:\n pass\n print(f'{repr(s)}{i}', end=' ')\n print()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/724c91af-09d9-4076-a1f3-478b2e543843.json b/docs/doc/724c91af-09d9-4076-a1f3-478b2e543843.json new file mode 100644 index 00000000..a5108830 --- /dev/null +++ b/docs/doc/724c91af-09d9-4076-a1f3-478b2e543843.json @@ -0,0 +1,85 @@ +{ + "summary": "The code utilizes DeepSpeed, VGG16 and RWKV-LM layers, VGG-19 pretrained features, L2 pooling layers, calculates distances for score calculation, and defines the RWKV-v4neo language model with Conv2d layers, BatchNorm2d, Mish activation function, LightningModule, optimizer, and encoder-decoder architecture for multi-device training support.", + "details": [ + { + "comment": "This code imports necessary libraries and defines functions and modules for a machine learning model. 
It uses DeepSpeed for efficient training and includes functions like L2pooling for data processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":0-29", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport os, math, gc\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torchvision as vision\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\nfrom pytorch_lightning.strategies import DeepSpeedStrategy\nimport deepspeed\nfrom deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n# from pytorch_msssim import MS_SSIM\ndef __nop(ob):\n return ob\nMyModule = torch.jit.ScriptModule\n# MyFunction = __nop\nMyFunction = torch.jit.script_method\nimport clip\nfrom transformers import CLIPModel\nclass L2pooling(nn.Module):\n def __init__(self, filter_size=5, stride=2, channels=None, pad_off=0):\n super(L2pooling, self).__init__()\n self.padding = (filter_size - 2) // 2" + }, + { + "comment": "This code defines a class for a neural network model. The model has stages, and each stage contains convolutional layers from the VGG16 model followed by a custom layer called RWKV-LM/RWKV-v4neo/src/model_img.py:30-62. The custom layer applies a Hanning window function to the filter, then normalizes it, and repeats it for each channel. 
Finally, it performs convolution with the input image using specified stride, padding, and groups.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":30-62", + "content": " self.stride = stride\n self.channels = channels\n a = np.hanning(filter_size)[1:-1]\n g = torch.Tensor(a[:, None] * a[None, :])\n g = g / torch.sum(g)\n self.register_buffer(\n \"filter\", g[None, None, :, :].repeat((self.channels, 1, 1, 1))\n )\n def forward(self, input):\n input = input**2\n out = F.conv2d(\n input,\n self.filter,\n stride=self.stride,\n padding=self.padding,\n groups=input.shape[1],\n )\n return (out + 1e-12).sqrt()\nclass DISTS(torch.nn.Module):\n def __init__(self, load_weights=True):\n super(DISTS, self).__init__()\n vgg_pretrained_features = vision.models.vgg16(\n weights=\"VGG16_Weights.IMAGENET1K_V1\"\n ).features\n self.stage1 = torch.nn.Sequential()\n self.stage2 = torch.nn.Sequential()\n self.stage3 = torch.nn.Sequential()\n self.stage4 = torch.nn.Sequential()\n self.stage5 = torch.nn.Sequential()\n for x in range(0, 4):" + }, + { + "comment": "The code creates a model architecture by adding modules to the stages of the network. It uses VGG-19 pretrained features for each stage and adds L2 pooling layers in between. 
The mean and std values are registered as buffers, and a list of channel numbers is created.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":63-85", + "content": " self.stage1.add_module(str(x), vgg_pretrained_features[x])\n self.stage2.add_module(str(4), L2pooling(channels=64))\n for x in range(5, 9):\n self.stage2.add_module(str(x), vgg_pretrained_features[x])\n self.stage3.add_module(str(9), L2pooling(channels=128))\n for x in range(10, 16):\n self.stage3.add_module(str(x), vgg_pretrained_features[x])\n self.stage4.add_module(str(16), L2pooling(channels=256))\n for x in range(17, 23):\n self.stage4.add_module(str(x), vgg_pretrained_features[x])\n self.stage5.add_module(str(23), L2pooling(channels=512))\n for x in range(24, 30):\n self.stage5.add_module(str(x), vgg_pretrained_features[x])\n self.register_buffer(\n \"mean\", torch.tensor([0.485, 0.456, 0.406]).view(1, -1, 1, 1)\n )\n self.register_buffer(\n \"std\", torch.tensor([0.229, 0.224, 0.225]).view(1, -1, 1, 1)\n )\n self.chns = [3, 64, 128, 256, 512, 512]\n self.register_buffer(" + }, + { + "comment": "The code initializes two parameters, \"alpha\" and \"beta\", with random values and normalizes their data. It then loads weights from a file and assigns them to the respective parameters. 
Finally, it sets the gradient flag to False for all parameters and defines forward functions to perform calculations on input features.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":86-115", + "content": " \"alpha\", nn.Parameter(torch.randn(1, sum(self.chns), 1, 1))\n )\n self.register_buffer(\"beta\", nn.Parameter(torch.randn(1, sum(self.chns), 1, 1)))\n self.alpha.data.normal_(0.1, 0.01)\n self.beta.data.normal_(0.1, 0.01)\n weights = torch.load(\"test/DISTS_weights.pt\")\n self.alpha.data = weights[\"alpha\"]\n self.beta.data = weights[\"beta\"]\n for param in self.parameters():\n param.requires_grad = False\n def forward_once(self, x):\n h = (x - self.mean) / self.std\n h = self.stage1(h)\n h_relu1_2 = h\n h = self.stage2(h)\n h_relu2_2 = h\n h = self.stage3(h)\n h_relu3_3 = h\n h = self.stage4(h)\n h_relu4_3 = h\n h = self.stage5(h)\n h_relu5_3 = h\n return [x, h_relu1_2, h_relu2_2, h_relu3_3, h_relu4_3, h_relu5_3]\n def forward(self, x, y, require_grad=False, batch_average=False):\n if require_grad:\n feats0 = self.forward_once(x)\n feats1 = self.forward_once(y)" + }, + { + "comment": "This code calculates the distances between two feature embeddings and assigns weights to them based on alpha and beta. It first performs a forward pass for x and y, then normalizes alpha and beta by dividing their sums with the total number of channels. For each channel, it computes the mean and variance of x and y, and also calculates the covariance between x and y. 
Finally, it applies weights to the distances and sums them up for both embeddings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":116-140", + "content": " else:\n with torch.no_grad():\n feats0 = self.forward_once(x)\n feats1 = self.forward_once(y)\n dist1 = 0\n dist2 = 0\n c1 = 1e-6\n c2 = 1e-6\n w_sum = self.alpha.sum() + self.beta.sum()\n alpha = torch.split(self.alpha / w_sum, self.chns, dim=1)\n beta = torch.split(self.beta / w_sum, self.chns, dim=1)\n for k in range(len(self.chns)):\n x_mean = feats0[k].mean([2, 3], keepdim=True)\n y_mean = feats1[k].mean([2, 3], keepdim=True)\n S1 = (2 * x_mean * y_mean + c1) / (x_mean**2 + y_mean**2 + c1)\n dist1 = dist1 + (alpha[k] * S1).sum(1, keepdim=True)\n x_var = ((feats0[k] - x_mean) ** 2).mean([2, 3], keepdim=True)\n y_var = ((feats1[k] - y_mean) ** 2).mean([2, 3], keepdim=True)\n xy_cov = (feats0[k] * feats1[k]).mean(\n [2, 3], keepdim=True\n ) - x_mean * y_mean\n S2 = (2 * xy_cov + c2) / (x_var + y_var + c2)\n dist2 = dist2 + (beta[k] * S2).sum(1, keepdim=True)" + }, + { + "comment": "This code contains a function that calculates a score based on distances and returns it. If batch_average is True, the score is averaged across all elements in the batch. The ToBinary class performs binary rounding of input values. 
The R_ENCODER class initializes a model with BatchNorm2d and Conv2d layers for image processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":142-172", + "content": " score = 1 - (dist1 + dist2).squeeze()\n if batch_average:\n return score.mean()\n else:\n return score\n class ToBinary(torch.autograd.Function):\n @staticmethod\n def forward(ctx, x):#, noise_scale):\n # if noise_scale > 0:\n # noise_min = 0.5 - noise_scale / 2\n # noise_max = 0.5 + noise_scale / 2\n # return torch.floor(x + torch.empty_like(x).uniform_(noise_min, noise_max))\n # else:\n return torch.floor(x + 0.5) # no need for noise when we have plenty of data\n @staticmethod\n def backward(ctx, grad_output):\n return grad_output.clone()#, None\n########################################################################################################\nclass R_ENCODER(MyModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.Bxx = nn.BatchNorm2d(dd*64)\n self.CIN = nn.Conv2d(3, dd, kernel_size=3, padding=1)" + }, + { + "comment": "This code defines multiple convolutional layers (Conv2d) and batch normalization layers (BatchNorm2d) for a neural network model. 
The layers have different input/output dimensions, kernel sizes, and padding values to perform feature extraction and normalization in the model's architecture.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":173-192", + "content": " self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*4)\n self.C00 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.B20 = nn.BatchNorm2d(dd*64)\n self.C20 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C23 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)" + }, + { + "comment": "This code defines a forward function for a neural network. It uses Mish activation functions and applies convolutional layers with batch normalization for feature extraction and image processing. 
The output is generated by combining the outputs of multiple convolutional layers, and pixel unshuffling is used to change the channel dimension.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":193-220", + "content": " # self.B21 = nn.BatchNorm2d(dd*64)\n # self.C24 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C25 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n # self.C26 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C27 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd*64, args.my_img_bit, kernel_size=3, padding=1)\n @MyFunction\n def forward(self, img):\n ACT = F.mish\n x = self.CIN(img)\n xx = self.Bxx(F.pixel_unshuffle(x, 8))\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_unshuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n # x = x + self.C25(ACT(self.C24(ACT(self.B21(x)))))" + }, + { + "comment": "The given code is a part of the RWKV model implementation. It defines the Decoder class which takes in arguments and initializes multiple Conv2d layers for processing. The Conv2d layers are responsible for feature extraction and down-sampling. The batch normalization layer (BatchNorm2d) helps with speeding up the training and improving model performance by reducing internal covariate shift. However, there is a comment suggesting that the BatchNorm2d could be removed or reduced to improve performance and speed. 
Additionally, some of the Conv2d layers are commented out, indicating they might be unused or under development.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":221-244", + "content": " # x = x + self.C27(ACT(self.C26(x)))\n x = self.COUT(x + xx)\n return torch.sigmoid(x)\n########################################################################################################\nclass R_DECODER(MyModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n dd = 8\n self.CIN = nn.Conv2d(args.my_img_bit, dd*64, kernel_size=3, padding=1)\n self.B00 = nn.BatchNorm2d(dd*64)\n self.C00 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C01 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n self.C02 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n self.C03 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n # self.B01 = nn.BatchNorm2d(dd*64)\n # self.C04 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C05 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)\n # self.C06 = nn.Conv2d(dd*64, 256, kernel_size=3, padding=1)\n # self.C07 = nn.Conv2d(256, dd*64, kernel_size=3, padding=1)" + }, + { + "comment": "This code defines a model for the RWKV-v4neo language model. It includes multiple convolutional and batch normalization layers, as well as the Mish activation function (F.mish). 
The forward function applies these layers to an input code and performs addition operations between different layer outputs.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":246-269", + "content": " self.B10 = nn.BatchNorm2d(dd*16)\n self.C10 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C11 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.C12 = nn.Conv2d(dd*16, 256, kernel_size=3, padding=1)\n self.C13 = nn.Conv2d(256, dd*16, kernel_size=3, padding=1)\n self.B20 = nn.BatchNorm2d(dd*4)\n self.C20 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C21 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.C22 = nn.Conv2d(dd*4, 256, kernel_size=3, padding=1)\n self.C23 = nn.Conv2d(256, dd*4, kernel_size=3, padding=1)\n self.Cx0 = nn.Conv2d(dd, 32, kernel_size=3, padding=1)\n self.Cx1 = nn.Conv2d(32, dd, kernel_size=3, padding=1)\n self.COUT = nn.Conv2d(dd, 3, kernel_size=3, padding=1)\n @MyFunction\n def forward(self, code):\n ACT = F.mish\n x = self.CIN(code)\n x = x + self.C01(ACT(self.C00(ACT(self.B00(x)))))\n x = x + self.C03(ACT(self.C02(x)))\n # x = x + self.C05(ACT(self.C04(ACT(self.B01(x)))))" + }, + { + "comment": "This code snippet belongs to a LightningModule class in the RWKV-v4neo package. It includes a cosine_loss function and an RWKV_IMG class which has an encoder, decoder, and clip_model as its components. The encoder and decoder are instances of R_ENCODER and R_DECODER classes respectively. 
The code snippet defines operations to be performed on the input x using various transformations and normalizations before returning a sigmoid transformed output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":270-305", + "content": " # x = x + self.C07(ACT(self.C06(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C11(ACT(self.C10(ACT(self.B10(x)))))\n x = x + self.C13(ACT(self.C12(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.C21(ACT(self.C20(ACT(self.B20(x)))))\n x = x + self.C23(ACT(self.C22(x)))\n x = F.pixel_shuffle(x, 2)\n x = x + self.Cx1(ACT(self.Cx0(x)))\n x = self.COUT(x)\n return torch.sigmoid(x)\n########################################################################################################`\ndef cosine_loss(x, y):\n x = F.normalize(x, dim=-1)\n y = F.normalize(y, dim=-1)\n return 1 - torch.einsum('ij,ij->i',[x,y])\nclass RWKV_IMG(pl.LightningModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n self.encoder = R_ENCODER(args)\n self.decoder = R_DECODER(args)\n self.clip_model = None\n clip_name = args.my_img_clip\n if clip_name == 'B32':\n clip_name = 'ViT-B/32'\n elif clip_name == 'B16':" + }, + { + "comment": "In the provided code snippet, the `clip_name` is assigned based on certain conditions. If it's 'ViT-B/16', no change. If 'L14', it's changed to 'ViT-L/14'. And if 'OB32', it becomes \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\" and a pretrained CLIP model is loaded for this case. 
The code also initializes `clip_mean` and `clip_std` buffers with specific values, sets parameters of 'clip_model' as non-trainable, and configures the optimizers based on 'args'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":306-331", + "content": " clip_name = 'ViT-B/16'\n elif clip_name == 'L14':\n clip_name = 'ViT-L/14'\n elif clip_name == 'OB32':\n clip_name = \"laion/CLIP-ViT-B-32-laion2B-s34B-b79K\"\n self.clip_model = CLIPModel.from_pretrained(clip_name)\n self.clip_model.encode_image = self.clip_model.get_image_features\n if self.clip_model == None:\n self.clip_model, _ = clip.load(clip_name, jit = True)\n self.register_buffer(\n \"clip_mean\", torch.tensor([0.48145466, 0.4578275, 0.40821073]).view(1, 3, 1, 1)\n )\n self.register_buffer(\n \"clip_std\", torch.tensor([0.26862954, 0.26130258, 0.27577711]).view(1, 3, 1, 1)\n )\n for n, p in self.named_parameters():\n if 'clip_model' in n:\n p.requires_grad = False\n self.loss_dists = DISTS()\n # self.loss_ssim = MS_SSIM(data_range=1, size_average=True, channel=3)\n def configure_optimizers(self):\n args = self.args\n optim_groups = [" + }, + { + "comment": "This code defines an optimizer function that chooses between DeepSpeedCPUAdam and FusedAdam based on the deepspeed_offload flag. The optimizer takes in optim_groups, lr (learning rate), betas, eps (epsilon), bias_correction, adam_w_mode, weight_decay, and amsgrad as parameters. It returns an instance of either DeepSpeedCPUAdam or FusedAdam depending on whether deepspeed_offload is True or False. 
The @property method deepspeed_offload retrieves the strategy from the trainer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":332-359", + "content": " {\"params\": [p for n, p in self.named_parameters()], \"weight_decay\": 0.0},\n ]\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(\n optim_groups,\n lr=self.args.lr_init,\n betas=self.args.betas,\n eps=self.args.adam_eps,\n bias_correction=True,\n adamw_mode=False,\n weight_decay=0,\n amsgrad=False,\n )\n return FusedAdam(\n optim_groups,\n lr=self.args.lr_init,\n betas=self.args.betas,\n eps=self.args.adam_eps,\n bias_correction=True,\n adam_w_mode=False,\n weight_decay=0,\n amsgrad=False,\n )\n # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)\n @property\n def deepspeed_offload(self) -> bool:\n strategy = self.trainer.strategy" + }, + { + "comment": "This code defines a model class that takes an image as input and outputs an image. It uses an encoder and decoder for processing the input. 
The model also has a training step where it saves images at specific global steps during training if running on multiple devices.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":360-384", + "content": " if isinstance(strategy, DeepSpeedStrategy):\n config = strategy.config[\"zero_optimization\"]\n return config.get(\"offload_optimizer\") or config.get(\"offload_param\")\n return False\n def forward(self, img):\n z = self.encoder(img)\n z = ToBinary.apply(z)#, self.args.my_img_noise_scale)\n out = self.decoder(z)\n return out\n def training_step(self, batch, batch_idx):\n args = self.args\n img, txt = batch\n out = self(img)\n if self.trainer.is_global_zero:\n if (self.trainer.global_step + 1) % (100 * int(args.devices)) == 0:\n img_dir = f\"test/image_model/{args.run_name}\"\n if not os.path.exists(img_dir):\n os.makedirs(img_dir)\n vision.utils.save_image(\n img[:4], f\"{img_dir}/{self.trainer.global_step}-src.jpg\"#, padding=0\n )\n vision.utils.save_image(\n out[:4], f\"{img_dir}/{self.trainer.global_step}-out.jpg\"#, padding=0" + }, + { + "comment": "This code snippet is for a deep learning model that takes input image and produces output, then calculates loss for each step during training. It uses different types of losses such as SSIM (line 396) and cosine similarity (lines 390-395). 
The code also handles global average pooling and all-gathering for distributed training.\n\nQuestion: What is the purpose of the 'args' argument used in this code?", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":385-412", + "content": " )\n # loss_ssim = 1 - self.loss_ssim(out, img)\n loss_dists = self.loss_dists(out, img, require_grad=True, batch_average=True)\n iii = self.clip_model.encode_image((img - self.clip_mean) / self.clip_std)\n ooo = self.clip_model.encode_image((out - self.clip_mean) / self.clip_std)\n loss_clip = torch.mean(cosine_loss(iii, ooo))\n if args.my_img_l1_scale > 0:\n loss_l1 = F.l1_loss(out, img)\n return loss_dists + loss_clip * args.my_img_clip_scale + loss_l1 * args.my_img_l1_scale\n else:\n return loss_dists + loss_clip * args.my_img_clip_scale\n def training_step_end(self, batch_parts):\n all = self.all_gather(batch_parts)\n if self.trainer.is_global_zero:\n self.trainer.my_loss_all = all\n def generate_init_weight(self):\n print(\n f\"\"\"\n############################################################################\n#\n# Init model weight (slow for large models)...\n#\n############################################################################" + }, + { + "comment": "This code snippet is used to create a dictionary of model parameters, where it loads each parameter from the model's state_dict and applies necessary scaling or conversion based on its name. It also handles different float modes such as fp16 and bf16 as specified in the environment variable RWKV_FLOAT_MODE. 
Finally, it collects garbage, empties CUDA cache, and returns the dictionary of parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model_img.py\":413-445", + "content": "\"\"\"\n )\n m = {}\n for n in self.state_dict():\n scale = 1\n p = self.state_dict()[n]\n shape = p.shape\n ss = n.split('.')\n # if ss[0] in ['encoder', 'decoder']:\n # if ss[2] == 'bias':\n # scale = 0\n # # elif n == 'encoder.CIN.weight':\n # # nn.init.dirac_(p)\n # else:\n # try:\n # if ss[1][0] == 'C' and (int(ss[1][2]) % 2 == 1):\n # scale = 0\n # except:\n # pass\n # m[n] = p * scale\n m[n] = p\n m[n] = m[n].cpu()\n if os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n m[n] = m[n].half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n m[n] = m[n].bfloat16()\n gc.collect()\n torch.cuda.empty_cache()\n return m" + } + ] +} \ No newline at end of file diff --git a/docs/doc/73961a51-82e3-4a06-835a-c9a55c60adbb.json b/docs/doc/73961a51-82e3-4a06-835a-c9a55c60adbb.json new file mode 100644 index 00000000..237924fd --- /dev/null +++ b/docs/doc/73961a51-82e3-4a06-835a-c9a55c60adbb.json @@ -0,0 +1,40 @@ +{ + "summary": "TrainerConfig class establishes training parameters, GPU support, Wandb setup, initializes device, defines epoch function, creates optimizer, sets DataLoader and trains model. Code handles learning rate multiplier update, progress logging, and moving averages for loss calculation in specified epochs loop.", + "details": [ + { + "comment": "TrainerConfig class initializes model training parameters, setting maximum epochs to 10, batch size to 64, learning rate to 4e-4, gradient norm clipping threshold to 1.0, and optimizer betas and eps for Adam optimizer. 
The code also sets up environment for efficient GPU usage with CUDA and TF32 support.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":0-35", + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nfrom torch.utils.data.dataloader import DataLoader\nfrom torch.optim.lr_scheduler import LambdaLR\nfrom torch.nn import functional as F\nimport torch.nn as nn\nimport torch.optim as optim\nimport torch\nfrom tqdm.auto import tqdm\nimport numpy as np\nimport logging\nimport os\nimport datetime\nimport sys\nimport math\n# import wandb # comment this if you don't have wandb\n# print('logging to wandb... (comment it if you don\\'t have wandb)')\nlogger = logging.getLogger(__name__)\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nlog_file = open(\"mylog.txt\", \"a\")\nclass TrainerConfig:\n max_epochs = 10\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0" + }, + { + "comment": "The code snippet represents the initialisation of a Trainer class. It accepts a model, train and test datasets, and a configuration. If wandb (a popular ML experiment tracking tool) is available, it combines the config with the model's config and initiates an experiment with a specific project name and timestamped run name. 
The device is set to 'cpu', but if GPUs are available, they will be used.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":36-66", + "content": " lr_decay = True # linear warmup followed by cosine decay\n warmup_tokens = 0\n final_tokens = 0\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Trainer:\n def __init__(self, model, train_dataset, test_dataset, config):\n self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.steps = 0\n if 'wandb' in sys.modules:\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' +\n datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)\n self.device = 'cpu'\n if torch.cuda.is_available(): # take over whatever gpus are on the system" + }, + { + "comment": "The code initializes the device, retrieves the run name based on model configuration, and defines a function to run an epoch. 
It also creates an optimizer for the model and sets up a data loader if necessary, all within the context of training the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":67-91", + "content": " self.device = torch.cuda.current_device()\n def get_run_name(self):\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \\\n cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd)\n return run_name\n def train(self):\n model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n if config.num_workers > 0:\n loader = DataLoader(data, shuffle=False, pin_memory=True,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n else:" + }, + { + "comment": "This code creates a DataLoader for iterating through data with shuffling disabled, and sets the batch size and number of workers according to config settings. It then initializes a progress bar (pbar) to track the progress through the loader. In training mode, it loops through each iteration (it, x, y), moves tensors to the device, forwards model, zeroes gradients, backprops, applies gradient clipping if specified, and steps the optimizer. 
If learning rate decay is enabled, it also decays the learning rate based on progress.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":92-116", + "content": " loader = DataLoader(data, shuffle=False,\n batch_size=config.batch_size,\n num_workers=config.num_workers)\n pbar = tqdm(enumerate(loader), total=len(\n loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n for it, (x, y) in pbar:\n x = x.to(self.device) # place data on the correct device\n y = y.to(self.device)\n with torch.set_grad_enabled(is_train):\n _, loss = model(x, y) # forward the model\n if is_train: # backprop and update the parameters\n model.zero_grad()\n loss.backward()\n if config.grad_norm_clip > 0:\n torch.nn.utils.clip_grad_norm_(\n model.parameters(), config.grad_norm_clip)\n optimizer.step()\n if config.lr_decay: # decay the learning rate based on our progress" + }, + { + "comment": "This code calculates the learning rate multiplier based on the number of tokens processed. If fewer than warmup_tokens, uses linear warmup; otherwise, applies cosine learning rate decay. The learning rate is adjusted according to the current token count and configuration parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":117-131", + "content": " # number of tokens processed this step (i.e. 
label is not -100)\n self.tokens += (y >= 0).sum()\n lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + \\\n (1 - lr_final_factor) * float(self.tokens) / \\\n float(config.warmup_tokens)\n progress = 0\n else:\n # cosine learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(\n max(1, config.final_tokens - config.warmup_tokens))\n lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor /\n 2) * math.cos(math.pi * progress) # better 1.0 ~ 0.1" + }, + { + "comment": "Updates learning rate based on config. Sets the learning rate for optimizer parameter groups and logs loss, steps, average loss, progress, and learning rate using WandB. Calculates and updates average loss using a moving average factor. Updates progress bar description with mini-epoch, progress percentage, perplexity, current loss, and learning rate in exponential format.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":132-153", + "content": " lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n else:\n lr = config.learning_rate\n now_loss = loss.item() # report progress\n self.lr = lr\n if 'wandb' in sys.modules:\n wandb.log({\"loss\": now_loss},\n step=self.steps * self.config.batch_size)\n self.steps += 1\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * \\\n (1.0 - factor) + now_loss * factor\n pbar.set_description(\n f\"mini-epoch {epoch+1} prog {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}\")" + }, + { + "comment": "The code initializes a tokens counter and loops over the specified number of epochs. 
For each epoch, it runs the 'train' function, logs information to a file, saves the model's state if necessary (based on config), and flushes the log file.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/trainer.py\":155-169", + "content": " self.tokens = 0 # counter used for learning rate decay\n for epoch in range(config.max_epochs):\n run_epoch('train')\n log_file.write(\n f'{epoch+1} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} \\n')\n log_file.flush()\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n # DataParallel wrappers keep raw model object in .module\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n torch.save(raw_model.state_dict(),\n self.config.epoch_save_path + str(epoch+1) + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/7a2a476d-7233-41a5-a4d2-eb6d22b8ced4.json b/docs/doc/7a2a476d-7233-41a5-a4d2-eb6d22b8ced4.json new file mode 100644 index 00000000..05cf0b6c --- /dev/null +++ b/docs/doc/7a2a476d-7233-41a5-a4d2-eb6d22b8ced4.json @@ -0,0 +1,30 @@ +{ + "summary": "The code imports libraries, defines a tokenizing class, utilizes Fermat's Little Theorem and Miller-Rabin primality test for prime number calculation, performs verification process to determine if a given number is prime, and returns True or False accordingly.", + "details": [ + { + "comment": "This code imports necessary libraries and defines a class for tokenizing text using either pre-trained tokenizers or custom word tables. 
It also records time usage for optimization purposes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/utils.py\":0-29", + "content": "import json, time, random, os\nimport numpy as np\nimport torch\nfrom torch.nn import functional as F\ntime_slot = {}\ntime_ref = time.time_ns()\ndef record_time(name):\n if name not in time_slot:\n time_slot[name] = 1e20\n tt = (time.time_ns() - time_ref) / 1e9\n if tt < time_slot[name]:\n time_slot[name] = tt\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n if 'list' in str(type(WORD_NAME)):\n self.charMode = False\n if WORD_NAME[0] == WORD_NAME[1]:\n from transformers import PreTrainedTokenizerFast\n self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=WORD_NAME[0])\n else:\n from transformers import GPT2TokenizerFast\n self.tokenizer = GPT2TokenizerFast(WORD_NAME[0], WORD_NAME[1])\n self.vocab_size = len(self.tokenizer)\n else:\n self.charMode = True\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:\n self.word_table = json.load(result_file)" + }, + { + "comment": "This code snippet is part of a model for text generation. 
The 'utils' class contains methods to refine the context by removing extra whitespace and unwanted characters, set vocabulary size based on the word table, map words to integers and vice versa, and sample logits to generate text using softmax function with option to specify temperature and top probabilities for specific characters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/utils.py\":31-59", + "content": " self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(out, dim=-1)\n if self.charMode:\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual\n else:" + }, + { + "comment": "This function calculates prime numbers using Fermat's Little Theorem and Miller-Rabin primality test. The prime number check is done only if the environment variable RWKV_RUN_DEVICE is set to \"cpu\". 
If not, it uses torch.multinomial() for selection.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/utils.py\":60-84", + "content": " top_p = top_p_usual\n if os.environ[\"RWKV_RUN_DEVICE\"] == \"cpu\":\n probs = probs.numpy()\n sorted_probs = np.sort(probs)[::-1]\n cumulative_probs = np.cumsum(sorted_probs)\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n probs = probs / np.sum(probs)\n out = np.random.choice(a=len(probs), p=probs)\n return out\n else:\n sorted_probs = torch.sort(probs, descending=True)[0]\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).cpu().numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n out = torch.multinomial(probs, num_samples=1)[0]\n return out\ndef MaybeIsPrime(number):\n if FermatPrimalityTest(number) and MillerRabinPrimalityTest(number):" + }, + { + "comment": "The code contains three functions: 1) FermatPrimalityTest checks if a number is prime using the Fermat Little Theorem; it returns True if number is greater than 1, and False otherwise. 2) MillerRabinPrimalityTest checks if a number is prime using the Miller-Rabin algorithm; it returns True for number=2, False for number=1 or even numbers, and starts by simplifying the odd part of the number. 3) The code uses a while loop to calculate the odd part of the number and determines the maximum iterations needed. 
The function then uses another loop to randomly select randomNumbers and test them for primality using the calculated parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/utils.py\":85-121", + "content": " return True\n else:\n return False\ndef FermatPrimalityTest(number):\n if number > 1:\n for time in range(3):\n randomNumber = random.randint(2, number) - 1\n if pow(randomNumber, number - 1, number) != 1:\n return False\n return True\n else:\n return False\ndef MillerRabinPrimalityTest(number):\n if number == 2:\n return True\n elif number == 1 or number % 2 == 0:\n return False\n oddPartOfNumber = number - 1\n timesTwoDividNumber = 0\n while oddPartOfNumber % 2 == 0:\n oddPartOfNumber = oddPartOfNumber // 2\n timesTwoDividNumber = timesTwoDividNumber + 1\n for time in range(3):\n while True:\n randomNumber = random.randint(2, number) - 1\n if randomNumber != 0 and randomNumber != 1:\n break\n randomNumberWithPower = pow(randomNumber, oddPartOfNumber, number)\n if (randomNumberWithPower != 1) and (randomNumberWithPower != number - 1):\n iterationNumber = 1" + }, + { + "comment": "This code performs a verification process where it checks if the given number can be determined as prime. It does this by iterating through a range of values, squaring a random number and calculating its modulo with the input number until either all iterations are completed or the value no longer matches the expected result. If the latter occurs, it returns False indicating that the number is not prime. 
Otherwise, it returns True.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/utils.py\":123-129", + "content": " while (iterationNumber <= timesTwoDividNumber - 1) and (randomNumberWithPower != number - 1):\n randomNumberWithPower = pow(randomNumberWithPower, 2, number)\n iterationNumber = iterationNumber + 1\n if randomNumberWithPower != (number - 1):\n return False\n return True" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8415154d-32e3-4f9c-935a-9cff25a6dd0c.json b/docs/doc/8415154d-32e3-4f9c-935a-9cff25a6dd0c.json new file mode 100644 index 00000000..cba8fbd9 --- /dev/null +++ b/docs/doc/8415154d-32e3-4f9c-935a-9cff25a6dd0c.json @@ -0,0 +1,30 @@ +{ + "summary": "The code snippet includes a tokenizer and class for binary file manipulation, indexing using RWKV language model, handling exceptions during decoding, verifying prime numbers, and displaying decoded results.", + "details": [ + { + "comment": "This code snippet imports various modules and defines a tokenizer object. It then provides instructions on how to use the make_data.py script for creating data from a JSONL file, including shuffling, duplicating, loading, tokenizing, saving, and computing magic prime for ctxlen 4096. 
The code is part of the RWKV language model's tokenization process.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/make_data.py\":0-33", + "content": "import json, math, random, sys, time, shutil, os, string, re, fileinput\nimport numpy as np\n\"\"\"\nHow to use:\npython make_data.py demo.jsonl 3 4096\nThis will:\n==> shuffle & duplicate demo.jsonl (for 3 epochs, good for finetuning) note: this will be very slow for large jsonl and we need more efficient code.\n==> load jsonl and tokenize\n==> save as demo.bin & demo.idx\n==> compute \"magic_prime\" for ctxlen 4096\nExample:\nAssume your source jsonl is:\n{\"text\":\"aa\"}\n{\"text\":\"bb\"}\n{\"text\":\"cc\"}\n{\"text\":\"dd\"}\nThe final binidx will be like (here \"/\" means end_of_doc, which is actually token [0]):\nbb/aa/dd/cc/dd/aa/bb/cc/dd/bb/cc/aa/\nwhere the data is repeated 3 times (each time with different shuffle)\n\"\"\"\n########################################################################################################\n# MMapIndexedDatasetBuilder\n########################################################################################################\nfrom tokenizer.rwkv_tokenizer import TRIE_TOKENIZER\ntokenizer = TRIE_TOKENIZER(\"tokenizer/rwkv_vocab_v20230424.txt\")" + }, + { + "comment": "This code defines a class `MMapIndexedDatasetBuilder` that allows adding items to a binary file and building an index. It uses the `MMapIndexedDataset` from `src.binidx`. The `add_item()` function appends arrays to the data file, `end_document()` marks document end, and `finalize()` writes the sizes of items in the data file and the document indices into an index file. 
It also checks if encoded raw is the same as original raw using tokenizer from rwkv.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/make_data.py\":34-62", + "content": "from src.binidx import MMapIndexedDataset\ndef index_file_path(prefix_path):\n return prefix_path + \".idx\"\ndef data_file_path(prefix_path):\n return prefix_path + \".bin\"\nclass MMapIndexedDatasetBuilder(object):\n def __init__(self, out_file, dtype=np.uint16):\n self._data_file = open(out_file, \"wb\")\n self._dtype = dtype\n self._sizes = []\n self._doc_idx = [0]\n def add_item(self, np_array):\n assert np_array.dtype == self._dtype\n self._data_file.write(np_array.tobytes(order=\"C\"))\n self._sizes.append(np_array.size)\n def end_document(self):\n self._doc_idx.append(len(self._sizes))\n def finalize(self, index_file):\n self._data_file.close()\n with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:\n index.write(self._sizes, self._doc_idx)\ncnt = 0\ndef add_raw(raw):\n global builder, cnt\n out = tokenizer.encode(raw)\n if tokenizer.decode(out) != raw:\n print(\"ERROR\" * 100)\n exit(0)\n out.append(0) # [0] = end_of_doc for rwkv tokenizer" + }, + { + "comment": "The code is loading a file, checking for non-empty lines, and then iterating over each epoch to generate data. The is_prime function checks if a number is prime or not. 
It writes the data to a temporary file before creating a final binary/index file with the specified length.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/make_data.py\":63-98", + "content": " builder.add_item(np.array(out, dtype=np.uint16))\n builder.end_document()\n if cnt % 500 == 0:\n print(cnt, end=\" \", flush=True)\n cnt += 1\ndef is_prime(n):\n if n <= 1:\n return False\n if n <= 3:\n return True\n if n % 2 == 0 or n % 3 == 0:\n return False\n i = 5\n while i * i <= n:\n if n % i == 0 or n % (i + 2) == 0:\n return False\n i += 6\n return True\n########################################################################################################\nN_EPOCH = int(sys.argv[2].strip())\nIN_FILE = sys.argv[1].strip()\nOUT_NAME = os.path.splitext(os.path.basename(IN_FILE))[0]\nCTX_LEN = int(sys.argv[3].strip())\nTEMP_FILE = \"make_data_temp.jsonl\"\nprint(f\"### Convert {IN_FILE} to {OUT_NAME}.bin/idx...\")\nwith open(IN_FILE, \"r\", encoding=\"utf-8\") as file:\n non_empty_lines = [line.strip() for line in file if line.strip()]\nprint(f\"### Found {len(non_empty_lines)} non-empty lines in {IN_FILE}\")\nfile = open(TEMP_FILE, \"w\", encoding=\"utf-8\")\nfor i in range(N_EPOCH):" + }, + { + "comment": "The code shuffles non-empty lines, writes them to a file, builds an indexed dataset from the temporary file, and finally verifies the result by checking if the last element is zero and displays a portion of the decoded data.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/make_data.py\":99-132", + "content": " print(f\"Shuffle: {i+1} out of {N_EPOCH}\")\n random.shuffle(non_empty_lines)\n for entry in non_empty_lines:\n file.write(entry + \"\\n\")\nfile.close()\n########################################################################################################\nprint(\"### Building binidx...\")\nbuilder = MMapIndexedDatasetBuilder(f\"{OUT_NAME}.bin\")\nwith fileinput.input(TEMP_FILE, encoding=\"utf-8\") as ffff:\n for 
line in ffff:\n x = json.loads(line)[\"text\"]\n add_raw(x)\nbuilder.finalize((f\"{OUT_NAME}.idx\"))\nprint(\"done\")\nprint(\"### Verifying result...\")\ndata = MMapIndexedDataset(OUT_NAME)\ndata_len = len(data)\ndata_size = len(data._bin_buffer) // data._index._dtype_size\nTODO = [0, data_len - 1]\nPREVIEW_LIMIT = 100\nfor idx in TODO:\n ptr, size = data._index[idx]\n dix = data.get(idx=idx, offset=0, length=size).astype(int)\n print(\"-\" * 70 + f\"[{OUT_NAME} idx {idx} sz {size}]\")\n assert dix[-1] == 0\n dix = dix[:-1]\n if len(dix) > PREVIEW_LIMIT:\n try:\n print(tokenizer.decode(dix[:PREVIEW_LIMIT]))" + }, + { + "comment": "Except blocks to handle potential exceptions when decoding dix with different lengths for preview, avoids UTF-8 bug. Else block to decode the entire dix. Checks if data size is greater than 3 times CTX_LEN and prints magic prime if it's a prime number within this condition.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/make_data.py\":133-157", + "content": " except:\n try:\n print(tokenizer.decode(dix[: PREVIEW_LIMIT + 1]))\n except:\n print(tokenizer.decode(dix[: PREVIEW_LIMIT + 2]))\n print(\"\u00b7 \" * 30)\n try: # avoid utf-8 bug\n print(tokenizer.decode(dix[-PREVIEW_LIMIT:]))\n except:\n try:\n print(tokenizer.decode(dix[-PREVIEW_LIMIT - 1 :]))\n except:\n print(tokenizer.decode(dix[-PREVIEW_LIMIT - 2 :]))\n else:\n print(tokenizer.decode(dix))\nprint(f\"{'-'*80}\\n### Final {OUT_NAME}.bin/idx has {data_size} tokens, {data_len} items. 
Dtype {data._index.dtype}\")\nif data_size >= CTX_LEN * 3:\n n_chunk = int(data_size // CTX_LEN) - 1\n for i in range(n_chunk, 0, -1):\n if i % 3 == 2:\n if is_prime(i):\n print(f\"\\n### magic_prime = {i} (for ctxlen {CTX_LEN})\\n\")\n exit(0)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/88995b6c-bc41-49ba-b77d-d2d682869425.json b/docs/doc/88995b6c-bc41-49ba-b77d-d2d682869425.json new file mode 100644 index 00000000..87d26697 --- /dev/null +++ b/docs/doc/88995b6c-bc41-49ba-b77d-d2d682869425.json @@ -0,0 +1,25 @@ +{ + "summary": "This code defines a Dataset class for the RWKV v2-RNN Language Model, creating token lists and storing them in \"vocab.json\". It provides functions to convert input data into tokens using a pre-defined vocabulary, supports random access to data with fixed epoch length, initializes a word table, sets vocabulary size, establishes dictionaries, refines context, samples logits based on input, sorts probabilities, breaks loop at top_p, calculates cutoff value and modifies probabilities, applies optional temperature parameter, and returns a sample using multinomial sampling.", + "details": [ + { + "comment": "This code defines a custom Dataset class for the RWKV v2-RNN Language Model. 
It builds a token list from input data, stores it in \"vocab.json\", and calculates the dataset size and vocabulary size.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/utils.py\":0-32", + "content": "########################################################################################################\n# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport json\nimport random\nimport time\nimport math\nimport numpy as np\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\nfrom torch.utils.data import Dataset\nclass Dataset(Dataset):\n def __init__(self, data, ctx_len, epoch_length_fixed):\n print('building token list...', end=' ')\n unique = sorted(list(set(data)))\n # print()\n # for u in unique:\n # print(u, end=' ')\n # print('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open('vocab.json', \"w\", encoding=\"utf-16\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n data_size, vocab_size = len(data), len(unique)" + }, + { + "comment": "This code defines a class for data processing and loading, with functions to convert input data into tokens using a pre-defined vocabulary. It also provides random access to the data in fixed epoch length. The TOKENIZER class is initialized with a WORD_NAME file path and an UNKNOWN_CHAR placeholder.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/utils.py\":33-58", + "content": " print('data has %d tokens, %d unique.' 
% (data_size, vocab_size))\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n self.ctx_len = ctx_len\n self.epoch_length_fixed = epoch_length_fixed\n self.vocab_size = vocab_size\n self.data = data\n def __len__(self):\n return self.epoch_length_fixed\n def __getitem__(self, idx):\n # cheat: pick a random spot in dataset\n i = np.random.randint(0, len(self.data) - (self.ctx_len + 1))\n chunk = self.data[i:i+self.ctx_len+1]\n dix = [self.stoi[s] for s in chunk]\n x = torch.tensor(dix[:-1], dtype=torch.long,\n device=torch.device('cuda'))\n y = torch.tensor(dix[1:], dtype=torch.long,\n device=torch.device('cuda'))\n return x, y\nclass TOKENIZER():\n def __init__(self, WORD_NAME, UNKNOWN_CHAR='\\ue083'):\n with open(WORD_NAME + '.json', \"r\", encoding=\"utf-16\") as result_file:" + }, + { + "comment": "The code initializes a word table, sets vocabulary size, establishes string-to-int and int-to-string dictionaries, defines an UNKNOWN_CHAR, refines context by stripping whitespace and special characters, and samples logits based on input while applying softmax function and considering different top_p values for newline characters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/utils.py\":59-89", + "content": " self.word_table = json.load(result_file)\n self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR]\n def refine_context(self, context):\n context = context.strip().split('\\n')\n for c in range(len(context)):\n context[c] = context[c].strip().strip('\\u3000').strip('\\r')\n context = list(filter(lambda c: c != '', context))\n context = '\\n' + ('\\n'.join(context)).strip()\n if context == '':\n context = '\\n'\n return context\n def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None):\n # 
out[self.UNKNOWN_CHAR] = -float('Inf')\n lastChar = int(x[-1])\n probs = F.softmax(torch.tensor(out), dim=-1)\n if self.itos[lastChar] == '\\n':\n top_p = top_p_newline\n else:\n top_p = top_p_usual" + }, + { + "comment": "Sorts probabilities and breaks loop when cumulative probability exceeds top_p. Calculates a cutoff value based on the sorted probabilities and sets low probabilities to 0. Optionally applies temperature parameter. Returns a single sample from the modified probabilities using multinomial sampling.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/src/utils.py\":91-121", + "content": " sorted_probs, s_index = torch.sort(probs, descending=True)\n # for j in range(30):\n # pp = sorted_probs[j].item()\n # if pp < 0.005:\n # break\n # ss = self.itos[int(s_index[j])].replace('\\n','_')\n # print(f'{math.floor(pp*100):>3.0f}{ss}', end='')\n # print('')\n cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy()\n cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)])\n probs[probs < cutoff] = 0\n # print(\"[\" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + \"]\", end = \"\")\n if temperature != 1.0:\n probs = probs.pow(1.0 / temperature)\n return torch.multinomial(probs, num_samples=1)[0]\ndef to_float(x):\n return x.cpu().detach().numpy().flatten()[0].astype(float)\ndef set_seed(seed):\n random.seed(seed)\n np.random.seed(seed)\n torch.manual_seed(seed)\n torch.cuda.manual_seed_all(seed)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/8b995a8e-48ca-463c-8022-f8971cf28e49.json b/docs/doc/8b995a8e-48ca-463c-8022-f8971cf28e49.json new file mode 100644 index 00000000..c6811cef --- /dev/null +++ b/docs/doc/8b995a8e-48ca-463c-8022-f8971cf28e49.json @@ -0,0 +1,1500 @@ +{ + "summary": "A Sydney restaurant attack leads to an arrest, while AI improves text readability and sentiment analysis. 
The Tenors face lawsuits over copyrighted content, CyberTrust merges for growth, and AI is applied in various fields. Code sorts dictionaries, translation tools align sentences, and HIV care models are needed in developing countries.", + "details": [ + { + "comment": "This code generates an article based on the given information about a man charged with murder and attempted murder after a woman and her date were stabbed at a restaurant in Sydney, Australia. The article includes details about the incident, suspect, and impact on the community.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":0-0", + "content": "{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Write an article based on this \\\"A man has been charged with murder and attempted murder after a woman and the man she was on a date with were stabbed at a restaurant in Sydney, Australia.\\\"\\n\\nAssistant: Title: Tragedy Strikes in Sydney: Victims Stabbed during Dinner Date, Suspect Charged with Murder and Attempted Murder\\nIntroduction\\nA seemingly ordinary evening took a tragic turn in Sydney, Australia when a romantic dinner date at a local restaurant ended in bloodshed, as a man allegedly attacked a couple, stabbing both victims. Local authorities have charged the suspect with murder and attempted murder in connection with the horrific crime. In this article, we delve into the chilling incident, the suspect, and the impact it has had on the community.\\nThe Incident\\nThe alarming event took place at a popular restaurant in Sydney, Australia. A woman and her date" + }, + { + "comment": "Code snippet depicts a tragic incident involving a couple attacked in a restaurant, with one dying and the other critically injured. The attacker had no known connections to the victims, leaving investigators baffled about motive. 
Police eventually identify and arrest the suspect.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":0-0", + "content": " were enjoying their meal \u2013 blissfully unaware of the tragic circumstances that would soon unfold. As they shared conversation and indulged in their dinner, a man approached the couple and, without warning, began to stab them with a sharp object.\\nPatrons at the restaurant scrambled for safety, as the assailant continued his violent rampage. Bystanders quickly called emergency services, who promptly arrived at the scene. Although both victims were rushed to a nearby hospital, the woman succumbed to her injuries. The man she was on a date with remained in critical condition, fighting for his life.\\nThe Suspect\\nInitial investigations revealed that the alleged attacker had no known connections to the victims \u2013 adding to the mystifying nature of this sudden and brutal assault. Witnesses reported that the assailant seemed to have no specific motive and appeared to carry out the act senselessly.\\nFollowing a thorough investigation, local police identified and arrested the suspect. During th" + }, + { + "comment": "The code describes a tragic incident involving an alleged attacker with a history of criminal behavior and troubled past, who has been charged with murder and attempted murder. The community is reeling from the event, the restaurant where it happened has closed, and there's discussion on reassessing security measures to reinforce public safety.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":0-0", + "content": "e process, it was discovered that the alleged attacker had a history of criminal behavior and a troubled past, though it is unclear if this played a role in the tragic incident.\\nAuthorities have formally charged the man with murder and attempted murder in connection with the heinous crime. 
He awaits a hearing to determine a trial date and, if convicted, could face a life sentence in prison.\\nThe Community's Response\\nThe shocking nature of the crime has left the residents of Sydney reeling, as they struggle to come to terms with the harrowing event. The restaurant where the attack occurred has since been closed, with a makeshift memorial being created outside to commemorate the victims.\\nMany have questioned how such a vicious crime could happen in what is considered to be one of the safest cities in the world. This tragic event has spurred local officials to reassess current security measures and devise strategies to reinforce public safety. Additionally, discussions surrounding ment" + }, + { + "comment": "This code is a JSON formatted text containing information about a tragic incident, including the community's response and determination to prevent similar incidents in the future. The passage expresses grief for the victims, support for their families, and solidarity in the face of senseless acts of violence that can happen anywhere.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":0-0", + "content": "al health and criminal rehabilitation have surfaced as residents seek to comprehend the actions of the alleged perpetrator and prevent similar incidents from occurring in the future.\\nIn the wake of the stabbing, the community has banded together with an outpouring of grief and support for the victims and their families. Candlelight vigils have been held, and an online fundraising campaign is underway to assist the surviving victim with his medical expenses and recovery.\\nConclusion\\nThe tragic attack in Sydney serves as a chilling reminder that senseless acts of violence can happen anywhere and at any time. The community's response to this horrific and seemingly random act of brutality has been one of solidarity and determination to prevent such incidents in the future. 
As the case unfolds, the victims and their families remain in the hearts of the community, who are grieving the devastating loss of a life cut tragically short and supporting the recovering victim as he continues to endure this unimaginable ordeal.\"}" + }, + { + "comment": "The code describes a background article on water stratification, its effects on nutrient mixing and primary production in an area. It then introduces a new situation involving High Lake and Low Lake with stratified water and poor nutrients.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":1-1", + "content": "{\"text\": \"User: Q:I read this background article the other day: Water stratification is when water masses with different properties - salinity (halocline), oxygenation (chemocline), density (pycnocline), temperature (thermocline) - form layers that act as barriers to water mixing which could lead to anoxia or euxinia.[1] These layers are normally arranged according to density, with the least dense water masses sitting above the more dense layers. Water stratification also creates barriers to nutrient mixing between layers. This can affect the primary production in an area by limiting photosynthetic processes. When nutrients from the benthos cannot travel up into the photic zone, phytoplankton may be limited by nutrient availability. Lower primary production also leads to lower net productivity in waters.[2] I am facing a new situation today: High lake had a lot of fresh and salt water mixing, therefore presented layers of stratified water.The water was poor in nutrients.Low lake had " + }, + { + "comment": "The code appears to be related to a question-answering task with information about lakes and their dissolved nutrient levels. 
The answer provided suggests that High Lake has fewer dissolved nutrients due to stratified water layers acting as a barrier for nutrient mixing, resulting in poorer nutrients compared to the other lake mentioned.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":1-2", + "content": "no such problems and it was a very abundant and prolific fishing spot. Using the knowledge I acquired from the background article, how should I answer correctly the following question regarding my new situation: Which lake had fewer dissolved nutrients?\\nA:\\n\\nAssistant: The lake with fewer dissolved nutrients would be High lake. This is because it had layers of stratified water, which acts as a barrier to nutrient mixing. As a result, it was poor in nutrients.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Summarize this article in one sentence.\\nYou'll find this in your Start menu. You can also press \u229e Win and type \\\"xbox.\\\" You can use the Xbox app to take a screenshot of any game running in Windows 10. You'll find this in the menu on the left side of the screen. This will display the DVR and screenshot settings. The Game DVR will need to be enabled in order to take screenshots with the app. Click the sli" + }, + { + "comment": "The code explains how to use the Xbox app in Windows 10 for taking screenshots of games by enabling the Game DVR and using built-in or custom shortcuts, saving them in the Captures folder, which can be relocated if desired.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":2-2", + "content": "der to toggle it on. When the Game DVR is enabled, the built-in shortcut is \u229e Win+Alt+PrtScn. You can click the empty field next to this and set your own custom shortcut if you want. This will open the folder that your screenshots will be saved in. 
If you want to change where your screenshots are saved, you'll need to move the Captures folder to that location. Screenshots will automatically save to the Captures folder, wherever it is. For example, to move it to your Pictures folder, move up one directory and then drag the Captures folder to the Pictures folder in your sidebar. The Xbox app does not need to be open to use the screenshot feature. It will be saved to the Captures folder that you may have moved earlier.\\n\\nAssistant: The article explains how to use the Xbox app in Windows 10 to take screenshots of games by enabling the Game DVR and using the built-in shortcut or a custom shortcut, with the screenshots saved in the Captures folder which can be moved to a different location if desired.\"}" + }, + { + "comment": "The code is a dialogue between an AI assistant and a user, where the user provides instructions to correct a sentence's capitalization. The AI completes the task by identifying errors in capitalization and applying corrections.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":3-4", + "content": "{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: what brought him into the field was the idea of louis de broglie that particles have a wave aspect. schr\u00f6dinger sought our de broglie's work and read it avidly.\\nRepeat this setence, but with the correct capitalization.\\n\\nAssistant: What brought him into the field was the idea of Louis de Broglie that particles have a wave aspect. Schr\u00f6dinger sought out de Broglie's work and read it avidly.\\n1. I capitalized the first letter of the first word of the sentence, \\\"What.\\\"\\n2. I capitalized the first letter of the proper name \\\"Louis\\\" and \\\"Broglie\\\" (last name).\\n3. 
I capitalized the first letter of the proper name \\\"Schr\u00f6dinger.\\\"\\n4. I corrected the typo \\\"our\\\" to \\\"out.\\\"\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the " + }, + { + "comment": "This code is tasked with adding spaces to an uninterrupted string of text, guided by rules of the English language and presumptions about formats. It's breaking down company details, separating individual words and address details, and considering bullet characters as separators.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":4-4", + "content": "task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Add spaces: ifmelectronicgmbh\u2022Friedrichstra\u00dfe1\u202245128Essen\u2014Wereservetherighttomaketechnicalalterationswithoutpriornotice.\u2014GB\u2014SV5200\u201403.06.2015\\n\\nAssistant: The task provided is to add spaces to a long, uninterrupted string of text to make it more readable. While determining where to place spaces, I'll be guided by the rules of the English language, as well as presumptions about the format of addresses, company names, dates, and technical/legal notes.\\n1. Breaking down company details:\\nFirst, I'm going to separate individual words and details related to the company and address. 
\\n- \\\"ifmelectronicgmbh\\\": This appears to be a company name, which should be \\\"ifm electronic gmbh\\\".\\n- \\\"\u2022Friedrichstra\u00dfe1\u202245128Essen\\\": This seems to be a street address in Germany (Friedrichstra\u00dfe 1, 45128 Essen).\\n- The bullet character (\u2022) can be considered as a separator.\\nAfter making these chang" + }, + { + "comment": "Analyzing text and separating segments, identifying reservation clause, product details, and date.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":4-4", + "content": "es, the phrase so far is: \\\"ifm electronic gmbh \u2022 Friedrichstra\u00dfe 1 \u2022 45128 Essen\\\".\\n2. Handling reservation clause:\\nNext, I notice the segment of text starting with \\\"Wereservetherighttomaketechnicalalterationswithoutpriornotice.\\\" This appears to be a standard non-liability clause. Let's break it down:\\n- \\\"Wereservetherighttomaketechnicalalterationswithoutpriornotice\\\": This can be separated into \\\"We reserve the right to make technical alterations without prior notice.\\\"\\nAfter adding these modifications, the sentence so far becomes \\\"ifm electronic gmbh \u2022 Friedrichstra\u00dfe 1 \u2022 45128 Essen - We reserve the right to make technical alterations without prior notice.\\\"\\n3. Identifying product details and date:\\nTo wrap up, I'll divide the final details which seem to be related to a product model number and the date:\\n- \\\"\u2014GB\u2014SV5200\u201403.06.2015\\\": Here, I can establish that \\\"GB\\\" and \\\"SV5200\\\" are likely separate entities (looks like product identifiers or codes), and \\\"03.06.2015\\\" is" + }, + { + "comment": "The code is demonstrating how to split a text into more readable sentences and phrases while preserving the meaning. It explains that this method helps improve clarity for readers by breaking down the original text into smaller, more digestible segments. 
The code also provides an example of identifying the sentiment in a movie review sentence, showing that it's positive as people enjoy watching the movie repeatedly during Christmas time.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":4-6", + "content": " probably a date.\\nTherefore, this segment becomes: \\\"- GB - SV5200 - 03.06.2015.\\\"\\nSo, the final version of the separated text is: \\\"ifm electronic gmbh \u2022 Friedrichstra\u00dfe 1 \u2022 45128 Essen - We reserve the right to make technical alterations without prior notice. - GB - SV5200 - 03.06.2015.\\\"\\nThis step-wise breakdown splits the original text into more readable sentences and phrases while maintaining their meanings and serving the reader's clarity.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: What is the sentiment of the following movie review sentence?\\na christmas perennial\\n\\nAssistant: A \\\"Christmas perennial\\\" means a movie that is enjoyed every Christmas and comes back like a tradition. So, the sentiment of this movie review sentence is positive because it suggests that people like to watch this movie again and again during Christmas time.\"}\n{\"text\": \"User: What type of details about the t" + }, + { + "comment": "This code snippet appears to be a section of text from a biography of the vocal quartet, \"The Tenors,\" previously known as \"The Canadian Tenors.\" It provides information about the members' names, origin, musical style, notable performances, and venues.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":6-6", + "content": "enors can be gathered from the following bio? Bio: the tenors -lrb- formerly known as the canadian tenors -rrb- are a vocal quartet consisting of remigio pereira , victor micallef , fraser walters , and clifton murray . 
they perform operatic pop music that is a mixture of classical and pop , featuring songs such as `` the prayer '' and `` panis angelicus '' from the former genre , and leonard cohen 's `` hallelujah '' from the latter . originating from canada , pereira from ottawa/gatineau , micallef from toronto , walters from vancouver and murray from port mcneil , the tenors have performed on more than 60 international television programs . they appeared on the oprah winfrey show with celine dion in 2010 , at the 63rd primetime emmy awards , at the opening ceremonies of 2010 winter olympics in vancouver and on itv -lrb- tv network -rrb- diamond jubilee show at windsor castle for hm queen elizabeth ii . the venues they have performed in include the tel aviv opera house , and the ma" + }, + { + "comment": "This code describes the history and achievements of a musical group, mentioning their performances, shared stages with famous artists, album releases, and awards.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":6-6", + "content": "ndalay bay events center in las vegas . they also appeared on the 2009 grey cup broadcast and on cbc television 's `` holiday festival of ice '' . they have shared the stage with sarah mclachlan , neil young , paul anka , justin bieber , paul mccartney , brian mcknight , david foster , and natalie cole . universal music group released their platinum-selling self-titled debut album and double-platinum holiday album , the perfect gift . the tenors filmed a pbs special at the smith center for the performing arts that began airing in 2012 . the canadian tenors were renamed the tenors in november 2012 in advance of their third album lead with your heart . also in 2012 , the tenors signed a us distribution deal with verve records . the tenors latest release lead with your heart went platinum in canada in seven weeks . 
the album garnered a 2013 juno award for adult contemporary album of the year and a juno award nomination for jack richardson for producer of the year for bob ezrin 's producti" + }, + { + "comment": "Code snippet is providing information about The Tenors, a group formerly known as The Canadian Tenors. They perform operatic pop music and have performed on international television programs and shared stage with various artists.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":6-6", + "content": "on of the title track `` forever young . '' lead with your heart was featured on katie , good...\\nThe answer to this question is:\\n\\nAssistant: From the bio, we can gather that The Tenors:\\n1. Were formerly known as The Canadian Tenors.\\n2. Consist of Remigio Pereira, Victor Micallef, Fraser Walters, and Clifton Murray.\\n3. Perform operatic pop music, a mixture of classical and pop.\\n4. Have songs such as \\\"The Prayer,\\\" \\\"Panis Angelicus,\\\" and Leonard Cohen's \\\"Hallelujah\\\" in their repertoire.\\n5. Originated from Canada, with members from Ottawa/Gatineau, Toronto, Vancouver, and Port McNeil.\\n6. Have performed on more than 60 international television programs, including The Oprah Winfrey Show, the 63rd Primetime Emmy Awards, and the 2010 Winter Olympics opening ceremonies.\\n7. Performed at venues such as the Tel Aviv Opera House and Mandalay Bay Events Center in Las Vegas.\\n8. Have shared the stage with artists like Sarah McLachlan, Neil Young, Paul Anka, Justin Bieber, Paul McCartn" + }, + { + "comment": "This code contains information about an unnamed group of artists, discussing their accomplishments and name changes. The most logical next event would be related to their future plans or projects. 
Other answers mentioning YouTube copyright issues are unrelated to this topic.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":6-7", + "content": "ey, Brian McKnight, David Foster, and Natalie Cole.\\n9. Released platinum-selling self-titled debut album, a double-platinum holiday album called The Perfect Gift, and their third album, Lead With Your Heart.\\n10. Filmed a PBS special at the Smith Center for the Performing Arts.\\n11. Renamed themselves from The Canadian Tenors to The Tenors in November 2012.\\n12. Signed a US distribution deal with Verve Records in 2012.\\n13. Received a 2013 Juno Award for Adult Contemporary Album of the Year and a Juno Award nomination for Jack Richardson Producer of the Year for their album Lead With Your Heart.\"}\n{\"text\": \"System: You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. Think like you are answering to a five year old.\\n\\nUser: What is the most logical next event?\\nHow to unblock copyright infringement on youtube \\nUnderstand why you may have received a content id clai" + }, + { + "comment": "Content id system identifies potential copyrighted content in videos by scanning for audio, video, and images. If a match occurs, original owner is notified, and they can take action. Copyright infringement lawsuits can be lengthy.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":7-7", + "content": "m. \\nContent id is a system that identifies potential copyrighted content in videos by scanning uploaded videos for previously-uploaded content. The system will scan for audio, video, and images.\\nChoose your answer from:\\na). Answer as many questions as you can. If you are currently having trouble out of confusion or have not answered a question assigned by your user, then this is a great way to check for copyright infringement.\\nb). 
If a match occurs, the original owner is notified and a content id claim is filed. The original owner can choose to do nothing, mute their audio in your video, block the video from being viewed, monetize the video, or track the video's viewership.\\nc). Figure out whether you have used the content id app. Copyright-infringement lawsuits can take up to a few years to fully prove that you use the content id app in your videos.\\nd). If you have uploaded video videos to youtube, it will scan for audio, video, and images in your channel. You may receive your co" + }, + { + "comment": "This code describes the correct answer for handling a content id claim on YouTube, which involves the original owner's decision after a match is found. Answer options a), c), and d) are incorrect as they discuss unrelated topics or steps in the process.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":7-7", + "content": "ntent id when you sign up for a video profile.\\n\\nAssistant: b). If a match occurs, the original owner is notified and a content id claim is filed. The original owner can choose to do nothing, mute their audio in your video, block the video from being viewed, monetize the video, or track the video's viewership.\\nThis is the correct answer because it describes the next logical event in the process of handling a content id claim on YouTube. 
When a match is found, the original owner gets to decide what to do with the claimed content.\\na) is wrong because it talks about answering questions and checking for copyright infringement, which is not directly related to the next step in unblocking copyright infringement on YouTube.\\nc) is incorrect because it discusses proving the use of the content id app in lawsuits, which is not the next step in dealing with a content id claim on YouTube.\\nd) is wrong because it talks about uploading videos to YouTube and receiving content id, but not about the next step in resolving a content id claim.\"}" + }, + { + "comment": "This code appears to be a JSON object containing a Q&A session between an AI assistant and a user. The user is given options to answer \"Yes\" or \"No\", followed by a question asking if the product review provided is negative. The assistant responds with \"Yes, this product review is negative\" due to the reviewer's disappointment with the giant inscription on the first page of the book and its absence from the product description.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":8-8", + "content": "{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: OPTIONS:\\n[i] Yes;\\n[ii] No;\\nGiven those answer options, answer the question: Title: Good book with a bad description of the product Review: Although this looks to be a great book, I was disappointed to see the giant inscription on the first page of the book. Because this book was for a gift, noting this inscription on the product description would have been very helpful Is this product review negative?\\nA:\\n\\nAssistant: Yes, this product review is negative. The reviewer expresses disappointment with the product due to the giant inscription on the first page, which was not mentioned in the product description. 
They also mention that the book was intended as a gift, which further explains their dissatisfaction with the unexpected inscription. Although the reviewer acknowledges that the book might be great, their overall experience with the product seems to be negative due to this issue.\"}" + }, + { + "comment": "This code snippet is likely a part of an article or report about Iran's oil production and the impact of sanctions on their revenue. The focus seems to be on the recent six-month agreement over Tehran's nuclear program, and its potential to open new relations with Iran while highlighting the challenges faced by the country's oil sector.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":9-9", + "content": "{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Write some highlights for the following article:\\nThe historic six-month agreement over Tehran's nuclear program may begin a new era of relations with Iran, but it will be a long road back for the country's most vital sector, oil. Iran produces about two and half million barrels a day -- far off its 4-million-barrel-per-day peak a decade ago. Output is hovering at a level last seen at the end of Iran's war with Iraq. With North Sea Brent crude averaging over $100 a barrel for a record three years running, the sanctions on energy alone are costing Tehran about $50 billion in lost annual revenue. Despite the deal breakthrough, U.S. Secretary of State John Kerry said most of the sanctions will stick as the world gauges the intentions of this relatively new administration in Tehra" + }, + { + "comment": "The text describes the impact of oil sanctions on Iran's revenue and highlights that if transparency is achieved, it can expect more in return. 
The passage also mentions the ongoing tension around the Strait of Hormuz, the risk premium on global energy prices, and the reduction in energy imports by Iran's top customers due to US and European pressure.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":9-9", + "content": "n. \\\"During the six month phase, the oil sanctions that will remain in place will continue to cause over $25 billion in lost revenues to Iran or over $4 billion a month,\\\" he said. The message is clear: the pressure remains, but if all goes well, in a half year's time Iran can expect more in return for transparency. The last few years have been filled with uncertainty. Tensions around the Strait of Hormuz with the on-and-off threats by the previous government of Mahmoud Ahmadinejad to block oil shipments in the Gulf have kept what strategists call a 10-15% risk premium on global energy prices. Iran sits on about 9% of the world's proven oil reserves, claiming a few years back that it has nearly 150 billion barrels and the world's largest gas field. But its top four customers --China, India, Japan and South Korea -- have all had to cut back their energy imports by a third or more in the past few years due to U.S. and European pressure. With every year that has passed, the screws have b" + }, + { + "comment": "Iran's economy faced significant challenges due to sanctions, including currency devaluation, hyper-inflation, and massive job losses. The initial agreement aimed to alleviate this uncertainty in the oil market.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":9-9", + "content": "een tightened by Washington and the countries of the European Union. It was not only sanctions against oil, but also blocking Iran's ability to secure shipping insurance and to trade in U.S. dollars and euros. 
That economic isolation, many Middle East strategists I have spoken with suggest, is what brought Iran's new government to the negotiating table. In 2012, the Iranian rial plunged by up to 80%. Basic staples of life, poultry and bread skyrocketed in price and the economy went through a period of hyper-inflation. With rising import prices due to a plummeting currency, industrial Iran is no longer able to compete. There are reports that the state manufacturing sector had laid off up to 800,000 workers in 2012 and those who have kept their jobs saw their wages eroded by skyrocketing prices. Oil executives with experience in the country say this initial agreement would help lift a cloud of uncertainty over the oil market, but that President Hassan Rouhani and his cabinet have to " + }, + { + "comment": "This code appears to discuss a news article or statement about the potential for increased investment and cooperation in Iran if sanctions are lifted, citing concerns from French energy group Total's CEO, Christophe de Margerie, regarding creating a better climate for investment.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":9-9", + "content": "establish much better contract terms if this honeymoon period lifts sanctions. The blunt-speaking Chief Executive of French energy group Total, Christophe de Margerie, told me at an energy conference in Abu Dhabi this month that Iran needs to try and create a better climate for investment if this weekend's breakthrough is sustained. \\\"I can tell you with the experience we have from Iran it doesn't always bring as we say in French partie de plaisir ... a win-win.\\\" Oil giant Saudi Arabia has expressed doubts about signing a deal with Tehran, which will introduce more challenges within OPEC. Iraq plans to double production by 2020 to six million barrels a day and with Iran wanting to rebuild exports, the Kingdom may have trim its own production to defend prices. 
It is still early days, but this country of nearly 80 million people has been described as potentially being the Germany of the Middle East with plenty of natural resources -- that is, if it can emerge from years of economic iso" + }, + { + "comment": "Code describes the impact of historic six-month agreement on Iran's nuclear program, highlighting challenges in oil sector recovery due to sanctions and reduced production levels.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":9-9", + "content": "lation.\\nHighlights:\\n\\nAssistant: 1. Historic six-month agreement over Tehran's nuclear program presents an opportunity for Iran's oil sector recovery, but the journey will be long and difficult.\\n2. Iran's oil production is currently around 2.5 million barrels per day, significantly less than its peak of 4 million barrels per day a decade ago.\\n3. Sanctions on energy alone are costing Tehran approximately $50 billion in lost annual revenue.\\n4. U.S. Secretary of State John Kerry said most sanctions will remain in place during the six-month phase, causing Iran to lose over $25 billion in revenues.\\n5. Iran's top four oil customers, China, India, Japan, and South Korea, have had to cut back their imports by a third or more due to U.S. and European pressure.\\n6. Iran's economy suffered greatly during the period of sanctions, with rampant inflation and industrial layoffs.\\n7. Establishing better contract terms is crucial for Iran to lift sanctions and improve the investment climate.\\n8. " + }, + { + "comment": "Saudi Arabia expresses doubts about signing a deal with Tehran, posing challenges for Iran's oil output growth within OPEC.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":9-10", + "content": "Oil giant Saudi Arabia has doubts about signing a deal with Tehran, introducing challenges within OPEC for Iran's oil output growth.\"}\n{\"text\": \"System: You are an AI assistant. 
You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Given the question: Read the bio below and try to give details on aidan somers 's: - sport - county - icposition - province - icupdate - birth date - allstars - clprovince - code - name - birth place - icprovince - nhl - counties - icyears - irish - icallireland - clcounty - club Bio: aidan somers -lrb- born 1946 -rrb- is an irish retired hurler who played as a right corner-back for the wexford senior team . born in rathnure , county wexford , somers first excelled at hurling in his youth . he arrived on the inter-county scene at the age of seventeen when he first linked up with the wexford minor team , before later joining the under-21 side . he joined" + }, + { + "comment": "Aidan Somers is a retired Irish hurler, born in 1946 from County Wexford. He played for one season during the 1970 championship and has achieved numerous accolades, including winning one Leinster medal on field play and three Leinster medals with Rathnure club.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":10-10", + "content": " the senior panel during the 1970 championship and played for just one season . somers won one leinster medal on the field of play . at club level somers is a three-time leinster medallist with rathnure . in addition to this he also won eight championship medals .\\nThe answer is:\\n\\nAssistant: Aidan Somers is a retired Irish hurler who has had an accomplished career in the sport. 
Below are the details based on the provided bio:\\n- Sport: Hurling\\n- County: Wexford\\n- ICPosition: Right corner-back\\n- Province: Leinster (Ireland)\\n- ICUpdate: Somers joined the senior panel during the 1970 championship and played for just one season.\\n- Birth Date: 1946\\n- AllStars: Not mentioned in the bio\\n- CLProvince: Leinster (won three Leinster medals with Rathnure club)\\n- Code: Not mentioned in the bio\\n- Name: Aidan Somers\\n- Birth Place: Rathnure, County Wexford, Ireland\\n- ICProvince: Leinster (won one Leinster medal on the field of play)\\n- NHL: Not mentioned in the bio\\n- Counties: Wexford\\n-" + }, + { + "comment": "Aidan Somers, Irish hurler born 1946 in County Wexford, joined minor team at age 17, played for senior panel in 1970 championship as right corner-back, achieved county and club successes", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":10-10", + "content": " ICYears: Started at the age of seventeen with Wexford minor team and later joined the under-21 side before joining the senior panel in the 1970 championship (played for just one season)\\n- Irish: Yes, Aidan Somers is of Irish nationality\\n- ICAllIreland: Not mentioned in the bio\\n- CLCounty: Wexford (won eight championship medals with Rathnure club)\\n- Club: Rathnure\\nAidan Somers was born in 1946 in Rathnure, County Wexford. He initially demonstrated his hurling talent while he was still young. He joined the inter-county scene at the age of seventeen, starting with the Wexford minor team before later progressing to the under-21 squad. Somers eventually became a member of the senior panel during the 1970 championship, where he played for only one season as a right corner-back.\\nThroughout his career, Somers achieved notable successes both at the county and club levels. With the Wexford senior team, he won one Leinster medal on the field. 
As a member of the Rathnure club, he secured th" + }, + { + "comment": "This code seems to be related to regulation information and manure classification, specifically referring to dried farmyard manure, dehydrated poultry manure, composted animal excrements, etc. The content appears to be in a structured format with identifiers for various elements and a translation request to Czech language.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":10-11", + "content": "ree Leinster medals and eight championship medals, demonstrating his exceptional skills and dedication to the sport of hurling.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: No 2328/91 (1), as last amended by Regulation (EC) No 3669/93 (2)\\\"> ID=\\\"1\\\">- Dried farmyard manure and dehydrated poultry manure> ID=\\\"2\\\">Need recognized by the inspection body or inspection authority\\\"> ID=\\\"2\\\">Indication of animal species\\\"> ID=\\\"2\\\">Coming from extensive husbandry and only in the sense of Article 6 (4) of Regulation (EEC) No 2328/91\\\"> ID=\\\"1\\\">- Composted animal excrements, including poultry manure and composted farmyard manure included> ID=\\\"2\\\">Need recognized by the inspection body or inspection authority\\\"> ID=\\\"2\\\">Indication of the animal species\\\">\\nTranslate to Czech\\nCzech:\\n\\nAssistant: \u010cl\u00e1nek \u010d. 2328/91 (1), naposledy zm\u011bn\u011bn na\u0159\u00edzen\u00edm (ES) \u010d. 
3669/93 (2)\\\"> ID=\\\"1\\\">- Su\u0161en\u00fd hnoj ze dvora a dehydratov" + }, + { + "comment": "This code refers to the use of dried farmyard manure and dehydrated poultry droppings, mentioning their approval requirements by inspection authorities and the origin from extensive farming.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":11-11", + "content": "an\u00fd dr\u016fbe\u017e\u00ed trus> ID=\\\"2\\\">Pot\u0159eba uzn\u00e1na inspek\u010dn\u00edm org\u00e1nem nebo inspek\u010dn\u00edm \u00fa\u0159adem\\\"> ID=\\\"2\\\">Ozna\u010den\u00ed druhu zv\u00ed\u0159at\\\"> ID=\\\"2\\\">Poch\u00e1zej\u00edc\u00ed z extenzivn\u00edho chovu a pouze ve smyslu \u010dl\u00e1nku 6 (4) na\u0159\u00edzen\u00ed (EHS) \u010d. 2328/91\\\"> ID=\\\"1\\\">- Kompostovan\u00e9 \u017eivo\u010di\u0161n\u00e9 exkrementy, v\u010detn\u011b dr\u016fbe\u017e\u00edho trusu a kompostovan\u00e9ho dvorsk\u00e9ho hnoje> ID=\\\"2\\\">Pot\u0159eba uzn\u00e1na inspek\u010dn\u00edm org\u00e1nem nebo inspek\u010dn\u00edm \u00fa\u0159adem\\\"> ID=\\\"2\\\">Ozna\u010den\u00ed druhu zv\u00ed\u0159at\\\">\\n\u010cl\u00e1nek \u010d. 2328/91 (1), jak byl naposledy zm\u011bn\u011bn na\u0159\u00edzen\u00edm (ES) \u010d. 3669/93 (2) uv\u00e1d\u00ed n\u00e1sleduj\u00edc\u00ed:\\n- Su\u0161en\u00fd hnoj ze dvora a dehydratovan\u00fd dr\u016fbe\u017e\u00ed trus:\\nPou\u017eit\u00ed t\u011bchto materi\u00e1l\u016f mus\u00ed b\u00fdt uzn\u00e1no inspek\u010dn\u00edm org\u00e1nem nebo inspek\u010dn\u00edm \u00fa\u0159adem. Je nutn\u00e9 uv\u00e9st druh zv\u00ed\u0159at, ze kter\u00e9ho hnoj poch\u00e1z\u00ed. Materi\u00e1l m\u016f\u017ee b\u00fdt pou\u017eit pouze v p\u0159\u00edpad\u011b, \u017ee poch\u00e1z\u00ed z extenzivn\u00edho chovu a pou\u017eit\u00ed je v souladu s \u010dl\u00e1nkem 6 (4) na\u0159\u00edzen\u00ed (EHS) \u010d. 
2328/91.\\n- Kompostovan\u00e9 \u017eivo\u010di\u0161n\u00e9 exkrementy, v\u010detn\u011b dr\u016fbe\u017e\u00edho trusu a kompostovan\u00e9ho dvorsk\u00e9ho hnoje:\\nTak\u00e9 pro pou\u017eit\u00ed t\u011bchto materi\u00e1l\u016f je nutn\u00e9 z\u00ed" + }, + { + "comment": "This code appears to be part of a translator task, where the AI is given a text in Romanian and needs to translate it into English. It does so by identifying the language, understanding the words and phrases in the original text, and then translating each component before presenting the translated text.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":11-12", + "content": "skat uzn\u00e1n\u00ed inspek\u010dn\u00edm org\u00e1nem nebo inspek\u010dn\u00edm \u00fa\u0159adem. Je t\u0159eba uv\u00e9st druh zv\u00ed\u0159at, z kter\u00e9ho exkrementy poch\u00e1z\u00ed.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: (ES) Domnule pre\u0219edinte, mul\u021bumesc mult, domnule comisar, pentru r\u0103spunsul dvs. care va fi foarte util pentru summitul Uniunea European\u0103-Africa.\\nTranslate this to English?\\n\\nAssistant: Step 1: Identify the language of the text. In this case, the language is Romanian.\\nStep 2: Understand the words and phrases in this Romanian text. \\nDomnule pre\u0219edinte - Mr. President\\nmul\u021bumesc mult - thank you very much\\ndomnule comisar - Mr. Commissioner\\npentru - for\\nr\u0103spunsul - the response\\ndvs - your\\ncare - which\\nva fi - will be\\nfoarte util - very useful\\npentru - for\\nsummitul - the summit\\nUniunea European\u0103-Africa - European Union-Africa\\nStep 3: Translate the text int" + }, + { + "comment": "The code contains two different translations, one for English and the other for generating an unlikely statement. 
The English translation is focused on maintaining the original meaning while rearranging words to make sense in English. The unexpected statement generation involves creating a completely unrelated topic to the initial context.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":12-13", + "content": "o English based on the understanding of the words and phrases.\\n\\\"Mr. President, thank you very much, Mr. Commissioner, for your response which will be very useful for the European Union-Africa summit.\\\"\\nIn this translation, I have arranged the words and phrases in an order that makes sense in English while maintaining the intended meaning of the text.\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: Instructions: Given a sentence, generate a most unlikely next statement. The next statement should not be logically relevant to the given context\\nInput: Someone's platoon is attacked. A bullet\\nOutput:\\n\\nAssistant: The kitchen sink needs to be replaced.\\nIn order to come up with an unlikely next statement, I focused on creating a statement that is not logically connected to the original sentence. The original sentence involves a dangerous attack situation, so I introduced a mundane, unrelated topic (kitchen sink) to make the statement irrelevant to the initial context.\"}" + }, + { + "comment": "This code demonstrates a conversation between a user and an AI assistant, where the user asks questions related to articles and the AI provides explanations. The AI is trained to explain things like it's talking to a five-year-old. The storage location contains two JSON objects representing different chunks of conversation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":14-15", + "content": "{\"text\": \"System: You are a helpful assistant, who always provide explanation. 
Think like you are answering to a five year old.\\n\\nUser: What is the title of this article:\\nThe second decline was not as complete as the first. Thousands of people had learned to play the instrument. Even as the second wave of mandolin popularity declined in the early 20th century, new versions of the mandolin began to be used in new forms of music. Luthiers created the resonator mandolin, the flatback mandolin, the carved-top or arched-top mandolin, the mandolin-banjo and the electric mandolin. Musicians began playing it in Celtic, Bluegrass, Jazz and Rock-n-Roll styles \u2014 and Classical too.\\n\\nAssistant: The title of this article could be: \\\"The Evolution and Continued Popularity of the Mandolin in Music.\\\"\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: What is the answer to this question? who wrote the night they " + }, + { + "comment": "This code appears to be part of a conversation between an AI assistant and a user. The assistant is providing information about the song \"The Night They Drove Ole Dixie Down\" and the music group \"The Band\", as well as introducing the story plot involving a boy named Kazuma Azuma on a quest to create a national bread for Japan called \"Ja-pan\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":15-16", + "content": "drove ole dixie down?\\n\\nAssistant: The song \\\"The Night They Drove Ole Dixie Down\\\" was written by a man named Robbie Robertson. He was part of a music group called \\\"The Band\\\". It's like when you and your friends make up a fun song together, Robbie did the same with his friends in \\\"The Band\\\".\"}\n{\"text\": \"System: You are an AI assistant. 
Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Please answer the following question: Information: - Japan (\\\"Nippon\\\" or \\\"Nihon\\\" ; formally \\\"\\\" or \\\"Nihon-koku\\\", means \\\"State of Japan\\\") is a sovereign island nation in Eastern Asia. Located in the Pacific Ocean, It is lying off the eastern coast of the Asia Mainland (east of China, Korea, Russia) and stretching from the Sea of Okhotsk in the north to the East China Sea and near Taiwan in the southwest. - Plot. The story focuses on Kazuma Azuma, a boy on his quest to create \\\"\\\"Ja-pan\\\"\\\", a national bread for Japan itself. He initially heads" + }, + { + "comment": "This code describes the background and history of a manga series, its author, and adaptation into an anime series. It mentions the manga's serialization in \"Weekly Shonen Jump\" magazine, the creator's illness and passing, and the subsequent release of uncompleted works. The original series had 220 episodes and is followed by a sequel.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":16-16", + "content": " to Tokyo to expand his horizons at the bread-making chain Pantasia; the story continues with Azuma's exploits with his other coworkers. - Utatane is the first album from the Japanese duo RYTHEM, released in 2004. - In January 2009, the series went on hiatus due to Tobe's illness and remained uncompleted when she died in January 2010. Her works during her illness were collected and published with some of her earlier works as volume 15 in June 2010. - \\\"Naruto\\\" was serialized in \\\"Weekly Shnen Jump\\\" magazine from the 43rd issue of 1999 to the 50th issue of 2014, with the chapters collected into seventy-two \\\"tankbon\\\" volumes by Shueisha. The manga was later adapted into a television anime, which was produced by Studio Pierrot and Aniplex. 
It premiered across Japan on the terrestrial TV Tokyo and other TX Network stations on October 3, 2002. The first series lasted 220 episodes, while \\\"\\\", a sequel to the original series, has been airing since February 15, 2007. In addition to th" + }, + { + "comment": "The code describes Studio Pierrot's involvement in anime series and movie development, as well as merchandise production from different companies. It also provides information on Rythem's musical journey, including their second album \"Mugen Factory (Infinite Factory)\", Sony Music Audition participation, and the use of their first single \"Harmonia\" as an ending theme for the TV TOKYO anime series \"Naruto\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":16-16", + "content": "e anime series, Studio Pierrot has developed eleven movies and several original video animations. Other types of merchandise include light novels, video games, and trading cards developed by several companies. - Mugen Factory ( , Infinite Factory ) is the second album from the Japanese duo Rythem . It was released on May 24 , 2006 under Sony Music Entertainment Japan . The album contains songs from their 6th up to 10th single . The item 's stock number is AICL - 1746 . - History. Rythem's talents as singer/songwriters, with flowing melodies and harmonies, began to shine as they worked with their own original tune \\\"Hitoritabi Shararuran\\\" in the Sony Music Audition. In May 2003, while they were still high school seniors, Rythem's first single \\\"Harmonia\\\" was released and was used as the second ending theme song for hit TV TOKYO anime series, \\\"Naruto\\\". Following the successful release of their debut single, 4 more singles including the theme song for \\\"With the Light\\\", and the de" + }, + { + "comment": "This code provides information about the relationship between 'Mugen Factory' and 'Utatane', both albums from the Japanese duo RYTHEM. 
It mentions that 'Utatane' was released in 2004, while 'Mugen Factory' came out on May 24, 2006. Both albums are under Sony Music Entertainment Japan and showcase RYTHEM's musical talents with unique melodies and harmonies.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":16-17", + "content": "but album, \\\"Utatane\\\" were released before \\\"Hki Gumo\\\" was released in early 2005 as a single for the first opening theme of yet another hit TV TOKYO anime series, \\\"Yakitate!! Japan\\\". What is the relationship between 'mugen factory' and 'utatane'?\\nA:\\n\\nAssistant: The relationship between 'Mugen Factory' and 'Utatane' is that they are both albums from the Japanese duo RYTHEM. 'Utatane' is their first album, released in 2004, while 'Mugen Factory' is their second album, released on May 24, 2006, under Sony Music Entertainment Japan. Both albums showcase RYTHEM's musical talents, featuring their unique melodies and harmonies in various songs.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Here's a complex question that requires someone to reason about the input, can you answer it? Gerard Brantsen was one of the Dutch signatories to the treaty ending a war that began in what year?\\nThe answer to this que" + }, + { + "comment": "Gerard Brantsen was a Dutch diplomat and politician in the 18th-19th century, who played a significant role in the Treaty of Paris in 1783. This treaty marked the end of the Fourth Anglo-Dutch War between the Kingdom of Great Britain and the Dutch Republic. 
The war was primarily driven by commercial rivalries and political tensions related to the American Revolutionary War, with the Dutch seeking to support the United States of America.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":17-17", + "content": "stion is:\\n\\nAssistant: To answer this complex question, we must first examine the historical figure Gerard Brantsen and identify which war was ended with the treaty he signed. \\nGerard Brantsen (1735-1809) was a Dutch diplomat and politician in the 18th-19th century. His most significant contribution was to the Treaty of Paris in 1783, which marked the end of the Fourth Anglo-Dutch War.\\nThe Fourth Anglo-Dutch War was a conflict between the Kingdom of Great Britain and the Dutch Republic, and commenced on December 20, 1780. The war was primarily driven by commercial rivalries and political tensions that emerged from the American Revolutionary War. At this time, the Dutch Republic, in an attempt to protect their trade interests, sought to provide military assistance and resources to the newly established United States of America. This support of the rebellious colonies angered Great Britain and consequently led to the escalation of hostilities between the two European powers.\\nThe Trea" + }, + { + "comment": "This code snippet describes the Treaty of Paris, which ended the Fourth Anglo-Dutch War on September 3, 1783. The treaty was negotiated by delegates from the United States, France, Spain, and the Dutch Republic to reestablish diplomatic relations and resolve territorial disputes. Gerard Brantsen played a significant role in the negotiations and was one of the Dutch signatories to the treaty. The war began on December 20, 1780.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":17-18", + "content": "ty of Paris, which ended the Fourth Anglo-Dutch War, was signed on September 3, 1783. 
The negotiations brought together the major powers involved in the American Revolutionary War. Delegates from the United States, France, Spain, and the Dutch Republic were present to deliberate the terms for reestablishing diplomatic relations and resolving territorial disputes. Gerard Brantsen, acting as one of the Dutch envoys, played a significant role in the negotiation process.\\nIn conclusion, Gerard Brantsen was one of the Dutch signatories to the Treaty of Paris, which terminated the Fourth Anglo-Dutch War. This war began on December 20, 1780.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: This is the content of an email: Julie Markantonis has her ladder over at my house (908 Bomar). Not sure how tall it is, but it is very tall. She says you can use it if it will work. \\nWhat was the subject line for this email?\\n\\nAssistant: \\\"Julie's Tall Ladder Available for You at 908 Bomar\\\"\"}" + }, + { + "comment": "Code snippet describes a user-assistant interaction, where the assistant answers questions about given processes or scenarios. It involves analyzing and interpreting the text to provide concise responses such as \"more\", \"less\" or \"no effect\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":19-20", + "content": "{\"text\": \"User: Process: - An owl leaves his nest - The owl flys out into the night - The owl looks down at the ground - The owls keen eyesight allows him to see in the dark - The owls sensitive hearing allows him to locate hidden prey - The owl swoops down - The owl catches a mouse in his talons. Question: suppose owls have better eyesight happens, how will it affect owls see more prey. How does the supposed perturbation influence the second effect mentioned. 
Answer by more, less or no effect\\nAnswer:\\n\\nAssistant: more effect\"}\n{\"text\": \"User: Process: - Solar radiation reaches Earth's atmosphere - Some is reflected back into space - The rest of the energy is absorbed by land and oceans, heating the Earth - Heat radiates from Earth to space - Some of this heat is trapped by greenhouse gases in the atmosphere - Human activities such as burning fuel are increasing the amount of greenhouses gases - This causes more heat radiation to stay trapped in our atmosphere - The Earth's temperat" + }, + { + "comment": "This code appears to be part of a question-answering system, where the user asks about the effects of cleaning the air and storing pollution on the greenhouse effect. The system provides three options: A) more, B) less, or C) no effect, followed by an assistant's detailed response based on the selected option. In this specific code chunk, the assistant is answering the question related to a product review conveying a negative or positive sentiment, providing context and analysis for the user.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":20-21", + "content": "ure rises - Question: suppose Start to clean the air and store pollution somewhere else. happens, how will it affect a GREATER greenhouse effect. - A: more - B: less - C: no effect\\n\\nAssistant: B: less\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Please answer the following question: Title: Buy the real one...written when she was alive, before 1947. Review: Read the real stories by the real Lucia here on Amazon, not the ones propagated by the Vatican straw man. Read links here: [...] Does this product review convey a negative or positive sentiment?\\nA:\\n\\nAssistant: This product review conveys a negative sentiment. 
The reviewer appears to be discouraging potential buyers from purchasing the product in question, suggesting that they should instead opt for the \\\"real stories by the real Lucia\\\" available on Amazon. The mention of the \\\"Vatican straw man\\\" implies that the reviewer believe" + }, + { + "comment": "The code snippet contains a text that appears to be part of a generated conversation between an AI assistant and a user. The user provides a title and a review, and the AI assistant is expected to analyze and provide a detailed response based on the content provided, ultimately determining if the review is positive or negative.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":21-23", + "content": "s the current product may contain distorted or inaccurate information. Furthermore, the inclusion of external links reinforces the reviewer's intent to redirect potential buyers to alternative sources.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Title: My favourite book on bidding Review: This is a fantastic book - although quite slim, there is lots of useful content. Using the principles it contains, I was one of the few that found the 24 point slam in the British Bridge League simultaneous pairs in January 1998. Is the review positive or negative?\\nAnswer:\\n\\nAssistant: The review is positive. The reviewer describes the book as \\\"fantastic\\\" and appreciates its useful content. They also mention how the book's principles helped them succeed in a British Bridge League event, which indicates their satisfaction and positive experience with the book.\"}\n{\"text\": \"System: You are an AI assistant" + }, + { + "comment": "Code snippet seems to describe a story of a girl sitting alone in the park, with no one stopping to help her. 
The user is asked to create a multiple-choice question based on this article with given choices and answer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":23-23", + "content": " that follows instruction extremely well. Help as much as you can.\\n\\nUser: Please answer the following question: Write a multi-choice question for the following article, with the given choices and answer: Article: Barefoot and dirty, the girl just sat and watched the people go by. She never tried to speak. Many people passed, but never did one person stop. The next day I decided to go back to the park, curious to see if the little girl would still be there. Right in the very spot as she was yesterday, she sat with the saddest look in her eyes. Today I was to make my own move and walk over to the little girl. As we all know, a park full of strange people is not a place for young children to play alone. As I began walking towards her, I could see the back of the little girl's dress indicated a deformity . I smiled to let her know it was okay, I was there to help, to talk. I sat down beside her and opened with a simple \\\"Hello\\\". I smiled and she shyly smiled back. We talked until d" + }, + { + "comment": "Story about a girl who appears as an angel to a man, explaining that he's the only one who believed in her and could see her. She disappears after completing her task.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":23-23", + "content": "arkness fell and the park was completely empty. I asked the girl why she was so sad. The little girl looked at me and with a sad face said, \\\"Because I'm different.\\\" I immediately said \\\"That you are!\\\" and smiled. The little girl acted even sadder. She said, \\\"I know.\\\" She looked at me and smiled. 
Slowly she stood to her feet, and said, \\\"Really?\\\" \\\"Yes, dear, you're like a little guardian angel sent to watch over all those people walking by.\\\" She nodded her head \\\"yes\\\" and smiled, and with that she spread her wings and said with a twinkle in her eye, \\\"I am. I'm your guardian angel.\\\" I was speechless, sure I was seeing things. She said, \\\"For once you thought of someone other than yourself, my job here is done.\\\" Immediately I stood to my feet and said, \\\"Wait, so why did no one stop to help an angel!?\\\" She looked at me and smiled, \\\"You're the only one who could see me, and you believe it in your heart.\\\" ...and she was gone. And with that my life changed dr" + }, + { + "comment": "Code snippet explains the context of a text-based option. The little girl is passed by without help and the possible reasons for that action are listed as options. It requires understanding the article's content to answer correctly.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":23-24", + "content": "amatically. So, when you think you're all you have, remember, your angel is always watching over you. Mine is! Options: A they pretended not to see her B she pretended to be poor C they never thought of helping her D they knew the author would help her Answer: C they never thought of helping her Question:\\nA:\\n\\nAssistant: In the article, why did people pass by the little girl without stopping to help her? \\nA: They pretended not to see her\\nB: She pretended to be poor\\nC: They never thought of helping her\\nD: They knew the author would help her\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: \\\"Broken Arrow\\\" is a song written by Canadian singer-songwriter Neil Young and recorded by Buffalo Springfield on their 1967 album \\\"Buffalo Springfield Again\\\". 
It was recorded in August and September 1967 at Columbia Recording Studios and Sunset Sound Recorders. It incorporates musical ideas from \\\"Down Down Down,\\\" a " + }, + { + "comment": "This code appears to be a text-based AI response, discussing the availability of \"Broken Arrow\" on the Buffalo Springfield box set. It mentions that the song is part of their 1967 album \"Buffalo Springfield Again,\" and that the demo for another song, \"Down Down Down,\" is available on the box set. However, it's unable to directly confirm if \"Broken Arrow\" is included in the box set due to insufficient information.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":24-25", + "content": "demo Young recorded with Buffalo Springfield (now available on the box set).\\nCan we draw the following conclusion?\\n\\\"Broken Arrow\\\" is on the Buffalo Springfield box set.\\n\\nAssistant: Based on the information provided, we cannot directly conclude that \\\"Broken Arrow\\\" is on the Buffalo Springfield box set. The details mention that \\\"Broken Arrow\\\" is a song written by Neil Young and recorded by Buffalo Springfield on their 1967 album \\\"Buffalo Springfield Again.\\\" Additionally, it is mentioned that \\\"Broken Arrow\\\" incorporates musical ideas from \\\"Down Down Down,\\\" a demo Young recorded with Buffalo Springfield, which is now available on the box set.\\nAlthough we know that the demo \\\"Down Down Down\\\" is included in the box set, we do not have sufficient information to confirm that \\\"Broken Arrow\\\" is also part of the same box set.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Given the below context: On Januar" + }, + { + "comment": "A group of friends gather for dinner at George's home, where he reveals a time machine model and claims that time is the fourth dimension. 
Despite its disappearance when one of them activates the lever, they remain unconvinced. George uses the real time machine to travel 17 years into the future and experiences significant events.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":25-25", + "content": "y 5, 1900, four friends arrive for a dinner at the London home of their inventor friend George, but he is not there. He arrives suddenly, bedraggled and exhausted, and begins describing the strange experiences he has had since they last met. At their earlier dinner on New Year's Eve, George said that time was \\\"the fourth dimension\\\". He shows David Filby, Dr. Philip Hillyer, Anthony Bridewell, and Walter Kemp a small model time machine and asks one of them to press a tiny lever on the model. The device disappears, validating his claim, but his friends remain unconvinced; their reactions vary from curiosity to frank dismissal. George bids his guests a good evening, then heads downstairs where his full-size time machine awaits. He presses a lever and moves forward through time 17 years into the future to September 15, 1917. He meets Filby's son, James, who tells him of Filby's death in a war. Saddened, he resumes his journey, stopping on June 19, 1940 during The Blitz, finding himself i" + }, + { + "comment": "This code is a description of a time-traveling adventure, where the protagonist named George goes through various events and ends up trapped in hardened lava before finally escaping to a lush green landscape.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":25-26", + "content": "n the midst of \\\"a new war\\\"; George resumes his journey and his house is hit by a bomb and is destroyed. George stops on August 19, 1966 finding his neighborhood now part of a futuristic metropolis. People hurry into a nearby fallout shelter amid the blare of air raid sirens. An elderly James Filby urges George to immediately take cover. 
Moments later, a nuclear explosion destroys London, causing a volcanic eruption around him. George narrowly makes it back to his machine and continues his journey forward as the lava rapidly rises, cools, and hardens, trapping him inside. He travels much farther into the future until the lava erodes away, revealing a lush, green, unspoiled landscape. Guess a valid title for it!\\nAnswer:\\n\\nAssistant: \\\"The Time Traveler's Odyssey\\\"\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Read this: In March 1861, Victoria's mother died, with Victoria at her side. Through" + }, + { + "comment": "Victoria discovers her mother's love, grieves deeply, and blames Conroy & Lehzen for estrangement. Albert supports Victoria during grief and later confronts son over rumors of infidelity. After diagnosis, Albert passes away on 14 December 1861, leaving Victoria devastated, believing his death was due to son's scandal.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":26-26", + "content": " reading her mother's papers, Victoria discovered that her mother had loved her deeply; she was heart-broken, and blamed Conroy and Lehzen for \\\"wickedly\\\" estranging her from her mother. To relieve his wife during her intense and deep grief, Albert took on most of her duties, despite being ill himself with chronic stomach trouble. In August, Victoria and Albert visited their son, the Prince of Wales, who was attending army manoeuvres near Dublin, and spent a few days holidaying in Killarney. In November, Albert was made aware of gossip that his son had slept with an actress in Ireland. Appalled, Albert travelled to Cambridge, where his son was studying, to confront him. By the beginning of December, Albert was very unwell. He was diagnosed with typhoid fever by William Jenner, and died on 14 December 1861. Victoria was devastated. 
She blamed her husband's death on worry over the Prince of Wales's philandering. He had been \\\"killed by that dreadful business\\\", she said. She entered a s" + }, + { + "comment": "This code snippet contains two separate questions and answers. The first question asks about Victoria's mother's death year, and the answer states that she died in 1861. The second question asks who wrote a regular column for Guideposts magazine, and the answer is Bill Marvel.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":26-28", + "content": "tate of mourning and wore black for the remainder of her life. She avoided public appearances, and rarely set foot in London in the following years. Her seclusion earned her the nickname \\\"widow of Windsor\\\".\\nDuring which year did Victoria's mother die?\\nWhat is the answer? (If it cannot be answered, return \\\"unanswerable\\\")\\n\\nAssistant: Victoria's mother died in the year 1861.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Question: Here's a complex question that requires someone to reason about the input, can you answer it? What american sports wrote a regular column Spotlight on sports and was featured in the non profit faith based magazine Guideposts?\\nAnswer:\\n\\nAssistant: Answer: American sportswriter and commentator, Bill Marvel, wrote a regular column titled \\\"Spotlight on Sports\\\" and was featured in the non-profit, faith-based magazine Guideposts.\"}\n{\"text\": \"System: You are a helpful assistant, who alway" + }, + { + "comment": "The code chunk provides information about the first regular broadcast of high-definition television on January 1, 2004, which was the traditional Vienna New Year's Concert. 
This marked the official launch of the HD1 channel and the start of direct-to-home HDTV in Europe.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":28-29", + "content": "s provide explanation. Think like you are answering to a five year old.\\n\\nUser: High-definition television\\nThe first regular broadcasts started on January 1, 2004 when the Belgian company Euro1080 launched the HD1 channel with the traditional Vienna New Year's Concert. Test transmissions had been active since the IBC exhibition in September 2003, but the New Year's Day broadcast marked the official launch of the HD1 channel, and the official start of direct-to-home HDTV in Europe.\\nQ: What event was first broadcasted on January 1, 2004?\\n\\nAssistant: The first event that was broadcasted on January 1, 2004, was the traditional Vienna New Year's Concert. This was the official start of a new TV channel called HD1, which showed very clear and nice looking pictures called high-definition television in Europe.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Extract the answer to the following ques" + }, + { + "comment": "Harold bets and wins on a horse race in the movie \"The Sin of Harold Diddlebock\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":29-29", + "content": "tion from the movie plot. If the question isn't answerable, please output \\\"Can't answer\\\". Question: What does Harold bet and win on? Title: The Sin of Harold Diddlebock Movie plot: In 1923, college freshman Harold Diddlebock (Harold Lloyd) is brought into his college's football team where he scores the winning touchdown for his team (as told in 'The Freshman'). The mild-mannered Harold Diddlebock is promptly hired by the pompous advertising tycoon J.E. Waggleberry (Raymond Walburn) to work at his firm. 
Harold is given a low-paying job at the firm's bookeeper.Jumping forward 22 years to 1945, Harold (still working in his boring job as bookkeeper) is promptly let go by Waggleberry with nothing but a tiny pension of around $1,000. Harold bids farewell to the girl at the desk down the aisle, Miss Otis (Frances Ramsden), whom he had hoped to marry just as he had hoped to marry five of her older sisters before whom also worked at the firm.As Harold wanders aimlessly through the streets, hi" + }, + { + "comment": "This code describes a scene from a movie where the protagonist, Harold, loses all his inhibitions after drinking \"The Diddlebock\" cocktail and goes on a wild spending spree. He ends up at his sister's house with a hangover, a new wardrobe, a horse-drawn cab, and ownership of a bankrupt circus.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":29-29", + "content": "s life's savings in hand, he has a run-in with an older man; a slimy racetrack tout named Wormy (Jimmy Conlin) and finds himself in a bar. When he tells the bartender (Edgar Kennedy) that he's never had a drink in his life, the barkeep creates a potent cocktail he calls \\\"The Diddlebock\\\", one sip of which is enough to release Harold from all his inhibitions, setting him off on a day-and-a-half binge of spending and carousing. Harold then goes to a tailor and barber shop and spends most of his money and bets the rest on a horserace and wins.Some days later, Harold wakes up on the couch in the house of his widowed older sister Flora (Margaret Hamilton). 
He finds that he has a hangover, but he also has a garish new wardrobe, a ten-gallon hat, a hansom horse-drawn cab complete with a driver, and ownership of a bankrupt circus.Trying to sell the circus, Harold and Wormy visit circus-loving Wall Street banker Lynn Sargent (Rudy Vallee), and then, when he turns them down, so do the rest of t" + }, + { + "comment": "The code describes a character named Harold and Jackie the Lion who plan to rob a town's bank. They bring along a potent drink called Diddlebock to get past the bank guards, and Harold bets his money on a horserace. The AI then provides detailed information about archbishops in Christianity, their rank, and responsibilities in the church hierarchy.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":29-30", + "content": "he town's bankers. To get past the bank guards, Harold brings along Jackie the Lion, who incites panic. Wormy brings along a thermas containing the potent 'Diddlebock' drink where they give shots to each of the bankers to get...\\n\\nAssistant: Harold bets the rest of his money on a horserace and wins.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: In Christianity, an archbishop (, via Latin \\\"archiepiscopus\\\", from Greek , from -, \\\"chief\\\", and , \\\"bishop\\\") is a bishop of higher rank or office. In some cases, like the Lutheran Church of Sweden, it is the denomination leader title. Like popes, patriarchs, metropolitans, cardinal bishops, diocesan bishops, and suffragan bishops, archbishops are in the highest of the three traditional orders of bishops, priests, also called presbyters, and deacons. 
An archbishop may be granted the title, or ordained as chief pastor of a metropolitan see or another episcopal see" + }, + { + "comment": "This code describes the roles and responsibilities of a metropolitan archbishop in the Roman Catholic Church, as well as the distinction between metropolitan sees and archdioceses with archiepiscopal rank.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":30-30", + "content": " to which the title of archbishop is attached. \\nEpiscopal sees are generally arranged in groups in which the bishop who is the ordinary of one of them has certain powers and duties of oversight over the other sees. He is known as the metropolitan archbishop of that see. In the Roman Catholic Church, canon 436 of the Code of Canon Law indicates what these powers and duties are for a Latin Rite metropolitan archbishop, while those of the head of an autonomous (\\\"sui iuris\\\") Eastern Catholic Church are indicated in canon 157 of the Code of Canons of the Eastern Churches. \\nAs well as the much more numerous metropolitan sees, there are 77 Roman Catholic sees that have archiepiscopal rank. In some cases, such a see is the only one in a country, such as Luxembourg or Monaco, too small to be divided into several dioceses so as to form an ecclesiastical province. In others, the title of archdiocese is for historical reasons attributed to a see that was once of greater importance.\\n1. What is" + }, + { + "comment": "Code describes a series of questions related to religious hierarchy, jurisdiction, and countries with only one see, followed by an Assistant's numbered list of answers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":30-30", + "content": " the highest of the bishops?\\n2. What religion has a see?\\n3. How many of those have an archiepiscopical rank?\\n4. Where is one country where there is only one see?\\n5. What is another?\\n6. Why are there only one there?\\n7. 
Who is normally in charge of a see?\\nProvide a numbered list of answers.\\n\\nAssistant: 1. The highest of the bishops are archbishops, who hold a higher rank or office compared to other bishops in the hierarchy.\\n2. Christianity, and more specifically, the Catholic Church has a \\\"see\\\" or a diocese, which is the jurisdiction of a bishop.\\n3. There are 77 Roman Catholic sees that have archiepiscopal rank.\\n4. Luxembourg is one country where there is only one see.\\n5. Another country with only one see is Monaco.\\n6. There is only one see in these countries because they are too small to be divided into several dioceses so as to form an ecclesiastical province.\\n7. A bishop is normally in charge of a see as the chief pastor with oversight powers and duties over other see" + }, + { + "comment": "The code appears to be a JSON data containing text snippets, possibly for training an AI language model. The first snippet discusses ecclesiastical jurisdiction while the second one discusses the open-world nature of \"Metal Gear Solid V: The Phantom Pain\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":30-31", + "content": "s within their jurisdiction. In cases where an archbishop is appointed, the archbishop is in charge of the see.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Metal Gear Solid V: The Phantom Pain is an action-adventure stealth game developed by Kojima Productions and published by Konami. It was released worldwide for Microsoft Windows, PlayStation 3, PlayStation 4, Xbox 360 and Xbox One on September 1, 2015. 
It is the ninth installment in the series that was directed, written and designed by Hideo Kojima following Metal Gear Solid V: Ground Zeroes, a stand-alone prologue released the previous year, as well as his final work at Konami.\\nIs it true that is metal gear solid 5 phantom pain open world?\\n\\nAssistant: Yes, it is true that Metal Gear Solid V: The Phantom Pain is an open-world game. The game features a large, open-" + }, + { + "comment": "The code snippet appears to describe Eddy Merckx, a Belgian former professional cyclist known for his numerous victories in various competitions. The text highlights his accomplishments, including 11 Grand Tours, all five Monuments, and other major one-day races, making him the most successful rider in competitive cycling history.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":31-32", + "content": "world environment that players can explore freely, giving them more freedom in choosing how to approach missions and navigate the game world. This open-world design allows for a more immersive and dynamic gameplay experience compared to previous entries in the Metal Gear Solid series, which were generally more linear in structure.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Edouard Louis Joseph, baron Merckx (Dutch pronunciation: ['merks]; born 17 June 1945), better known as Eddy Merckx, is a Belgian former professional road and track bicycle racer who is widely seen as the most successful rider in the history of competitive cycling. 
His victories include an unequalled eleven Grand Tours (five Tours of France, five Tours of Italy, and a Tour of Spain), all of the five Monuments, three World Championships, the hour record, every major one-day race other than Paris-Tours, and extensive victories on the track" + }, + { + "comment": "This code snippet describes the career progression of Eddy Merckx, a renowned cyclist. It discusses his early success in races like Giro di Sardegna and Paris-Nice, as well as his strategic choices during Milan-San Remo, ultimately leaving him in contention with fellow rider Gianni Motta.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":32-32", + "content": ". Born in Meensel-Kiezegem, Brabant, Belgium, he grew up in Woluwe-Saint-Pierre where his parents ran a grocery store. He played several sports, but found his true passion in cycling.\\nMerckx opened the 1967 campaign with two stage victories at the Giro di Sardegna. He followed these successes by entering Paris-Nice where he won the second stage and took the race lead. Two stages later, a teammate, Tom Simpson, attacked with several other riders on a climb and was nearly 20 minutes ahead of Merckx, who remained in a group behind. Merckx attacked two days later on a climb 70 km into the stage. He was able to establish a firm advantage, but obeyed orders from his manager to wait for the chasing Simpson. Merckx won the stage, while Simpson secured his overall victory. On 18 March, Merckx started the Milan-San Remo and was seen as a 120-1 favorite to win the race. He attacked on the Capo Berta and again on the Poggio, leaving only Gianni Motta with him. The two slowed their pace and were " + }, + { + "comment": "Rider Merckx wins a four-man sprint, later victorious at La Fleche Wallonne. Joins Faema team for control and financial benefits. 
Competes in 1967 UCI Road World Championships, out-sprints Jan Jansse after Motta's attack.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":32-32", + "content": "joined by two more riders. Merckx won the four-man sprint to the finish. His next victory came in La Fleche Wallonne after he missed out on an early break, caught up to it, and attacked from it to win the race. On 20 May, he started the Giro d'Italia, his first Grand Tour. He won the twelfth and fourteenth stages en route to finishing ninth in the general classification. He signed with Faema on 2 September for ten years worth 400,000 Belgian francs. He chose to switch over in order to be in complete control over the team he was racing for. In addition, he would not have to pay for various expenses that came with racing such as wheels and tires. The next day, Merckx started the men's road race at the 1967 UCI Road World Championships in Heerlen, Netherlands. The course consisted of ten laps of a circuit. Motta attacked on the first lap and was joined by Merckx and five other riders. The group thinned to five as they reached the finish line where Merckx was able to out-sprint Jan Jansse" + }, + { + "comment": "This code appears to be related to a passage about Eddy Merckx, a famous cyclist. It discusses his victories in the world road race amateur and professional titles, as well as his right to wear the rainbow jersey. The text also mentions additional victories he had throughout his career.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":32-33", + "content": "n for first place. In doing so, he became the third rider to win the world road race amateur and professional titles. By winning the race he earned the right to wear the rainbow jersey as world champion. 
Answer this question using a quote from the following article:\\nDid he have any other victories?\\n\\nAssistant: Eddy Merckx had an impressive number of victories throughout his career. As mentioned in the original text, his wins include eleven Grand Tours, five Monuments, three World Championships, the hour record, and almost every major one-day race with the exception of Paris-Tours. Furthermore, he achieved numerous victories on the track. His incredible success has earned him the reputation of being the most successful cyclist in the history of competitive cycling, which clearly demonstrates the extent of his additional victories beyond the ones specifically discussed in the text.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you " + }, + { + "comment": "The code snippet is asking if the frequency of the word 'a' in two given sentences is equal. The first sentence talks about a group of pastries and the second one describes a boy with clock arms on his face. The answer, \"No,\" indicates that the word 'a' does not occur an equal number of times in both sentences.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":33-34", + "content": "can.\\n\\nUser: Detailed Instructions: In this task, answer 'Yes' if the frequency of the given word in the two sentences is equal, else answer 'No'.\\nQ: Sentence1: 'group of pastries sitting on top of a white plate on table', Sentence2: 'a picture of a boy with two clock arms on his face'. Is the frequency of the word 'a' in two sentences equal?\\nA:\\n\\nAssistant: No\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Here the omnibus came up, and I rode back to Manchester. 
The whole conversation took up very little more time than it will take to read it; but I thought it worth recording, as characteristic of the people now suffering in Lancashire from no fault of their own. I know the people well. The greatest number of them would starve themselves to that degree that they would not be of much more physical use in this world," + }, + { + "comment": "This code appears to be a chunk of text from a conversation or narrative, potentially related to a question and answer format discussing starvation in a particular town. The correct answer for the given question is not \"Cardiff\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":34-34", + "content": " before they would condescend to beg. But starving to death is hard work. What will winter bring to them when severe weather begins to tell upon constitutions lowered in tone by a starvation diet--a diet so different to what they have been used to when in work? What will the 1s. 6d. a-head weekly do for them in that hard time? If something more than this is not done for them, when more food, clothing, and fire are necessary to everybody, calamities may arise which will cost England a hundred times more than a sufficient relief--a relief worthy of those who are suffering, and of the nation they belong to--would have cost. In the meantime the cold wings of winter already begin to overshadow the land; and every day lost involves the lives, or the future usefulness, of thousands of our best population. \\nQuestion: \\\"In what town would the majority starve before condescending to beg?\\\"\\nAnswer: \\\"Cardiff\\\"\\nIs this answer to the question correct?\\nChoose your answer from: (a). no (b). yes\\n\\nAssistant: a). no\"}" + }, + { + "comment": "Assistant is describing the task and explaining the answer, outputting \"--negative\" as the correct answer since the critic felt uncomfortable or disgusted by the movie. 
Other options are incorrect because they don't represent the critic's opinion expressed in the review.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":35-36", + "content": "{\"text\": \"System: You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. Think like you are answering to a five year old.\\n\\nUser: Choose your answer from:\\n --negative;\\n --positive;\\nShort movie review: skeeved \\nDid the critic thinking positively or negatively of the movie?\\n\\nAssistant: --negative\\nThe critic was thinking negatively of the movie. When someone says they are \\\"skeeved\\\" by a movie, it means they felt uncomfortable or disgusted by it. So, they didn't like it. Positive would mean they liked it or thought it was good, but \\\"skeeved\\\" shows they did not enjoy it at all.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Does the following review have a positive or negative opinion of the movie" + }, + { + "comment": "The code is analyzing a movie review and determining the sentiment. It identifies key phrases, assesses their sentiment, and determines the overall opinion. 
In this case, it finds the review to be negative as the movie is described as forgettable.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":36-37", + "content": "?\\nhave completely forgotten the movie by the time you get back to your car in the parking lot\\n\\nAssistant: To determine if the review has a positive or negative opinion of the movie, I will analyze the content and assess the overall sentiment.\\nStep 1: Identify key phrases\\nThe key phrase from this review is \\\"have completely forgotten the movie by the time you get back to your car in the parking lot.\\\"\\nStep 2: Assess the sentiment of the key phrase\\nThe phrase implies that the movie is forgettable and not memorable.\\nStep 3: Determine the overall opinion\\nSince the movie was described as forgettable, the overall opinion of the review is negative.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: 1. Carney is on to something in \\\"Begin Again,\\\" some handle on a particular truth he's exploring, and he remains faithful to it through the finish.\\n2. \\\"Begin Again\\\" is pleasantly predictable if you'r" + }, + { + "comment": "This code appears to be a review or commentary on a movie, possibly \"Begin Again\". It discusses the film's structure, performances, themes, and emotional impact. The author appreciates the director's choices but notes that it can be \"undemanding\" or \"mushy.\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":37-37", + "content": "e in an undemanding mood. If you're not, it's unbearable, like hearing a treasured folk song given a Hot 97 makeover.\\n3. Knightley brings emotional confusion into sharp, touching focus as the pair bond over nighttime strolls and the shared love of classic pop. Carney flips film clich\u00e9s to keep us guessing.\\n4. 
Carney deserves great credit for the movie's clever, layered structure, and for resisting a few obvious plot turns along the way. Lightning doesn't strike, but sunshine works, too.\\n5. Carney is wonderful at evoking the joy of music, its healing powers, and the way it connects us to each other and the universe.\\n6. Can a song save your life? And can an impossibly mushy, mawkish movie make you feel good? The answer to the second question, anyway, is yes.\\n7. \\\"Begin Again\\\" may not always swing, but it makes up for that in sincerity and a welcome willingness to ambush expectations.\\n8. A movie that mocks artistic sellouts and then sells out itself is like a cop who chases a crimi" + }, + { + "comment": "Code snippet refers to a movie review and highlights the joy and healing power of music, as well as the touching story and heartfelt performances. Despite some predictability and occasional lapses in its indie-rock spirit.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":37-38", + "content": "nal before accepting a bribe to let him go.\\n9. Another nice musical spin by Carney on the conventions of quasi-romantic dramedy.\\n10. \\\"Begin Again\\\" is an ode to the indie-rock spirit, but its music -- written by Carney and others -- is as safe, smooth and Starbucks-friendly as Norah Jones.\\nWrite a one sentence summary of the reviews above.\\n\\nAssistant: \\\"Begin Again\\\" is a movie that explores the joy and healing power of music, with a touching story and heartfelt performances, despite some predictability and occasional lapses in its indie-rock spirit.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Answer the following question: Question: Who did Franklin involves, without his permission? If there is no answer, please output \\\"Insufficient information to provide an answer.\\\". 
Movie title: Ben and Me Context: The film begins with a tour group in the present day, but quickly moves to the 16" + }, + { + "comment": "The code describes a story of a mouse family forced to move to America due to an excess of cats, with Amos living in a Philadelphia church and helping Benjamin Franklin before their eventual reconciliation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":38-38", + "content": "th century. Quickly tracing a mouse family's history in England, we see how the family was forced to go to America due to an excess of cats.In America, the film then focuses on Amos, a 1740s mouse living in a Philadelphia church. His attempts to land a job lead him to Benjamin Franklin's printing shop. He helps Franklin erase his debts, create a successful newspaper, invent new devices, and even helps make him more popular and respected with the people of the city.After Franklin involves Amos, without his permission, in the famous kite electricity experiment that nearly kills him, Amos is outraged and leaves him. Franklin's attempts to work alone prove disastrous such as a failed diplomatic mission to the King that pushes the colonies to open armed revolt.They finally reconcile in 1776 when Franklin begs Amos to return and agrees to sign a contract drafted by the mouse for more considerate treatment. Furthermore, as Franklin examines the agreement, Thomas Jefferson arrives to ask for h" + }, + { + "comment": "The code snippet appears to be part of a movie script, possibly involving historical characters like Franklin and Amos. It discusses the beginning of the Declaration of Independence and mentions an experiment where Amos nearly dies. 
The critic's review of the movie highlights Scotland's beauty and funny fanatics, suggesting a positive opinion about the film.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":38-39", + "content": "elp for the beginning of the Declaration of Independence he is writing. Amos insists that Franklin read his own agreement out loud at that moment and its wording inspires Jefferson to create an appropriate opening to his own piece.The movie ends with a return to the tour group in the present.\\nAnswer:\\n\\nAssistant: Franklin involved Amos, the mouse, without his permission in the famous kite electricity experiment that nearly kills him. This incident led to Amos being outraged and leaving Franklin.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Short movie review: scotland looks wonderful , the fans are often funny fanatics \\nDid the critic thinking positively or negatively of the movie?\\n\\nAssistant: Based on the short movie review provided, it appears that the critic is thinking positively about the movie. The reviewer mentions two aspects they seem to appreciate: the beautiful portrayal of Scotland and the " + }, + { + "comment": "This code snippet appears to contain a film review, praising its beautiful visuals and portrayal of Scotland while highlighting the entertaining element provided by the \"funny fanatics\" featured in the movie.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":39-39", + "content": "amusing fanatic fans featured in the film.\\nThe description of Scotland as \\\"wonderful\\\" suggests that the visuals and cinematography effectively captured the beauty and essence of the country, thereby contributing to the overall positive experience for the viewer. This portrayal could have made the movie more enjoyable and appealing for anyone who appreciates stunning landscapes and picturesque scenery. 
It also implies that the filmmakers successfully managed to showcase the charms and allure of Scotland, turning it into a fascinating backdrop for the story.\\nMeanwhile, the mention of \\\"funny fanatics\\\" indicates that the movie succeeds in delivering an element of humor through its portrayal of the fans. The critic highlights their amusement with the actions and dedication of these ardent supporters, which adds an entertaining and perhaps even relatable aspect to the film. By appreciating the humor and the portrayal of passionate fans, the reviewer is acknowledging that the movie prov" + }, + { + "comment": "In this code snippet, it describes a conversation between a user and an AI model. The user is given a quiz question about amphibians and asks the teacher for hints. Based on the hints provided by the teacher, the best answer to the question is chemicals.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":39-41", + "content": "ides a lighthearted and engaging experience for the audience.\\nIn conclusion, the critic appears to be thinking positively about the movie, as they have pointed out two strong aspects of the film - the captivating visuals of Scotland and the amusing portrayal of fans. The way these elements are highlighted in the review demonstrates the critic's appreciation for the movie's ability to entertain and deliver an engaging cinematic experience.\"}\n{\"text\": \"User: Q: You are presented with the quiz \\\"What do amphibians easily absorb?\\\" But you don't know the answer, so you turn to your teacher to ask for hints. He says that \\\"amphibians have permeable skin that easily absorbs substances from the environment\\\" and \\\"chemicals can be very toxic to permeable, amphibian skin\\\". So, what's the best answer to the question?\\nAvailable choices: [i] heat energy. [ii] water. [iii] Energy. [iv] Bodily water. [v] chemicals. [vi] body water. [vii] smog. 
[viii] oxygen.\\nA:\\n\\nAssistant: [v] chemicals.\"}\n" + }, + { + "comment": "This code sets up the foundation for a movie plot, introducing the main question of who does the dagger belong to, involving a crime family named Zeko and a journalist seeking answers about her missing mother.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":41-41", + "content": "{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Build a movie plot around this: Who does the dagger belong to? Zeko\\nThe answer to this question is:\\n\\nAssistant: Title: The Bloodline of Zeko\\nGenre: Action, Mystery, Adventure\\nPlot:\\nIn the bustling city of Veloria, a new generation of a once-powerful crime family is seeking to rise to power. As members of the criminal underworld scramble to establish their dominance, a simple yet mysterious question echoes through the ranks: Who does the dagger belong to?\\nThe film begins with our protagonist, Luna, an ambitious and clever journalist who inherited the same resolve from her mother - a prominent crime reporter that vanished without a trace several years ago. After receiving an anonymous tip about the Zeko criminal empire, she decides to pursue the story in search of her mother's whereabouts and delve into the dark world of organized crime.\\nAs " + }, + { + "comment": "Luna investigates the Zeko family, uncovers a legendary dagger, and becomes entangled in their power struggle as it connects to her mother's disappearance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":41-41", + "content": "Luna delves deeper into her investigation and gradually gains the trust of the Zeko family members, she unexpectedly uncovers the legendary \\\"Blood Dagger of Zeko\\\" - a priceless artifact that once belonged to the family's powerful and ruthless patriarch, Zeko the Elder. 
Left in his will, the dagger was known to grant immense power and authority to whoever it ultimately belonged to, becoming the key to the family's fortune and the city's criminal empire.\\nHowever, an internal power struggle within the family begins to escalate when the youngest and most ambitious sibling, Adrian Zeko, sets his eyes on the dagger. As the other siblings, Rina and Niko, frantically search for clues to secure their own claim, a race against time ensues to determine the heir to the Zeko empire.\\nLuna becomes entwined in this dangerous game as she uncovers links between the dagger, the family, and her mother's disappearance. As the story unfolds, it is revealed that Luna's mother had discovered the truth abo" + }, + { + "comment": "Luna, after her mother's disappearance, confronts Adrian to expose the Zeko family's secrets and bring down their criminal empire. The dagger symbolizes truth, justice, and morality's triumph over a corrupted past. Luna honors her mother's legacy by gifting the dagger to a museum for citizens of Veloria.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":41-41", + "content": "ut the dagger and had intended to expose the family's secrets and transgressions before her sudden disappearance. Unable to overlook the pervasive corruption any longer, Luna decides to take matters into her own hands to honor her mother's legacy and to bring down the Zeko empire.\\nIn a thrilling conclusion, Luna confronts Adrian, who is on the verge of obtaining the dagger. Amidst the chaos, Luna reveals her findings, causing a shift in power dynamics. 
The siblings ultimately unite to dismantle the family's criminal activities and restore justice to Veloria.\\nThe film ends with Luna, finally honoring her mother's memory as an esteemed crime reporter, taking ownership of the dagger and gifting it to a museum for the citizens of Veloria to witness the tale of the once-powerful Zeko family dynasty.\\nThe answer to the question \\\"Who does the dagger belong to?\\\" is ultimately revealed to signify the triumph of truth, justice, and morality over a dark and corrupted past. Luna's brave pursui" + }, + { + "comment": "This code appears to be a JSON data file containing text descriptions of various scenarios or articles. The first entry is about a legacy, the second describes a park and a family, and the third seems to be incomplete.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":41-43", + "content": "t for her mother's legacy results in the downfall of the Zeko family, and the dagger finally belongs to the people of Veloria in a symbolic gesture of hope and redemption.\"}\n{\"text\": \"User: Question: Article: There is a new park near Andy's home. It's fine today. Andy and his family are in the park now. On their left, there is a cafe. On their right, there is a big lake. There are many flowers and trees around the lake. There's a small hill behind the lake. Near the lake, there are two signs. One says, \\\"Don't swim in the lake!\\\" The other says, \\\"No fishing!\\\" But you can go boating on the lake. There is a beautiful garden in the middle of the park. There are green grass and beautiful flowers in it. There are some small shops between the lake and the garden. The park is so nice. Andy and his family like it very much. Question: Is there a cafe in the park? Yes or no, is the answer \\\"Yes, there is.\\\"?\\nAnswer:\\n\\nAssistant: Yes, there is.\"}\n{\"text\": \"System: You are an AI assistant. 
Provi" + }, + { + "comment": "This code seems to be generating responses based on a given context and question. It is utilizing natural language processing (NLP) techniques for understanding the context and then selecting the most appropriate response among several options (A, B, or C).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":43-44", + "content": "de a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Context: Lee went and talked to a therapist and decided it was worth his time to keep going. Question: How would Lee feel afterwards? Which one of these answers best answers the question according to the context? A: desperate afterwards B: lonely afterwards C: hopeful afterwards\\nA:\\n\\nAssistant: C: hopeful afterwards\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: What was the name of the album released in 2002 by the band that played at Lollapalooza in 1994? Answer the above question based on the context below: In 1993, the Breeders released their second album, Last Splash. At this time, the group's lineup consisted of sisters Kim and Kelley Deal on guitar and vocals, Josephine Wiggs on bass and vocals, and Jim Macpherson " + }, + { + "comment": "The code describes the history and formation of the band 'the Breeders,' including various lineup changes, their successful album 'Last Splash,' and the group's decision to record new music after their LSXX Tour in 2013-2014.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":44-44", + "content": "on drums. Last Splash was successful in various countries worldwide, and the group toured extensively and played at Lollapalooza 1994. 
In November 1994, Kelley Deal was arrested on drug-related charges, and in 1995, Wiggs left to pursue other musical projects. Macpherson continued playing with Kim Deal in her side-project group, the Amps, and then in the 1996 incarnation of the Breeders, but quit the band in 1997. The Breeders' lineups for their albums Title TK (2002) and Mountain Battles (2008) included the Deal sisters, Mando Lopez, and Jose Medeles. In 2013, Wiggs and Macpherson rejoined the Deals to tour the 20th anniversary of Last Splash\u2014the LSXX Tour.On December 31, 2013, the Breeders performed their final concert on the 60-date tour in Austin, Texas. The group enjoyed the LSXX concerts, and decided they would like to record new music together. Throughout 2014, Wiggs traveled from her home in Brooklyn, New York to Dayton, Ohio, near where Macpherson and both Deals lived. The gro" + }, + { + "comment": "The Breeders, a band that played at Lollapalooza in 1994, released an album titled \"Title TK\" in 2002 after practicing new material and going on tour leading up to a Hollywood Bowl concert.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":44-44", + "content": "up began practicing new material in Kim Deal's basement, including compositions by her and one by Wiggs. By August, there were three new songs they could play well, two less so, and others they had not yet practiced. Reported titles were \\\"Skinhead #2\\\", \\\"Simone\\\", \\\"All Nerve\\\", and \\\"Launched\\\". The band Neutral Milk Hotel asked the Breeders to open for them at a Hollywood Bowl concert to be held on September 18. The latter decided to go on tour leading up to this show and to perform some new compositions in preparation for their eventual recording.\\nThe answer to this question is:\\n\\nAssistant: The album released in 2002 by the Breeders, the band that played at Lollapalooza in 1994, was \\\"Title TK\\\". 
\\nThis information was identified by carefully reading the provided context and extracting key pieces of information. The context mentions that the Breeders played at Lollapalooza in 1994. It also separately points out that one of their albums, \\\"Title TK\\\", was released in 2002. Thus, aligning these facts together answers the question accurately.\"}" + }, + { + "comment": "The code snippet is a part of an instructional conversation where the user is asked to choose an option related to headlight removal. The chosen option will be continued in the conversation for further instructions on how to proceed with removing Beetle headlights.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":45-45", + "content": "{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Select from the following. -- Wear gloves and safety goggles to protect your eyes and mouth while attempting to remove the headlight unit. Be careful and gentle to avoid damaging the surrounding oil can.; -- Scrape any dirt off the entire bumper, including plastic parts and upholstery. Pull down the dashboard as much as possible to expose any exposed headlight casing.; -- Again, there is no need to completely remove the dome light, as the headlight unit doesn't pop away when dropped. Instead, you have to completely break the bumper and remove the lens.; -- Separate the wiring connector from the beam unit, and disconnect the wires from the light bulb socket. Hold the lamp unit with your thumb to control the spring, and remove the lens-retaining spring.;Choose from options above and answer: What most naturally follows?\\nHow to remove beetle headlights \\" + }, + { + "comment": "Removing beetle headlights involves disconnecting the battery, identifying positive (red) and negative (black) cables, and using a screwdriver to remove the rim around the headlight. 
Ensure safety by having an adult assist with disconnecting the black cable and working in a safe environment.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":45-46", + "content": "nDisconnect the negative battery cable to eliminate the risk of electrical shock. \\nSometimes battery cable colors vary, but normally you will find that the red battery cable is positive (+) and the black battery cable will be negative (-). \\nUse a screwdriver to loosen the screw that retains the rim, and then pull out the entire headlight unit.\\nAnswer:\\n\\nAssistant: Alright, little buddy, let me explain it to you: To take off the beetle headlights, first, you need to be very careful and not touch the wires because they can give you a little shock. The beetle has two wires, one is red, and the other one is black. You should ask a grown-up to help you take off the black wire. This will make the car safe to work on!\\nNext, you'll need a tool called a screwdriver to remove a small round thing called the rim around the headlight. Once that's done, you can carefully pull out the whole headlight like a puzzle piece. And that's how you remove the beetle headlights!\"}\n{\"text\": \"System: You are" + }, + { + "comment": "The code explains how to determine the sentiment of a given tweet by analyzing its components, identifying the subject and context, and evaluating the user's opinion. The overall sentiment is determined based on the positivity or negativity of the user's opinion. In this case, the sentiment of the tweet about Miley Cyrus is positive as the user expressed their enthusiasm with words like \"awesome\" and enthusiastic exclamation marks.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":46-46", + "content": " an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n\\nUser: What is the sentiment of the following tweet?\\nTweet:i see Miley on the red carpet [online] she is awesome!!!\\n\\nAssistant: To analyze the sentiment of the tweet, let's break it down step-by-step:\\nStep 1: Identify the main components of the tweet\\n- Subject: Miley (as in Miley Cyrus, a celebrity)\\n- Context: on the red carpet\\n- User's opinion: \\\"she is awesome!!!\\\"\\nStep 2: Determine the sentiment associated with each component\\n- For the subject and context, there's no inherent sentiment, as they're neutral pieces of information.\\n- The user's opinion, however, is very positive, indicated by the words \\\"awesome!!!\\\" and the enthusiastic exclamation marks.\\nStep 3: Combine the sentiment values and reach an overall sentiment\\nSince the user's opinion is the main sentiment-carrying element of the twee" + }, + { + "comment": "This code appears to be a JSON representation of a restaurant or establishment, with properties such as name, type of eatery, price range, customer rating, and nearby location.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":46-47", + "content": "t, and it is quite positive, we can determine that the overall sentiment of the tweet is positive.\\nConclusion: The sentiment of the tweet about Miley Cyrus on the red carpet is positive, as the user expresses enthusiasm and admiration for her.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Sentence: name = The Vaults, eatType = pub, priceRange = moderate, customer rating = 1 out of 5, near = Caf\u00e9 Adriatic\\nCan you represent the content in this sentence in data form?\\n\\nAssistant: Certainly, the content in the given sentence can be represented in a structured data form, such as a dictionary or JSON format. 
Here's a representation of the content in JSON format:\\n```JSON\\n{\\n \\\"name\\\": \\\"The Vaults\\\",\\n \\\"eatType\\\": \\\"pub\\\",\\n \\\"priceRange\\\": \\\"moderate\\\",\\n \\\"customerRating\\\": {\\n \\\"rating\\\": 1,\\n \\\"maxRating\\\": 5\\n },\\n \\\"near\\\": \\\"Caf\u00e9 Adriatic\\\"\\n}\\n```\\nThis JSON object represents the data as k" + }, + { + "comment": "Code analyzes the given establishment information, organizes it in a structured manner with key-value pairs. It includes details such as eatType, priceRange, customer rating (represented as nested object), and nearby location.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":47-48", + "content": "ey-value pairs, where information about the given establishment, The Vaults, is organized in a structured manner. The object includes details on the eatType (pub), priceRange (moderate), customer rating (1 out of 5), and the nearby location (Caf\u00e9 Adriatic). The customer rating is represented as a nested object, with a rating and maxRating for more clarity. This structured data format makes it easier to process, analyze, and manipulate the information.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Do these questions have the same meaning?\\nWhat should be my research rationale to do my research abroad? I'm doing my project on cloud computing. What should I write and how should I write it?\\nI want to do a project in cloud computing. What is an easy and feasible topic for my project?\\nSelect from: 1. no; 2. 
yes;\\n\\nAssistant:" + }, + { + "comment": "These lines of code contain a question about writing a research rationale for conducting research abroad related to a cloud computing project and another question seeking an easy and feasible topic for a cloud computing project without any reference to researching abroad or writing a rationale.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":48-49", + "content": " 1. no\\nThe two questions have different meanings. The first question is about writing a research rationale for conducting research abroad related to a cloud computing project. It asks for advice on what to include in the rationale and how to write it.\\nThe second question is looking for an easy and feasible topic for a cloud computing project, without any reference to conducting research abroad or writing a research rationale.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Are these paraphrases?\\nSaghar has written over 2,000 songs for many singers and music directors for Pakistani films , radio and TV .\\nFor many singers and music directors , Saghar has written more than 2,000 songs for Pakistani films , radio and TV .\\n\\nAssistant: Yes, these two sentences are paraphrases of each other. Here is the justification:\\n1. Bo" + }, + { + "comment": "These lines convey that Saghar has written over 2,000 songs for various platforms in Pakistan and the sentences are paraphrases of each other.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":49-50", + "content": "th sentences convey the same primary information: Saghar has written over 2,000 songs for various artists and platforms, specifically for Pakistani films, radio, and TV.\\n2. The sentences are structured differently, but they are semantically equivalent. 
The first sentence begins with Saghar as the subject and emphasizes the number of songs, while the second sentence begins with the intended audience of the songs (singers and music directors) and still mentions the impressive volume of Saghar's work. The different structures make them paraphrases of each other, not identical sentences.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Giovanni Marks (born October 2 1978) better known by his stage name Subtitle is a rapper and producer based in Los Angeles California. He is one half of the duo Lab Waste alongside Thavius Beck. " + }, + { + "comment": "This code is determining the category of a paragraph based on its content, specifically identifying that it belongs to the \"artist\" category due to mentioning the person's involvement in the music industry and collaborations with other artists.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":50-50", + "content": "He has also collaborated with other artists such as K-the-I??? Busdriver and Islands. Given a list of categories: company, educational institution, artist, athlete, office holder, mean of transportation, building, natural place, village, animal, plant, album, film or written work, what category does the paragraph belong to?\\nThe answer to this question is:\\n\\nAssistant: To determine the appropriate category for the paragraph, I will analyze its content step-by-step.\\n1. The paragraph introduces a person named Giovanni Marks, who has a stage name, Subtitle. This suggests that the person might be involved in the entertainment industry or arts.\\n2. It mentions that Subtitle is a rapper and producer based in Los Angeles, California. This confirms that the person belongs to the music industry, thus aligning with the artist category.\\n3. 
The paragraph cites his role in the duo Lab Waste and collaborations with other artists. These details further support the involvement of Subtitle in the mu" + }, + { + "comment": "The code is a JSONL file containing text and category information, where the text describes various activities Johnny did over a weekend. The AI assistant is asked to generate a long answer based on the provided question about what Johnny did after lunch on Sunday.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":50-51", + "content": "sic industry as an artist.\\nAfter analyzing the paragraph, I can confidently say that it belongs to the \\\"artist\\\" category. So, the answer to this question is: artist.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Given the question: Last week Johnny had a very busy weekend. On Saturday morning, he cleaned his room. That made his mother very happy. In the afternoon, he did his math homework. It was not difficult, so it only took him one hour to finish the homework. And in the evening, he went to visit his aunt with his parents. They had a big dinner there. The next morning, he went swimming after getting up. Usually he goes swimming twice a week. He loves it very much. After lunch he went to school playground and played football. That exercise makes him in a good health. On Sunday evening, he watched TV for an hour at home and then put the books in his schoolbag for the next day. What did Johnny do after lu" + }, + { + "comment": "This code snippet describes Johnny's activities on a busy weekend, including basketball, football, swimming, and rest.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":51-51", + "content": "nch on Sunday? A) He played basketball. B) He played football. C) He went swimming. 
D) He had a rest.\\nThe answer is:\\n\\nAssistant: B) He played football.\\nLast week Johnny experienced a busy weekend, engaging in various activities in his available time. On Saturday, he tackled cleaning his room in the morning\u2014a task that pleased his mother. Later on, in the afternoon, Johnny focused on completing his math homework. Fortunately, he found the work relatively easy, and as such, it only required an hour of his attention.\\nIn the evening, Johnny accompanied his parents on a trip to visit his aunt. While there, they all enjoyed a large dinner together. On Sunday, Johnny began his day with swimming, a sport he is particularly fond of, and typically participates in twice a week. Afterward, he returned home for lunch.\\nPost-lunch, Johnny took advantage of the school playground, engaging in a game of football. Engaging in such exercise not only promotes his overall good health but also aligns w" + }, + { + "comment": "Code explains a person's activities, like watching TV and organizing schoolbooks. \nArticle summarizes Trump appointing Chris Christie to chair his transition team, despite previous accusations, for substance and seriousness.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":51-52", + "content": "ith his recreational interests. Finally, on Sunday evening, Johnny spent an hour watching television before he proceeded to organize and pack his school books into his schoolbag in preparation for the following day.\\nGiven the presented options, it is clear that Johnny engaged in football after enjoying lunch on Sunday.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Summarize this article:\\nPhoto: AP \\n \\n Donald Trump has announced that New Jersey Governor Chris Christie will serve as chairman of his transition team, identifying personnel who would become part of a Trump White House. 
Christie will be working closely with Trump\u2019s son-in-law Jared Kushner, whose father he sent to prison. \\n \\n Just months ago, Trump accused Christie of having committed a federal crime in covering up the Bridgegate scandal. Now, NBC News reports, Christie will bring \u201csubstance\u201d and \u201cseriousness\u201d to the Trump campaign: \\n \\n " + }, + { + "comment": "This code is discussing the role of Chris Christie in Trump's growing organization, highlighting his discipline and connection with the establishment. It also mentions Jared Kushner's involvement in the transition team despite their past legal history.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":52-52", + "content": "Sources familiar with the planning say the transition team appointment shows \u201cthe role the governor is playing as the serious person around Trump who is trying to provide some discipline\u201d to his growing organization. \\n \\n Christie also continues to act as \u201ca conduit between the establishment and Trump\u201d based on Christie\u2019s relationships with Republican elected officials across the country, sources said. \\n \\n The New York Times reported last week that Kushner\u2014Trump\u2019s son-in-law and a real estate heir in his own right\u2014would be a part of the very forward-looking transition team. (President Obama didn\u2019t announce his Transition Team Chariman until the day after he won the 2008 election.) In 2005, as U.S. Attorney for the state of New Jersey, Christie successfully prosecuted Kushner\u2019s father, Charles, for \u201ccrimes of greed, power, and excess.\u201d \\n \\n There does not seem to be any bad blood between Christie and Jared Kushner, however: In November, New Jersey awarded $93 million in tax break" + }, + { + "comment": "The code discusses the Trump campaign directing press inquiries about the transition committee to Governor Christie's office. 
It also mentions Max Stier's opinion that if Christie is heading the transition effort, he cannot be active in the campaign. The passage also refers to Mitt Romney's approach in 2012 and raises uncertainty about Trump's plans for his own transition team role.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":52-52", + "content": "s to a Kushner Companies high-rise in Jersey City. \\n \\n Asked why the Trump campaign was directing press inquiries about the transition committee to the governor\u2019s office, a spokeswoman for Christie told Gawker, \u201cWe\u2019re trying to figure that out ourselves.\u201d ||||| If Christie is truly heading Trump\u2019s day-to-day transition effort, the answer to both questions has to be no, said Max Stier, who is chairman of the Partnership for Public Service and has been advising the campaigns on transition planning. \u201cIf Christie is going to really run the transition process, he can\u2019t effectively be active in the campaign or be himself a candidate because it\u2019s a full-time job,\u201d Stier said. In 2012, Mitt Romney tapped former Utah Governor Mike Leavitt to run his transition, and Leavitt kept his distance from the campaign while building out what he called a 600-person \u201cgovernment in miniature.\u201d It\u2019s not clear if Trump plans a similar operation. If Christie\u2019s role atop the transition team was more ceremoni" + }, + { + "comment": "Christie appointed as Trump's transition chief, potentially splitting time with campaign and governing New Jersey. 
Trump's announcement comes days after NYT reported Kushner leading transition efforts, but Christie seems independent from the campaign.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":52-52", + "content": "al, it might be possible for him to split time with the campaign, to stay in contention to be Trump\u2019s running mate\u2014or even to do the job he was elected to perform, governing the state of New Jersey. \\n \\n The announcement came just a few days after The New York Times reported that Trump had asked his son-in-law, Jared Kushner, to head up the transition efforts. Kushner was not mentioned in the press release announcing Christie\u2019s new job. If Monday was an early indication, Christie seems to be operating pretty independently from the campaign: A Trump spokeswoman directed questions about his role to Christie\u2019s gubernatorial office, which promptly referred those queries back to Trump\u2019s campaign. \\n \\n Hillary Clinton, the likely Democratic nominee, has not publicly announced a transition chief. And the fact that Trump made his own announcement so quickly is at least a momentary relief to those who wondered how seriously he would take the process. \u201cFirst and foremost, it is really import" + }, + { + "comment": "Trump selects Chris Christie as transition team chairman, signifying campaign seriousness.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":52-52", + "content": "ant that Trump has publicly named a transition chairman,\u201d Stier said. \u201cTrump is demonstrating his understanding that he needs to prepare for the possibility that he will be governing at the same time as he is trying to win the campaign.\u201d \\n \\n We want to hear what you think about this article. Submit a letter to the editor or write to letters@theatlantic.com. 
|||||\\n\\nAssistant: Donald Trump has selected New Jersey Governor Chris Christie to serve as chairman of his transition team, which means he will be tasked with identifying personnel to become part of a Trump White House. This appointment is seen as bringing \\\"substance\\\" and \\\"seriousness\\\" to Trump's campaign. The role demonstrates that Christie is seen as the \\\"serious person\\\" around Trump who is trying to provide some discipline to his growing organization. Christie will be working with Trump's son-in-law Jared Kushner, despite the fact that Christie sent Kushner's father to prison in 2005. The appointment comes just a few d" + }, + { + "comment": "The code demonstrates Trump's preparation for possible governance while still campaigning, and introduces Melody C. Barnes's role in President Obama's administration. It also highlights her previous work at the Center for American Progress and her involvement in the Obama campaign.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":52-53", + "content": "ays after it was reported that Trump had asked Kushner to head up the transition efforts, and demonstrates Trump's belief that he must prepare for the possibility he will be governing at the same time as he is trying to win the campaign.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Please answer the following question: Melody Barnes - Melody C. Barnes (b. April 29 1964 Richmond Virginia) was chosen by President Barack Obama to serve as the Director of the Domestic Policy Council for his administration. Obama announced her appointment on November 24 2008. She was previously Executive Vice President for Policy at the Center for American Progress and led the CAP's Faith and Progressive Policy Initiative. 
She left the CAP in June 2008 to work on the Obama campaign. Given a choice of categories company, educational instituti" + }, + { + "comment": "The code appears to be related to an AI system, possibly a chatbot or question answering model, handling questions about different categories of subjects such as artist, athlete, office holder, etc. The provided snippet seems to handle the category \"office holder\" by providing information about Melody C. Barnes and her roles in various administrations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":53-54", + "content": "on, artist, athlete, office holder, mean of transportation, building, natural place, village, animal, plant, album, film or written work, the text refers to which one?\\nAnswer:\\n\\nAssistant: The text refers to an \\\"office holder,\\\" as Melody C. Barnes is described as having served in various roles, such as the Director of the Domestic Policy Council for President Barack Obama's administration as well as Executive Vice President for Policy at the Center for American Progress.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: There's always next year \\n \\n 21 Mar 14 \\n \\n After 25 games, the billion dollar bracket dream ends \\n \\n Just 25 games into the NCAA tournament, the billion dollar bracket bubble has already burst. \\n \\n Going into the third wave of games on the second day of the NCAA tournament, only three perfect b" + }, + { + "comment": "This code discusses a billion dollar bracket contest related to basketball games, mentioning the elimination of perfect brackets and the impact on Warren Buffett's sponsorship. 
The odds of picking all 63 games correctly are extremely low, one-in-9.2 quintillion according to a DePaul University math professor.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":54-54", + "content": "rackets remained in the Quicken Loans billion dollar bracket contest. And all three had George Washington beating Memphis. \\n \\n As Maurice Creek\u2019s game-tying three came up short, so went the retirement plans of the three remaining hopefuls. \\n \\n Memphis win means that zero perfect brackets remain in the #BillionDollarBracket \u2013 Warren Buffett and @cavsdan are celebrating \u2014 Daryl Ruiter (@RuiterWrongFAN) March 22, 2014 \\n \\n Billionaire Warren Buffett couldn\u2019t have picked a better year to sponsor such a contest. Five double-digit seeded teams have won games with four games remaining in the second round. \\n \\n Even without a particularly mad March, the odds of picking all 63 games correctly are long. The chances of doing so was tabulated by a DePaul University math professor to be one-in-9.2 quintillion. \\n \\n The prospect of someone taking home the billion dollar prize took a big hit after the very first game of the tournament. About 84 percent of contestants incorrectly picked Oh" + }, + { + "comment": "Code snippet describes an upset in the NCAA college basketball tournament where Mercer defeated Duke. It also mentions that over 97% of contestants picked Duke to win, while the top-20 brackets will receive $100,000.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":54-54", + "content": "io State to beat Dayton in the opener. \\n \\n The biggest upset, by far, was Mercer\u2019s win over Duke. Over 97 percent of contestants picked the Blue Devils. \\n \\n While the billion dollar dream is over, the top-20 brackets will receive $100,000. 
||||| Mercer forward Jakob Gollon (20) and guard Langston Hall (21) celebrate after defeating Duke 78-71 in an NCAA college basketball second-round game against Duke, Friday, March 21, 2014, in Raleigh, N.C.(AP... (Associated Press) \\n \\n Stanford head coach Johnny Dawkins yells to his players during the second half of a second-round game in the NCAA college basketball tournament, Friday, March 21, 2014, in St. Louis. Stanford won the... (Associated Press) \\n \\n Memphis guard Chris Crawford (3) reacts to a Mercer goal against George Washington during the second half of an NCAA college basketball second-round tournament game, Friday, March 21, 2014, in Raleigh.... (Associated Press) \\n \\n Gonzaga guard David Stockton pulls in a loose ball dur" + }, + { + "comment": "This code appears to be a news article extract about an upset in the NCAA men's college basketball tournament, which resulted in no one having a perfect bracket for Warren Buffett's $1 billion challenge.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":54-54", + "content": "ing the second half in a second-round game against Oklahoma State in the NCAA men's college basketball tournament Friday, March 21, 2014, in San Diego.... (Associated Press) \\n \\n A second day of upsets ended any chance of someone having a perfect NCAA tournament bracket in Warren Buffet's $1 billion challenge. It was a favorite that provided the first blemish on the final three people's brackets in the Quicken Loans contest on the Yahoo Sports website. \\n \\n All three had ninth-seeded George Washington beating Memphis. The Tigers won 71-66. \\n \\n \\\"If Warren Buffett wants to donate the (billion) to our university, we will take it and use it in good company,\\\" Memphis coach Josh Pastner said. \\\"We'll find a way.\\\" \\n \\n It only took 25 games for everyone to be eliminated. Then again most of brackets were knocked out on the tournament's first full day. 
The number of unblemished brackets kept dwindling after third-seed Duke, sixth-seed UMass and seventh-seed New Mexico lost Friday. \\" + }, + { + "comment": "The code describes the progression of a bracket challenge in which only 16 people remained perfect after certain games, eventually leaving six with a chance, and ultimately no one won the $1 billion grand prize but the top 20 scores still received $100,000.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":54-54", + "content": "n \\n Only 16 people remained perfect after 10th-seeded Stanford topped New Mexico. Then Tennessee routed UMass, leaving only six people with a chance of beating the 9.2 quintillion-to-1 odds. \\n \\n Gonzaga's victory over Oklahoma State cut that down to the final three. \\n \\n Even though no one won the $1 billion, the top 20 scores will still each get $100,000. \\n \\n Quicken Loans, which is sponsoring and insuring the Buffet contest, said on its Twitter feed that it wouldn't reveal the number of entrants to the challenge. The pool was supposed to be capped at 15 million entries. It probably wouldn't have mattered if they had let more people join. \\n \\n At CBSSports.com, only 0.03 percent of entrants were still perfect after Mercer upset Duke. They didn't last much longer as Tennessee's rout of UMass wiped out all the remaining unblemished entries. It took 21 games to end everyone's hope of perfection this year. Last season it took 23 games and 24 in 2012. \\n \\n A year ago, not a s" + }, + { + "comment": "Code describes the challenges and outcomes of a billion-dollar NCAA tournament bracket competition in 2014. It highlights that no one achieved a perfect bracket, but top scorers still received $100,000. 
The task for an AI assistant could be to analyze similar sports competitions and provide insights on their performance metrics or potential improvements.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":54-55", + "content": "ingle person of the 11 million who entered on ESPN's website was perfect after a first day filled with upsets. Just four got 15 out of 16 right. \\n \\n This year people lasted a little longer. After 28 games, all 11 million entries had at least one mistake. \\n \\n ___ \\n \\n Follow Doug on Twitter at http://www.twitter.com/dougfeinberg |||||\\nWrite a summary.\\n\\nAssistant: In 2014, the billion-dollar bracket dream for the NCAA tournament ended just 25 games into the competition. Only three perfect brackets remained before the third wave of games, and all three had George Washington beating Memphis. Unfortunately, as Maurice Creek's game-tying three missed, so did the billion-dollar hopes. Despite no one winning the ultimate prize, the top 20 scores received $100,000. The odds of correctly picking 63 games were calculated to be one-in-9.2 quintillion by a DePaul University math professor.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete" + }, + { + "comment": "Storage location: \"RWKV-LM/RWKV-v5/demo.jsonl\":55-55\nCode:\n```\nUser shares a letter about school important dates, admission, uniforms and policy.\n```", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":55-55", + "content": " the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Article: Dear All, Happy New Year! A list of important dates for this term is attached to this letter. Interviews for our April and September Primary One classes will take place on Tuesday and Wednesday,18th&19th January.Now is the time to apply for admission to Peak School. 
Children are expected to wear their uniforms this term.As is usual at this time of the year,mornings at the school are quite cold and l would suggest children wear their sweaters to school.As the temperature goes up during the day,children tend to take off their outer clothes,so it is important that all sweatshirts are labeled . I have received a few requests from parents asking if their children can wear summer uniforms this term.I am of the view that all children in the school should follow the school policy and be dressed in the appropriate uniforms.In the longer term,a number of US at school have qu" + }, + { + "comment": "The school has dark green uniforms that may not be visible to motorists during foggy mornings. Parents have raised concerns, and a Uniform Committee will be set up to address the issue.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":55-55", + "content": "estioned the need to have a winter and summer uniform.Perhaps.with careful consideration,we can develop a uniform that will serve children for the whole academic year. As the foggy mornings start to appear,a number of parents have raised concerns over the safety of those children who walk to school. Our dark green uniforms are not particularly visible to motorists.Hopefully, the \\\"Uniform Committee\\\" to be set up this term will deal with this particular issue.I will keep parents informed of the channels through which views may be expressed in future newsletters. We still have a \\\"No Hat,No Play\\\" rule on the playground,so children need to ensure that they have their sunhats in school at all times. Yours sincerely, P D Tunesi Principal Question: What do we know about the present school uniforms? 
Yes or no, is the answer \\\"The school is considering changing them.\\\"?\\n\\nAssistant: From the article, we know that the present school uniforms include separate winter and summer uniforms, and " + }, + { + "comment": "The code appears to be extracting and displaying information from a JSON file. It seems to provide details about a school's consideration of changing its uniforms, mentioning the concerns of parents and possible solutions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":55-56", + "content": "that the winter ones are dark green in color. Some parents have questioned the need for both winter and summer uniforms and have suggested developing a uniform that can be worn the entire academic year. Additionally, there are concerns about the visibility of the dark green uniforms for children walking to school in foggy mornings.\\nThe answer is \\\"Yes, the school is considering changing them,\\\" as the principal mentions that a \\\"Uniform Committee\\\" will be set up to deal with the issue of visibility, and that there is a possibility of developing a uniform to serve for the whole academic year after careful consideration.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Answer the following question: Title: another g-shock for me Review: This is probably my third G-shock over 25 years. A good worker's watch. I also think it is a nice looking \\\"rounded appearance\\\" every day watch. Years ago they were square. Thi" + }, + { + "comment": "Review conveys a positive sentiment towards G-shock watch due to long-term use, durability, and value for money.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":56-56", + "content": "s is the only watch I wear. My first two watches never stoped running, just the rubber/plastic casing gets broken up over the years from contact with varius items I'm working on. 
The battery lasts 4-5 years if you dont use the light feature button much. The bands also need replacing over the years with heavy use. The three alarms I never use (high tones) because of my hearing loss. Although they claim water resistant I don't take it in the water. One did leak on me once but after it dried out it continued to work. Less than $50.00 including tax and shipping. Does this product review convey a negative or positive sentiment?\\nAnswer:\\n\\nAssistant: The product review conveys a predominantly positive sentiment towards the G-shock watch. The reviewer shares their long-term experience with the product, stating that this is their third G-shock over a period of 25 years, which implies a sense of loyalty and satisfaction with the brand. Additionally, the reviewer mentions the watch's durability" + }, + { + "comment": "The code represents a review for a watch, mentioning its positive aspects like battery life and design, as well as negative ones such as rubber casing damage and water resistance issues. The overall sentiment is positive, with the user being content but acknowledging some flaws.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":56-56", + "content": ", as the first two watches never stopped running and the battery life can last 4-5 years if the light feature button is not used frequently. The watch's design and rounded appearance are also considered as appealing for an everyday watch.\\nHowever, there are a few negative remarks mentioned as well. The reviewer notes that the rubber/plastic casing tends to get damaged, and the band needs to be replaced after years of heavy use. Furthermore, they reveal their disappointment in the watch's alarm feature, as the high tones cannot be heard due to their hearing loss. 
Lastly, the reviewer shares a concern related to water resistance, recounting an incident when the watch leaked, but acknowledges it continued working after drying out.\\nOverall, the sentiment of the review is positive, as the user appears to be content with the watch's performance and design while acknowledging a few flaws. The fact that the reviewer continues to purchase G-shock watches after 25 years is a strong indication of their satisfaction with the product.\"}" + }, + { + "comment": "Code snippet describes a private security jeep patrolling a residential area in Johannesburg, South Africa, manned by trained officers and carrying semi-automatic weapons. The managing director emphasizes the deterrent aspect of their presence to prevent crime.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-57", + "content": "{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Write some highlights for the following article:\\nManned by a pair of trained officers, a large private security jeep moves slowly past the high walls lining a residential street in the suburbs of Johannesburg, South Africa. Heavily armed and alert, the officers scan the designated area, looking for any suspicious activity. They work for CSS Tactical, a private security company hired by local residents to patrol their area and protect them from any potential criminals. \\\"To be proactive, we need to be noticed,\\\" says Ricky Croock, managing director of CSS Tactical. \\\"We need people to say 'hold on, there is a dedicated tactical vehicle.' \\\"A tactical vehicle is a bigger vehicle -- it is more aggressive, it's a deterrent. So the first point is we want to deter as opposed to catch crime.\\\" The security officers are well armed. 
They carry semi-automatic weapons and h" + }, + { + "comment": "This code discusses crime rates in South Africa, mentioning murder, sexual offenses, and aggravated robbery. It highlights efforts to reduce crime, but notes the discrepancy between expected reduction and actual outcomes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-57", + "content": "and guns and they're trained in how and when to use them. And while only the police have the power of random stop and search in public places, security companies also stop people they regard as suspicious. From April 2011 to March 2012 there were 15,609 murders in South Africa, as well as 64,514 sexual offenses and 101,203 cases of aggravated robbery. According to the country's latest \\\"Victims of crime survey,\\\" 57% of respondents felt that housebreaking/burglary was the crime most feared in their area of residence. In recent years, South Africa has stepped up efforts to tackle crime, one of the country's worst social ills. Last September, the South African Police Service said that serious crime in the country had been reduced by 31.8% from 2004/5 to 2011/12. But 2011 to 2012 saw a reduction in serious crime of just 1.9%. \\\"We only have the official crime statistics to go by,\\\" says Rudolph Zinn of the Institute for Security Studies in South Africa. \\\"If you look at what the police wo" + }, + { + "comment": "The provided code is a text excerpt discussing crime trends in South Africa, emphasizing the high percentage of violent or contact crime. It also highlights the growth of the country's private security industry due to weak policing and the presence of individuals with military backgrounds in companies like Protea Co.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-57", + "content": "uld report annually in September of every year, we see over the last three or four years that crime has leveled off. 
\\\"The more concerning factor is the high percentage of so-called violent or contact crime, which is about a third of all the crime reported to the South African Police Service.\\\" And that's what has fueled the boom in the country's private security industry, crime experts say. There are nearly 9,000 companies and 400,000 registered active private security guards. That's more than the police and army combined, according to South African officials. \\\"The security industry is bigger than what it has ever been in South Africa,\\\" says Zinn. \\\"I think the growth in the industry is definitely attributed to the fact that, let's call it a weak policing or ineffective policing, and it created the opportunity for private individuals to move into the market.\\\" Some of those individuals have a military background. Waal de Waal, chief operating officer at South African-owned Protea Co" + }, + { + "comment": "Code snippet discusses the role of Protea Coin, a private security company in South Africa, and their difference from police. The company deals with small-scale crimes, offers various services, and faces government concerns over national security threats.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-57", + "content": "in, spent 18 years in the special forces and military intelligence before joining the private security sector a decade ago. He is today one of the 17,000 people employed by Protea Coin. Part of the multi-million dollar company's work is dealing with small-scale crime, like recovering fuel stolen from a petrol station. The company, which also provides security at many of South Africa's mines and has been kept busy by the recent strikes in the sector, offers a range of services, including 24-hour property monitoring and protection against organized criminal syndicates. 
\\\"The role of private security companies is distinctively different to that of the police,\\\" says Petrus Van Niekerk, the chief executive of Protea Coin. \\\"We view our role as private security to really aid and support the police.\\\" But what happens when private security companies take matters into their own hands? The South African government claims that it's an industry that threatens national security and is determined " + }, + { + "comment": "The code discusses the potential danger of having a security industry unregulated, with the South African government considering measures like requiring private security firms to be owned by South Africans. This could impact foreign direct investment and might not solve the issues related to professionalism in the private security sector.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-57", + "content": "to tighten its regulation. \\\"A very dangerous situation arises if you have a security industry that outstrips both your police and your army and there is completely no regulation of that industry,\\\" says Jenni Irish-Qhobosheane, the head of the Civilian Secretariat for Police. \\\"I think the public can find themselves in situations where, basically, they are put at the mercy of private security companies who are unprofessional,\\\" she adds. One measure considered by the country's government is that private security firms must be owned by South Africans. \\\"This type of move is going to frighten away foreign direct investment and I think that is a danger,\\\" says Hanes Venter, sales director at the South African arm of G4S, a London-listed global security giant. 
\\\"Foreign direct investment is already down year on year but this is a worry from that perspective for me.\\\" Whatever regulations the South African government eventually introduces, it seems unlikely the demand for private securitie" + }, + { + "comment": "Code discusses the growth of private security industry in South Africa due to high crime rates, inefficiency of local police, and mentions CSS Tactical as an example. Over 9,000 security companies exist with 400,000 registered guards, which exceeds the combined count of police and army, raising concerns from government officials.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-57", + "content": "s companies will diminish amid high unemployment rates, as well as widespread poverty and simmering industrial tensions.\\n\\nAssistant: A growing private security industry in South Africa has emerged due to high rates of violent crimes and perceived inefficiency of local police. CSS Tactical, a private security company, is one example of a firm now hired by Johannesburg residents to patrol their neighborhoods as they aim to deter crime. Recent statistics showed that in 2011 to 2012, South Africa experienced 15,609 murders, 64,514 sexual offenses, and 101,203 cases of aggravated robbery. The escalation of crime in the nation has driven the demand for private security companies.\\nSouth Africa currently has nearly 9,000 security companies, with 400,000 registered active private security guards in total. This number is larger than the combined number of the police and army, according to South African officials. The government claims that this expanding industry presents a threat to national" + }, + { + "comment": "The code snippet discusses a proposed measure for private security firms in South Africa to be owned by South Africans. The demand for these firms is unlikely to decrease due to unemployment, poverty, and ongoing tensions. 
This code also mentions \"Bad Grandpa,\" which is reminiscent of the 2006 British-American mockumentary comedy film \"Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan.\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":57-58", + "content": " security and intends to tighten its regulation. One proposed measure is that private security firms must be owned by South Africans to limit potential manipulation by foreign entities. Although crime rates have seen a reduction in recent years, it is unlikely that the demand for private security firms will decrease due to high unemployment and poverty rates, as well as ongoing industrial tensions.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Please answer the following question: Combine facts and answer this: Which 2006 British-American mockumentary comedy film was Bad Grandpa a reminiscent?\\nAnswer:\\n\\nAssistant: The 2006 British-American mockumentary comedy film that \\\"Bad Grandpa\\\" is reminiscent of is \\\"Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan.\\\" Directed by Larry Charles and starring Sacha Baron Cohen as the titular character, Borat Sagdiyev," + }, + { + "comment": "The code is providing a description of the movie \"Borat\" and an explanation for translating text from one language to another, specifically from English to Russian. The description highlights that \"Borat\" combines scripted scenes with unscripted interactions, leading to amusing and revealing moments. 
The AI assistant is presented as knowledgeable in multiple languages, including the ability to translate between them.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":58-60", + "content": " this film follows the journey of a Kazakh journalist as he travels across the United States to learn about American culture, customs, and lifestyle. Like \\\"Bad Grandpa,\\\" \\\"Borat\\\" also combines scripted scenes with unscripted interactions with real people, resulting in shocking, hilarious, and cringe-worthy moments that reveal the prejudices and peculiarities of the everyday people Borat encounters.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Translate to Russian:\\nAt the left top there is the current set of tiles used.\\n\\nAssistant: \u041d\u0430 \u0432\u0435\u0440\u0445\u0443 \u0441\u043b\u0435\u0432\u0430 \u043d\u0430\u0445\u043e\u0434\u0438\u0442\u0441\u044f \u0442\u0435\u043a\u0443\u0449\u0438\u0439 \u043d\u0430\u0431\u043e\u0440 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c\u044b\u0445 \u043f\u043b\u0438\u0442\u043e\u043a. \u0415\u0441\u043b\u0438 \u043e\u0431\u044a\u044f\u0441\u043d\u044f\u0442\u044c \u043a\u0430\u043a \u0434\u043b\u044f \u043f\u044f\u0442\u0438\u043b\u0435\u0442\u043d\u0435\u0433\u043e: \u043f\u0440\u0435\u0434\u0441\u0442\u0430\u0432\u044c, \u0447\u0442\u043e \u044d\u0442\u043e \u043a\u0430\u0440\u0442\u0438\u043d\u043a\u0430 \u0441 \u0440\u0430\u0437\u043d\u043e\u0446\u0432\u0435\u0442\u043d\u044b\u043c\u0438 \u043a\u0432\u0430\u0434\u0440\u0430\u0442\u0438\u043a\u0430\u043c\u0438 \u0441\u0432\u0435\u0440\u0445\u0443 \u0441\u043b\u0435\u0432\u0430. 
\u042d\u0442\u0438 \u043a\u0432\u0430\u0434\u0440\u0430\u0442\u0438\u043a\u0438 - \u044d\u0442\u043e \u043d\u0430\u0448\u0438 \u043f\u043b\u0438\u0442\u043a\u0438, \u043a\u043e\u0442\u043e\u0440\u044b\u0435 \u043c\u044b \u0441\u0435\u0439\u0447\u0430\u0441 \u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u043c.\"}\n{\"text\": \"System: You are an AI assistant, who knows every language and how to translate one language to another. Given a task, you " + }, + { + "comment": "Code chunk describes a task where given a statement, genre, and label (entailment, contradiction, or neutral), create a sentence that follows the tone with respect to the statement and belongs to the specified genre. In this case, the example demonstrates an entailment for the fiction genre.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":60-61", + "content": "explain in simple steps what the task is asking, any guidelines that it provides. You solve the task and show how you used the guidelines to solve the task.\\n\\nUser: In this task, you're given a statement, the genre to which that statement belongs, and a label indicating if the statement should be agreed with (entailment), disagreed with (contradiction), or neither (neutral). Your job is to write a sentence that describes the genre that follows the tone with respect to the statement, as indicated by the label. If sentence X agrees with sentence Y, the can be concluded from one another. If sentence X disagrees with sentence Y, they can not be correct at the same time. 
The sentence must also belong to the genre specified.\\nQ: Statement: He'd been unconscious long enough for them to have gathered any amount they wanted.\\nLabel: entailment.\\nGenre: fiction.\\nA:\\n\\nAssistant: As a result of his prolonged unconsciousness, they had plenty of time to collect whatever they desired.\"}\n{\"text\": \"U" + }, + { + "comment": "The code is providing information about the Family and Medical Leave Act (FMLA) in response to a user's question regarding its eligibility criteria. The AI assistant helps clarify the purpose of FMLA and provides necessary details on the employee requirements for being eligible for leave under this act.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":61-62", + "content": "ser: Jesse had to get everyone's coffee order. Jesse left that coffee shop with the orders. Given that the answer to a question is \\\"know what everyone wants\\\", what is the question?\\n\\nAssistant: What does Jesse need to do in order to successfully complete his task at the coffee shop?\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Question: what does the family leave act? Would \\\"In order to be eligible for FMLA leave, an employee must have been at the business at least 12 months, and worked at least 1,250 hours over the past 12 months, and work at a location where the company employs 50 or more employees within 75 miles.\\\" be a reasonable answer?\\n\\nAssistant: The Family and Medical Leave Act (FMLA) is a United States labor law that provides certain employees with up to 12 weeks of unpaid, job-protected leave per year for specific family and medical reasons. It was signed into law on February 5, 1993. The purpose" + }, + { + "comment": "The code provides an example of a system that can provide information on the eligibility requirements for FMLA leave. 
It correctly identifies the criteria an employee must meet to be eligible for FMLA leave, including working at least 12 months with a minimum of 1,250 hours and working at a location", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":62-63", + "content": " of the FMLA is to offer employees job protection and a balance between work and personal life during certain life events that require time away from work.\\nYour answer is correct in detailing the eligibility requirements for an employee to qualify for FMLA. Here is a rephrased version:\\n\\\"In order to be eligible for FMLA leave, an employee must have worked for the company for at least 12 months, clocking a minimum of 1,250 hours within that time frame. Additionally, the employee must work at a company location with 50 or more employees within a 75-mile radius.\\\"\\nThis accurately summarizes the main criteria an employee needs to meet to be eligible for FMLA leave.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: On what date was the 2016 studio album which had the song \\\"Freedom\\\" released?\\nA:\\n\\nAssistant: Step 1: Identify" + }, + { + "comment": "User is asking about the next step in joining a sorority and code mentions understanding the Greek system and the National Panhellenic Conference (NPC) with 26 member organizations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":63-64", + "content": " the album containing the song \\\"Freedom.\\\"\\nStep 2: Find out who the artist is for the song, and the album.\\nStep 3: Search for the release date of the album.\\nThe song \\\"Freedom\\\" is from the album \\\"Lemonade\\\" by Beyonc\u00e9. 
\\nNow that we have identified the album and artist, I will find the release date for the \\\"Lemonade\\\" album.\\n\\\"Lemonade\\\" by Beyonc\u00e9 was released on April 23, 2016.\\nSo the answer is A: April 23, 2016.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: What most naturally follows?\\nHow to rush a sorority \\nUnderstand the greek system. \\nThe major female greek system is governed by the national panhellenic conference (npc). The npc is made up of 26 member organizations.\\n\\nAssistant: So, after learning about the Greek system and the National Panhellenic Conference (NPC), which has 26 member organizations, the next step is to find out which sororities are on your college campus. T" + }, + { + "comment": "Explaining sorority joining process to someone like they're a five-year-old. Assisting with an answer about who played Alfred in Miracle on 34th Street and its context. Providing information about the movie and character Alfred.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":64-66", + "content": "hen, you can attend their events and get to know the members to see if you fit in. Once you find a sorority you like, you will participate in the rushing and recruitment process, which is like a big party where the sororities and potential new members meet each other. By doing all these steps, you have a better chance of joining the sorority of your choice. Remember, it's like finding a group of friends who share the same interest as you!\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: What is the answer to this question? who played alfred in miracle on 34th street?\\n\\nAssistant: The actor who played Alfred in Miracle on 34th Street is Alvin Greenman. He acted in the movie 1947 version. 
Alfred is a nice man who works at the same store as Santa Claus and helps him bring happiness to the people during the Christmas season.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely" + }, + { + "comment": "Code snippet demonstrates an AI answering user questions, providing information and clarification on geographical locations and cultural data.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":66-68", + "content": " well. Help as much as you can.\\n\\nUser: Question: Question: what state is jakarta in? I found the following answer on Google: The city is the seat of the ASEAN Secretariat. Is that a correct answer? Yes or no.\\nAnswer:\\n\\nAssistant: No. Jakarta is the capital city of Indonesia and is located in the Special Capital Region of Jakarta, not a state.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Data: United States, ethnicGroup, Asian Americans; United States, language, English language; A Severed Wasp, country, United States\\nSentence about the following data:\\n\\nAssistant: In the United States, there are many Asian Americans, which is a group of people from different Asian countries. Almost everyone in the United States speaks the English language, and one story from this country is called \\\"A Severed Wasp.\\\"\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a" + }, + { + "comment": "Explanation: The code provides instructions on how to perform a 180-degree skateboard trick, emphasizing the importance of mastering the ollie and understanding one's skateboarding stance (regular or goofy). 
It also mentions that the back foot should be positioned on the tail of the board for performing the 180.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":68-68", + "content": " detailed and long answer.\\n\\nUser: Summarize this article in one sentence.\\nIf you\u2019re new to doing a 180 on your skateboard or haven\u2019t done the trick in a while it\u2019s a good idea to get your body used to being in the proper position. Practice skating around for a bit by pushing off and then situating your body into the right position. The ollie is the most fundamental skateboard trick and it\u2019s incorporated into pretty much every trick you\u2019ll do. And the 180 is an ollie where you turn your board and body 180-degrees. For the 180 you\u2019ll be turning around either behind you for a backside 180, or in front of you for a frontside 180. Once you have the ollie down, you\u2019re well on your way to doing a 180 skateboard trick! If you ride \u201cregular\u201d you will have your right foot in the back positioned on the tail of the board. If you ride \u201cgoofy\u201d then your left foot will be in back on the tail. You want to position your back foot on the tip of the tail. To do a 180 on the skateboard, you\u2019ll want to" + }, + { + "comment": "Explaining how to position feet and shoulders for an ollie, emphasizing the importance of upper body movement for turning in a backside 180.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":68-68", + "content": " position your back foot on the corner of the tail. For a straight ollie your front foot will be centered about halfway up the board or slightly back. You also want to put more pressure on the outer edge of your foot (the side your pinky toe is on). To help you with turning, plant your front foot so its slightly hanging over the opposite edge of board from your back foot. 
This may feel awkward but it will help to keep your board balanced when you\u2019re in the air. Your shoulders and your upper body is where most of the work will come from when you perform your 180. By setting your shoulders correctly you will be able to turn your body quickly enough and this should allow your legs and the board to follow. You might be tempted to try and force the board to turn using mostly your legs, but you won\u2019t get enough torque this way. For a backside 180, position your front shoulder slightly in front of your chest and dip it down so when you pull up you have more momentum to work with. For a fron" + }, + { + "comment": "The code discusses the process of performing a 180 skateboard trick, focusing on the importance of mastering the ollie, correct foot positioning, and utilizing upper body momentum for turning.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":68-69", + "content": "tside 180, you want to position back shoulder in front of your chest and dip it down as well so you can pull up and create momentum. You don\u2019t want too much of a bend so that you end up crouching too low as that won\u2019t allow you to pop high enough. You should stand with relaxed knees like you\u2019re getting ready to hop slightly. If you are bending correctly you should be able to slide your front forward up the board with ease.\\n\\nAssistant: This article explains the process of performing a 180 skateboard trick, emphasizing the importance of mastering the ollie, proper foot positioning, and using the upper body for turning momentum.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. 
You must generate a detailed and long answer.\\n\\nUser: Briefly summarize this sentence: new zealand army engineers rebuilding iraq have been confined to their base by escalating violence in the country and prime minister helen clark said tuesday they may leave if the unrest continues .\\n\\nAssi" + }, + { + "comment": "This code is related to story completion tasks using AI, where the model predicts an appropriate last sentence for a given set of sentences. The input consists of a four-sentence story and the task requires generating a coherent continuation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":69-70", + "content": "stant: The New Zealand Army engineers who have been working on rebuilding efforts in Iraq are currently confined to their base due to a significant increase in violence within the country. Prime Minister Helen Clark mentioned on Tuesday that if this ongoing unrest persists, it may result in the withdrawal of these engineers from Iraq.\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: You will be given a definition of a task first, then some input of the task.\\nIn this task, you're given a four sentences of story written in natural language. Your job is to complete end part of the story by predicting appropriate last sentence which is coherent with the given sentences.\\nSentence1: Regrettably, our kitty passed away two weeks ago. Sentence2: She was a young cat, and we didn't know why she died. Sentence3: We found her on the floor when we woke up in the morning. Sentence4: My husband buried her in the backyard.\\nOutput:\\n\\nAssistant: We planted a small tree next to her grave to keep her memory alive.\"}" + }, + { + "comment": "This code is part of an AI assistant program tasked with reading a text, determining the truth of a given sentence related to that text, and justifying the steps taken in the process. 
In this case, the sentence being analyzed concerns whether St Estevam contains many giraffes on the island.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":71-71", + "content": "{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Read the text and determine if the sentence is true:\\nSt Estevam is an Island in Ilhas, Goa, India. St Estevam is known also 'Juvem' and in the past was known as 'Shakecho Juvo' - the isle of vegetables - known for its long, seven-ridged, light green ladyfingers. Therefore the people of Juvem came to be nicknamed 'bhende'.\\nSentence: St Estevam contains many giraffes on the island.\\n\\nAssistant: Step 1: Read the text carefully\\nThe given text describes St Estevam, an island in Ilhas, Goa, India. It mentions its alternate names as \\\"Juvem\\\" and \\\"Shakecho Juvo,\\\" which means the isle of vegetables. The text also mentions that St Estevam is known for its long, seven-ridged, light green ladyfingers and that the people of Juvem are nicknamed 'bhende'.\\nStep 2: Analyze the given sen" + }, + { + "comment": "This code compares a given statement to the information provided in a text and determines if the sentence is true based on that information. In this specific case, it analyzes a restaurant named \"The Phoenix\" with attributes such as name, food type, customer rating, and area. 
The assistant then generates a detailed sentence about the restaurant's situation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":71-72", + "content": "tence\\nThe given sentence states that St Estevam contains many giraffes on the island.\\nStep 3: Compare the statement to the information provided in the text\\nThe text describes the location, alternate names, the type of vegetation, and a nickname for the residents of St Estevam. However, there is no mention of the presence of giraffes on the island.\\nStep 4: Determine if the sentence is true\\nSince the text does not contain any information about the presence of giraffes on St Estevam, we cannot determine the given sentence to be true based on the information provided.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Attributes: name = The Phoenix, food = Chinese, customer rating = low, area = riverside. Produce a detailed sentence about this restaurant.\\n\\nAssistant: The Phoenix, a riverside Chinese restaurant offering a diverse selection of authentic cuisine, finds itself struggling to improve its low custome" + }, + { + "comment": "This code contains three different sentences, each representing a different task or thought. The first sentence is about a restaurant that aims to improve its rating and dining experience for patrons. The second sentence explains Bandeja Paisa dish, ingredients, and the lemon's family in terms that a five-year-old could understand. The third sentence is about an AI assistant given a task to read a text and determine if a statement within it is true. 
The key focus should be on understanding and completing each individual task faithfully.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":72-74", + "content": "r rating despite its idyllic location, and seeks to enhance the dining experience for its patrons to gain better recognition within the community.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Data: Bandeja paisa, ingredient, Lemon; Lemon, family, Rutaceae\\nSentence about the following data:\\n\\nAssistant: Bandeja Paisa is a yummy dish that uses a lot of ingredients, and one of them can be lemon. Lemons are a type of fruit that comes from a family of plants called Rutaceae. It's like how your family has a last name, fruits have families too!\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Read the text and determine if the sentence is true:\\nI say to the government, to the people of this country and certainly to members of parliament " + }, + { + "comment": "Analyzing sentence and breaking down key statements to determine if it aligns with the provided text.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":74-74", + "content": "that it is our duty and responsibility to ensure that we are not led down the garden path by ideologies put forward by extremely passionate interventionists, if I can use the term used by the member for Broadview-Greenwood.\\nSentence: I don't talk to the government.\\n\\nAssistant: To properly analyze the sentence, I will follow these steps:\\n1. Read the text carefully.\\n2. Break it down into key statements.\\n3. Examine the relationship between the statements and the sentence in question.\\n4. 
Determine if the sentence is true or false based on the information in the text.\\nStep 1: Read the text carefully:\\nI say to the government, to the people of this country and certainly to members of parliament that it is our duty and responsibility to ensure that we are not led down the garden path by ideologies put forward by extremely passionate interventionists, if I can use the term used by the member for Broadview-Greenwood.\\nStep 2: Break it down into key statements:\\n- The speaker is addressi" + }, + { + "comment": "Analyzing the code, it appears to be a process of text-based verification. The steps mentioned are likely for identifying contradictions in statements and determining whether a given sentence is true or false based on the information provided in the text.\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":74-75", + "content": "ng the government, the people, and members of parliament.\\n- They believe it is their duty and responsibility to protect against misleading ideologies.\\n- They reference a term used by a member for Broadview-Greenwood.\\nStep 3: Examine the relationship between the statements and the sentence in question:\\nThe sentence in question is: I don't talk to the government.\\nIn the text, the speaker is addressing the government alongside the people and the members of the parliament. This directly contradicts the sentence in question.\\nStep 4: Determine if the sentence is true or false based on the information in the text:\\nBased on the information from the text, the sentence \\\"I don't talk to the government\\\" is false, as the speaker is explicitly addressing the government in their statement.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n" + }, + { + "comment": "User requests a sentence containing given words, and the AI generates a sentence incorporating all the provided concepts.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":75-76", + "content": "\\nUser: Concepts: country, layer, move, passage, rotate\\nWrite a sentence that includes all these words.\\n\\nAssistant: In order to move the passage's rotating layer, the country's engineers designed a mechanism that could effortlessly rotate the layer, enabling a swift and dynamic passage through the structure.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Question: Information: - Barry Levinson (born April 6, 1942) is an American filmmaker, screenwriter, and actor. Levinson's best-known works are comedy-drama and drama films such as \\\"Diner\\\" (1982), \\\"The Natural\\\" (1984), \\\"Good Morning, Vietnam\\\" (1987), \\\"Rain Man\\\" (1988), \\\"Bugsy\\\" (1991), and \\\"Wag the Dog\\\" (1997). He won the Academy Award for Best Director for his work on \\\"Rain Man\\\", which also won the Academy Award for Best Picture. - The Academy Awards, or \\\"Oscars\\\", is an annual American awards ceremony hosted by the Academ" + }, + { + "comment": "Code snippet describes the Academy Awards (Oscars) and provides a brief background of Tom Cruise, an American actor and producer who has been nominated for three Academy Awards and won three Golden Globe Awards.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":76-76", + "content": "y of Motion Picture Arts and Sciences (AMPAS) to recognize excellence in cinematic achievements in the United States film industry as assessed by the Academy's voting membership. 
The various category winners are awarded a copy of a statuette, officially called the Academy Award of Merit, which has become commonly known by its nickname \\\"Oscar.\\\" The awards, first presented in 1929 at the Hollywood Roosevelt Hotel, are overseen by AMPAS. - Thomas Cruise Mapother IV (born July 3, 1962), known professionally as Tom Cruise, is an American actor and producer. He has been nominated for three Academy Awards and has won three Golden Globe Awards. He started his career at age 19 in the film \\\"Endless Love\\\" (1981). After portraying supporting roles in \\\"Taps\\\" (1981) and \\\"The Outsiders\\\" (1983), his first leading role was in the romantic comedy \\\"Risky Business\\\", released in August 1983. Cruise became a full-fledged movie star after starring as Pete \\\"Maverick\\\" Mitchell in the action drama " + }, + { + "comment": "This code describes the movie \"Rain Man\" (1988), starring Tom Cruise and Dustin Hoffman. The film is about an abrasive young man, Charlie Babbitt, who learns of his estranged autistic brother Raymond after their father's death, leading to a road trip and personal growth. Mark Johnson, born December 27, 1945, produced this Best Picture Academy Award-winning film.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":76-76", + "content": "\\\"Top Gun\\\" (1986). One of the biggest movie stars in Hollywood, Cruise starred in several more successful films in the 1980s, including the dramas \\\"The Color of Money\\\" (1986), \\\"Cocktail\\\" (1988), \\\"Rain Man\\\" (1988), and \\\"Born on the Fourth of July\\\" (1989). - Rain Man is a 1988 American road comedy-drama film directed by Barry Levinson and written by Barry Morrow and Ronald Bass. 
It tells the story of an abrasive and selfish young wheeler-dealer, Charlie Babbitt (Tom Cruise), who discovers that his estranged father has died and bequeathed all of his multimillion-dollar estate to his other son, Raymond (Dustin Hoffman), an autistic savant, of whose existence Charlie was unaware. Charlie is left with only his father's car and collection of rose bushes. In addition to the two leads, Valeria Golino stars as Charlie's girlfriend, Susanna. - Mark Johnson ( born December 27 , 1945 ) is an American film producer . Johnson won the Best Picture Academy Award for producing the 1988 drama " + }, + { + "comment": "The code appears to be extracting information from a JSONL file, specifically focusing on entities and their relationships. It presents a question about the relationship between 'Mark Johnson' and an entity from a given set of choices. The answer is provided by an AI assistant.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":76-78", + "content": "movie Rain Man , starring Dustin Hoffman and Tom Cruise . The film , winner of four Oscars , also captured a Golden Globe as Best Picture . After reading the paragraphs above, we are interested in knowing the entity with which 'mark johnson ' exhibits the relationship of 'nominated for'. Find the answer from the choices below. Choices: - academy award for best director - academy award for best picture - academy awards\\nAnswer:\\n\\nAssistant: Mark Johnson exhibits the relationship of 'nominated for' with the Academy Award for Best Picture.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: What do people do when they are in a fight? Choose the most suitable option to answer the above question. 
Options: - facing each other - get angry - kiss - gather together - learn from each other\\nAnswer:\\n\\nAssistant: Answer: get angry\"}\n{\"text\": \"System: You are a helpful assistant, who always provide expl" + }, + { + "comment": "This code provides two options for making homemade curry soup, differing in the use of coconut shells or cream.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":78-78", + "content": "anation. Think like you are answering to a five year old.\\n\\nUser: Here is a goal: what goes into homemade curry soup?\\nWhich way makes more sense to accomplish this goal?\\nOptions are:\\ni. 1 ounce dry rice sticks (pad Thai noodles), to yield about 3/4 cup softened noodles 2-3 cups boiling water, plus 3/4 cup for serving 1/4 cup kale ribbons 1/4 cup shredded carrots 1/2 cup cooked tofu cubes 1 green onion, thinly sliced 2 teaspoons Thai curry paste (see notes) 2 teaspoons tamari or soy sauce 2 tablespoons coconut shellls\\nii. 1 ounce dry rice sticks (pad Thai noodles), to yield about 3/4 cup softened noodles 2-3 cups boiling water, plus 3/4 cup for serving 1/4 cup kale ribbons 1/4 cup shredded carrots 1/2 cup cooked tofu cubes 1 green onion, thinly sliced 2 teaspoons Thai curry paste (see notes) 2 teaspoons tamari or soy sauce 2 tablespoons coconut cream\\n\\nAssistant: Imagine we are making a delicious curry soup for a superhero! We need t" + }, + { + "comment": "This code appears to involve a helpful AI-assistant providing explanations in simple terms for a user, possibly a child. The first chunk discusses selecting the best ingredients for making a curry soup, and the second chunk describes a restaurant named Fitzbillies that serves cheap Indian food in the city center.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":78-80", + "content": "o choose the best option with the right ingredients.\\nThe second option (ii) makes more sense to accomplish this goal. 
\\nIt has noodles, boiling water, kale, carrots, tofu, green onion, Thai curry paste, tamari or soy sauce, and coconut cream. The coconut cream will make the curry soup creamy and tasty, just like a superhero would want.\\nThe first option (i) has coconut shells, and those are hard and not yummy. A superhero wouldn't want to eat those in their curry soup. So, let's go with option (ii) for the perfect homemade curry soup!\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Here is some data about a restaurant: name = Fitzbillies, eatType = coffee shop, food = Indian, priceRange = cheap, area = city centre. Write a sentence that includes the following data about a restaurant\\n\\nAssistant: Fitzbillies is a coffee shop in the city centre that serves yummy Indian food at cheap prices.\"}\n{\"tex" + }, + { + "comment": "System provides detailed answers to user questions and suggests appropriate response based on given answer options.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":80-81", + "content": "t\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Choose your answer from:\\nI. No;\\nII. Yes;\\nGiven those answer options, answer the question: Based on this review, would the user recommend this product? === Review: I can't stop listening to this CD! I've always listened to all Tim's cd's but since buying Tim McGraw and the DHD's that's all I'm listening to over and over. I love it and it's different than anything he's ever done before. My favorites are SLEEP TONIGHT (I love the chorus on that), ALL WE EVER FIND, WATCH THE WIND BLOW BY, and I KNOW HOW TO LOVE YOU WELL but I love every song on the cd. Tim's voice is perfect and the music is great with the sound of the fiddle, guitars, drums and keyboards standing out on different songs. 
They compliment each other so well. Answer:\\nA:\\n\\nAssistant: II. Yes\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search" + }, + { + "comment": "The code provides detailed instructions for determining if a given textual premise (Statement 2) entails or implies a scientific fact (Statement 1). It expects a 'entails' output if Statement 2 supports Statement 1, and 'neutral' otherwise. The example task involves the sun and equinox times. Additionally, there is an unrelated snippet about producing a detailed sentence for a restaurant called \"The Punter\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":81-82", + "content": " outside to understand the answer.\\n\\nUser: Detailed Instructions: In this task, you are given two statements. The task is to output whether a given textual premise, i.e. Statement 2, entails or implies a given scientific fact, i.e. Statement 1. The output should be 'entails' if Statement 2 supports Statement 1 and should be 'neutral' otherwise.\\nQ: Sentence 1: The sun is directly over the equator during. Sentence 2: During equinox times the Sun's vertical ray is shining directly on the Earth's equator and neither hemisphere is tilted toward or away from the Sun.\\nA:\\n\\nAssistant: entails\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Attributes: name = The Punter, priceRange = less than \u00a320, area = riverside, familyFriendly = yes, near = The Portland Arms. 
Produce a detailed sentence about this restaurant.\\n\\nAssistant: " + }, + { + "comment": "Step 1: Identifies key elements from attributes, such as name, price range, area, and family-friendly status.\nStep 2: Formulates a sentence structure using these elements.\nStep 3: Combines the elements to create a detailed sentence about the restaurant.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":82-83", + "content": "Step 1: Determine the key elements from the attributes\\n- Name: The Punter\\n- Price Range: less than \u00a320\\n- Area: riverside\\n- Family Friendly: yes\\n- Near: The Portland Arms\\nStep 2: Formulate a sentence structure\\n- Begin with the name of the restaurant\\n- Incorporate the area and what it's near\\n- Mention the price range\\n- End with the family-friendly attribute\\nStep 3: Combine the elements to form a detailed sentence\\n\\\"The Punter, a family-friendly restaurant, is nestled by the riverside, just a short walk from The Portland Arms, offering a delightful dining experience for less than \u00a320 per person.\\\"\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Choose from the options at the end. Are the following two questions the same?\\nWhat does Coke's slogan 'It's the Real Thing' mean?\\nWhat are Coke and Pepsi's market share in the U.S.? In other countries?\\nOPT:\\n a). no\\n b). yes\\nThe answer is:\\n\\n" + }, + { + "comment": "Code discusses the merger of two cybersecurity companies, Betrusted and TruSecure, resulting in CyberTrust with 4000 clients worldwide, $160M annual revenue, and 1000 employees globally. The topic is Business.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":83-84", + "content": "Assistant: a). no\\nThese two questions are not the same, sweetie. 
The first question asks about the meaning of Coke's slogan, while the second one asks about how much of the market Coke and Pepsi have in the U.S. and other countries. So they're asking about different things!\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Choose your answer. Betrusted and TruSecure merge\\nThe merged company CyberTrust will service 4,000 clients worldwide and says it expects to have annual revenues of USD160 million. The firm will also employ 1,000 people globally, including 49 in Dublin in its identity management division.\\nWhich topic is this article about?\\nChoose your answer from:\\n[a]. World;\\n[b]. Sports;\\n[c]. Business;\\n[d]. Science/Tech;\\n\\nAssistant: This article is about topic [c]. Business.\\nIt discusses the merger between two cybersecurity companies, Betrusted and TruSecure. The merged company will be named CyberTr" + }, + { + "comment": "This code snippet describes a newly merged company, CyberTrust, resulting from the merger between Betrusted and TruSecure. The company aims to strengthen its market position by leveraging the combined resources, expertise, and client base of both entities. It anticipates annual revenues of around USD 160 million and will employ 1,000 people worldwide.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":84-84", + "content": "ust and is expected to have a global reach, servicing 4,000 clients worldwide. The company anticipates annual revenues of around USD 160 million and will employ 1,000 people, including 49 in Dublin, Ireland, in its identity management division.\\nThis topic falls under the domain of business as it involves companies, their financial prospects, and market operations. 
Mergers and acquisitions are common business strategies employed by companies to expand their market presence, enhance their offerings and services, and become more competitive in their respective industries. In this case, the article focuses on the cybersecurity industry, highlighting the business implications of the merger.\\nCyberTrust, as a newly merged company, aims to strengthen its market position by combining the resources, expertise, and client base of both Betrusted and TruSecure. The company will provide services such as identity management, ensuring that its global clientele receives comprehensive security solutio" + }, + { + "comment": "The code appears to be a part of an article discussing the merger between Betrusted and TruSecure, resulting in the formation of CyberTrust - a leading cybersecurity company providing identity management solutions. The article also highlights the company's global workforce employment aspect and its contribution to economic growth.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":84-85", + "content": "ns to protect their digital assets and mitigate cybersecurity threats effectively.\\nMoreover, the article also touches on the employment aspect, which is another key component of business operations, as companies often create job opportunities through mergers and acquisitions. This can lead to economic growth and further solidify a company's presence in a particular region or industry. In this instance, CyberTrust will employ 1,000 people globally, contributing to the international workforce specializing in cybersecurity services.\\nIn conclusion, this article covers the business domain of the merger between Betrusted and TruSecure, resulting in the formation of CyberTrust, a leading cybersecurity company providing identity management and other security solutions to clients across the world.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. 
Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your s" + }, + { + "comment": "Code is identifying key elements, understanding context, and describing each element in detail for data related to Life Liquid show or interview, featuring a journalist guest, with producer Juju.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":85-85", + "content": "teps.\\n\\nUser: This is some data: Life Liquid FEATURED_GUEST(S) Journalist; Life Liquid PRODUCER(S) Juju.\\nGenerate a detailed description of this data.\\n\\nAssistant: Step 1: Identifying the key elements of the data\\nIn the given data, there are four key elements:\\n1. Life Liquid\\n2. FEATURED_GUEST(S)\\n3. Journalist\\n4. PRODUCER(S) Juju\\nStep 2: Understanding the context of the data\\nIn order to generate a detailed description of the data, first we need to understand the context. It seems like Life Liquid is some kind of production, possibly a show or an interview, featuring a guest (the journalist). Juju seems to be the producer of this production.\\nStep 3: Describing each element in detail\\n1. Life Liquid: This is possibly the title or theme of the show or interview. It could be an indication of the content or the main topic of the interview. Life Liquid could be related to lifestyle, wellness, or a product being promoted.\\n2. FEATURED_GUEST(S): The term indicates that there might be" + }, + { + "comment": "The code provides a detailed description of the \"Life Liquid\" production, featuring a journalist as the guest and organized by producer(s) Juju. The content likely revolves around journalism, current events, or specific topics related to the journalist's expertise.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":85-85", + "content": " one or more guests invited to participate in the event. In this case, we only have a journalist as the featured guest.\\n3. 
Journalist: The featured guest of Life Liquid is a journalist, which could indicate that the content of the show/interview is centered around journalism, current events, or an in-depth analysis of a certain topic or issue.\\n4. PRODUCER(S) Juju: This piece of information reveals that the individual or entity responsible for organizing, planning, and overseeing the production of Life Liquid is a person or company named Juju.\\nStep 4: Integrating the information into a detailed description\\nThe data refers to a production titled \\\"Life Liquid\\\", featuring a journalist as its guest, and is organized and supervised by the producer(s) Juju. Life Liquid could be a show or an interview discussing journalism, current events, or a specific topic related to the guest's expertise. This production likely includes conversations and insights on the chosen subject matter, with th" + }, + { + "comment": "This code is related to a chatbot or AI system that answers questions based on the provided paragraph and user input. The paragraph seems to discuss a show host and producer, while the examples demonstrate how the AI handles different types of questions, focusing on changes in resistance for resistors and identifying the most likely successful character from given options.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":85-87", + "content": "e journalist sharing their experiences and perspectives while engaging with the host and audience. Juju, as the producer, ensures that the show or interview runs smoothly and is successfully executed according to its vision and purpose.\"}\n{\"text\": \"User: Use information from the paragraph to answer the question. Question: If we change resistors in series to be in parallel the resistance increases or decreases? 
Paragraph : When we add resistors in parallel to each other the total resistance decreases and the current increases.\\nThe answer to this question is:\\n\\nAssistant: decreases\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Here's a question: Who will be voted as the most likely to be successful? Here are possible answers to this question: - bitterest - representative - cleverest - bunny rabbit - corrupt I believe the correct choice is \\\"cleverest\\\", here's why:\\nThe answer to this question is:\\n\\nAssist" + }, + { + "comment": "This code is an explanation of the most likely successful individual in a given context, based on traits like intelligence and problem-solving abilities. The other options are not associated with success as much as cleverness is.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":87-88", + "content": "ant: The most likely to be successful would be the \\\"cleverest\\\" individual. This is because cleverness indicates intelligence, resourcefulness, and problem-solving abilities, all of which are traits that are generally associated with success. The other options - bitterest, representative, bunny rabbit, and corrupt - do not necessarily possess or highlight these characteristics that contribute to success. Therefore, the correct choice is indeed \\\"cleverest.\\\"\"}\n{\"text\": \"System: You are an AI assistant, who knows every language and how to translate one language to another. Given a task, you explain in simple steps what the task is asking, any guidelines that it provides. You solve the task and show how you used the guidelines to solve the task.\\n\\nUser: In this task, you are given a text of the article. 
Your task is to generate a headline (title) for this article.\\nQ: tom UNK , a pop artist best known for his modern take on the reclining female nude , has died at age ## .\\nA:\\n\\nAssista" + }, + { + "comment": "Task: Generate a headline for an article about pop artist Tom UNK.\nSteps: 1. Identify main idea - Tom UNK's death. 2. Choose keywords - Tom UNK, pop artist, reclining female nude, died. 3. Create short and engaging headline using keywords.\nSolution: \"Pop Artist Tom UNK, Renowned for Modern Reclining Female Nudes, Dies at ##\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":88-89", + "content": "nt: Task: Generate a headline (title) for an article.\\nArticle: Tom UNK, a pop artist best known for his modern take on the reclining female nude, has died at age ##.\\nGuidelines:\\n1. Summarize the main idea of the article.\\n2. Keep the headline short and attention-grabbing.\\n3. Use keywords from the article.\\nSteps:\\n1. Identify the main idea: Tom UNK, a pop artist known for his modern take on the reclining female nude, has passed away.\\n2. Select attention-grabbing keywords: Tom UNK, pop artist, reclining female nude, died\\n3. Create a short and engaging headline using the chosen keywords.\\nSolution:\\n\\\"Pop Artist Tom UNK, Renowned for Modern Reclining Female Nudes, Dies at ##\\\"\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Before you begin the process of haltering your horse, you'll need to prepare the halter. 
If your " + }, + { + "comment": "Instructions for haltering a horse, including unbuckling the crown piece, attaching lead line, calling the horse, and approaching from the left side to avoid deceiving.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":89-89", + "content": "horse is acting up, it can be hard to prepare the halter in the moment. Unbuckle or unsnap the crown piece. This is the strap of the halter that goes behind the horse's ears. It should have a buckle or snap somewhere on it. Have your halter clipped to your lead line ahead of time. You want to be able to halter your horse and immediately lead him where he needs to go. You clip the lead line onto a ring at the bottom of the noseband. The noseband is the portion of a halter that slips around a horse's nose. Call your horse as you approach him. This helps alert your horse to your presence. This way, he's less likely to get spooked by your approach. Call out his name or call out something like, \\\"I'm coming in!\\\" to let him know you're entering his pasture or stall. Make sure you have your halter in your left hand as you approach. You will be haltering the horse from the left side. You want to make sure your horse knows what's happening. If you try to deceive the horse by hiding the halter" + }, + { + "comment": "Explaining how to halter a horse:\n1. Unbuckle/unsnap the crown piece of the halter.\n2. Attach lead line to the noseband's bottom ring.\n3. Call your horse while approaching to alert it.\n4. Hold the halter with the left side approach, letting horse sniff hand and familiarize with scent. Offer treats if nervous.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":89-89", + "content": ", he'll learn to be distrustful of your intent. This can lead to a bad relationship between you and the horse. When haltering a horse, you should always approach from the left side. Halters are designed to be buckled on the left. 
You should approach your horse so you're standing near his head on the left side of his body. You do not want to put the halter on instantly. This can startle your horse. When you are a foot or so away from your horse's head, allow him to sniff your hand. This way, your horse will also familiarize himself with your scent. He'll be able to differentiate your smell from the smell of potential predators in other situations down the road. A nervous horse may benefit from having treats as a reward for staying calm.\\nSummary:\\n\\nAssistant: 1. Prepare the halter by unbuckling or unsnapping the crown piece.\\n2. Attach the lead line to the ring at the bottom of the noseband.\\n3. Call your horse as you approach, to alert him of your presence.\\n4. Hold the halter in your" + }, + { + "comment": "This code describes the process of approaching a horse for haltering, emphasizing starting from the left side and allowing the horse to familiarize itself with the handler's scent. The user is asking about the end date of the First Crusade which was proclaimed by Pope Urban II in 1095 as a response to the Byzantine Emperor Alexios I Komnenos' request for aid against Muslim advances, and ended with the capture of Jerusalem.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":89-90", + "content": " left hand while approaching the horse.\\n5. Approach the horse from the left side, as halters are designed to be buckled on this side.\\n6. Allow your horse to sniff your hand before haltering, so he becomes familiar with your scent.\\n7. Offer treats to a nervous horse as a reward for staying calm during the haltering process.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: I know that the answer to the question \\\"When did the first Crusade end?\\\" is in \\\"The crusades were intended to seize Jerusalem from Muslim control. 
The First Crusade was proclaimed by Pope Urban II (pope 1088\u201399) at the Council of Clermont in 1095 in response to a request from the Byzantine Emperor Alexios I Komnenos (r. 1081\u20131118) for aid against further Muslim advances. Urban promised indulgence to anyone who took part. Tens of thousands of people from all levels of society mobilised across Europe and captured Jerusalem " + }, + { + "comment": "The code snippet describes the outcome of the First Crusade in 1099 and mentions the brutal pogroms against local Jewish communities and the establishment of military orders, such as the Templars and Hospitallers. Additionally, it involves solving a linear equation for 'c' by isolating the variable.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":90-91", + "content": "in 1099. One feature of the crusades was the pogroms against local Jews that often took place as the crusaders left their countries for the East. These were especially brutal during the First Crusade, when the Jewish communities in Cologne, Mainz, and Worms were destroyed, and other communities in cities between the rivers Seine and Rhine suffered destruction. Another outgrowth of the crusades was the foundation of a new type of monastic order, the military orders of the Templars and Hospitallers, which fused monastic life with military service.\\\". Can you tell me what it is?\\nAnswer:\\n\\nAssistant: The First Crusade ended in 1099, when the crusaders successfully captured Jerusalem from Muslim control.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n\\nUser: Solve -47 = -6*c - 71 for c.\\n\\nAssistant: To solve the equation -47 = -6c -" + }, + { + "comment": "This code is solving a linear equation for the variable 'c' by performing algebraic steps: isolating 'c' on one side of the equation through adding, subtracting, and dividing. The goal is to find the value of 'c'. Here, -6c = 24 => c = -4.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":91-92", + "content": " 71 for c, follow these steps:\\nStep 1: Add 71 to both sides.\\nOur goal is to isolate c on one side of the equation. To cancel out the -71 that is being subtracted from -6c, we'll add 71 to both sides of the equation.\\n-47 + 71 = -6c - 71 + 71\\nStep 2: Simplify both sides.\\nNow, simplify by doing the math on both sides of the equation:\\n24 = -6c\\nStep 3: Divide by the coefficient of c.\\nWe want to isolate c by itself, so we'll divide both sides of the equation by the coefficient of c, which is -6:\\n24 / -6 = -6c / -6\\nStep 4: Simplify to solve for c.\\nNow we can simplify the equation to find the value of c:\\n-4 = c\\nSo the solution to the equation -47 = -6c - 71 is c = -4.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n\\nUser: Premise: Man with a large mohawk sweeping with a broom.\\nHypothesis: A man is holding a broom.\\n.Given th" + }, + { + "comment": "The code is identifying the components of a premise, analyzing it, comparing it to a hypothesis, justifying a conclusion, and explaining how the definition was used to come up with the answer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":92-93", + "content": "e premise, can we conclude the hypothesis?\\n\\nAssistant: Step 1: Identify the components of the premise\\n- Man with a large mohawk\\n- Sweeping with a broom\\nStep 2: Analyze the premise\\nIn the given premise, there is a man who has a large mohawk hairstyle and is sweeping using a broom.\\nStep 3: Compare the premise to the hypothesis\\nThe hypothesis states that a man is holding a broom.\\nStep 4: Justify the conclusion\\nWe can conclude that the hypothesis is true given the premise. The man in the premise is described as having a large mohawk and sweeping with a broom, implying that he is holding a broom in order to sweep.\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: In this task, you will be presented with a passage and have to answer a question based on your understanding of the events inferred from the passage. Among the entities, try to find the best entity that is most likely to fill in \\\"_\\\" and classify the answers based on options.\\nQ:" + }, + { + "comment": "Code describes a situation where Boko Haram, an Islamist insurgent group in Nigeria, is responsible for killing around 2,000 people in the northeastern town of Baga on January 3. 
The recent attacks have drawn international focus back to the crisis and raise questions about the trajectory of the insurgency and its response.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":93-93", + "content": " (CNN)In one month Nigerians will vote in what may be the country's most competitive and contentious elections yet. But this important process will take place in the shadow of a worsening threat -- Boko Haram, the Islamist insurgent group otherwise known as Jama'atu Ahlis Sunna Lidda'awati wal-Jihad, which killed as many as 2,000 people in the northeastern town of Baga on January 3. The attacks of the past few weeks -- Boko Haram's biggest ever in terms of casualties on Baga, and on markets using children to detonate devices -- have drawn international focus back to the crisis. But such appalling events are not new -- they are an intensification, a next step in a crisis which has steadily worsened since Boko Haram's 2010 reemergence. And they raise a number of questions about the trajectory of the insurgency and the response to it.Boko Haram massacred as many as 2,000 people in Baga, Nigeria on January 3Donnelly: Taking Baga helps group consolidate control in northeast NigeriaDonnelly:" + }, + { + "comment": "This code seems to be part of a language model or AI assistant training session, where it is given information and asked questions. The information discusses Boko Haram's actions in the town of Baga, Nigeria, resulting in thousands of deaths. The task for the AI is to identify specific details like location and who did the killings based on the given information. 
The model seems to be accurately identifying these details from the passage.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":93-94", + "content": " Group's tactics are unclear; Boko Haram kills because it can \\n Questions:There still isn't a clear picture of what happened at _ and who did the killing -- the age, the motivation, the state of mind of what may have been militants, conscripts or armed and terrified child combatants. (A) CNN (B) Nigerians (C) Boko Haram (D) Islamist (E) Jama'atu Ahlis Sunna Lidda'awati wal-Jihad (F) Baga (G) Nigeria (H) Donnelly (I) Group\\nA:\\n\\nAssistant: There still isn't a clear picture of what happened at (F) Baga and who did the killing -- the age, the motivation, the state of mind of what may have been militants, conscripts or armed and terrified child combatants.\\nI used the definition given in the passage, which mentioned that Boko Haram killed as many as 2,000 people in the northeastern town of Baga on January 3. This information helps in determining that the event in question occurred in Baga; thus, (F) Baga is the correct answer.\"}\n{\"text\": \"System: You are a helpful assistant, who always pr" + }, + { + "comment": "This code is for multiplying odd numbers within each inner list of a given list of lists. It produces the output as a new list containing integers, where 0 is assigned when no odd number is found in an inner list.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":94-96", + "content": "ovide explanation. Think like you are answering to a five year old.\\n\\nUser: Q: What method did Ibn Sina introduce?\\n A: He pioneered the science of experimental medicine and was the first physician to conduct clinical trials.\\n Does the answer correctly answer the question\\nChoices:\\n(1). yes.\\n(2). no.\\n\\nAssistant: (1). yes.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. 
Help as much as you can.\\n\\nUser: Detailed Instructions: In this task you will be given a list, of lists, of integers. For every inner list contained in the input list, you should multiply every odd number in that list. The output should be a list of integers with the same length as the number of lists in the input list. If there are no odd numbers in an inner list you should output 0 for that list.\\nQ: [[-27, 36, 24], [-35, -28, -9], [46, -36, 28], [15, 18], [-9, 4, 33]]\\nA:\\n\\nAssistant: [27, 315, 0, 15, -297]\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a" + }, + { + "comment": "Task: Identify Charles Dickens' first novel featuring characters Nathaniel Winkle, Augustus Snodgrass, and 'the fat boy' Joe.\nAnswer: The Pickwick Papers (also known as The Posthumous Papers of the Pickwick Club)\nExplanation: This novel includes the mentioned characters, making it the correct answer. Other novels by Charles Dickens don't feature these specific characters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":96-97", + "content": " task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Name Charles Dickens' first novel, featuring the characters Nathaniel Winkle, Augustus Snodgrass and 'the fat boy' Joe?\\nWhat is the answer?\\n\\nAssistant: To find the answer, I will recall the list of Charles Dickens' novels and think about which one includes the mentioned characters.\\nUpon reviewing his novels, I recognize that the characters Nathaniel Winkle, Augustus Snodgrass, and Joe 'the fat boy' are from his first novel, \\\"The Pickwick Papers.\\\" This book was also published as \\\"The Posthumous Papers of the Pickwick Club.\\\"\\nSo, the answer is The Pickwick Papers.\"}\n{\"text\": \"System: You are an AI assistant. You should describe the task and explain your answer. 
While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. You might need to use additional knowledge to answer the q" + }, + { + "comment": "The code asks a question about the meaning of the word \"feel\" in two different sentences and provides an answer indicating that the term has different meanings in each context. The first sentence uses feel to describe vulnerability, while the second sentence uses it for unimportance or powerlessness.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":97-97", + "content": "uestion.\\n\\nUser: Here is one sentence: You make me feel naked.\\nHere is another sentence: She felt small and insignificant.\\nQ: Does the term feel mean the same thing in both these sentences?\\nChoose from:\\n[i] different meanings\\n[ii] the same meaning\\n\\nAssistant: [i] different meanings\\nIn the first sentence, \\\"You make me feel naked,\\\" the term \\\"feel\\\" is used to express an emotional or psychological state, suggesting a feeling of vulnerability or exposure. In this context, feel refers to the subjective experience or perception of the speaker.\\nIn the second sentence, \\\"She felt small and insignificant,\\\" the term \\\"feel\\\" is also used to describe an emotional state, but it conveys a different meaning than the first sentence. Here, the speaker is describing a feeling of being unimportant or powerless. While both sentences use the term \\\"feel\\\" to describe an emotional state, the emotional states being described are different, making the meaning of the term \\\"feel\\\" different in each sentence.\"}" + }, + { + "comment": "User gives a task to complete with justification of each step, and the task involves answering a question based on given information about Mr. 
Whitcomb Judson's invention in 1893.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":98-99", + "content": "{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Q:Article: Mr. Whitcomb Judson was a really clever man. He invented a special train that was pushed along by air and in 1893 he invented something like a zip that could be used to open and close clothes. It could be used on men's trousers and on ladies' dresses. He did not call his invention a 'zip' but everybody says that he was the 'Father of the Zip'. Mr. Judson tried to sell his invention but no one wanted it because it did not work very well. A Canadian changed Mr. Judson's invention and made it better. He made a zip that worked very well in 1913.This is the same zip we use today. Question: Mr. Whitcomb Judson invented _ in 1893. Yes or no, is the answer \\\"something like a zip\\\"?\\nA:\\n\\nAssistant: Yes, the answer is \\\"something like a zip.\\\"\"}\n{\"text\": \"System: You are" + }, + { + "comment": "Code describes a news article discussing how tobacco and fast-food retailers are more accessible to minority and low-income student areas, leading to potential health concerns. The lead author, Heather D'Angelo, highlights her surprise at the number of tobacco outlets near schools.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":99-99", + "content": " an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n\\nUser: News article: Tobacco, fast food in easier reach of minority and low-income students\\nCompared to schools with mostly white students, schools with higher numbers of Hispanic, black, and low-income students are more likely to have tobacco retailers and fast-food restaurants in easy reach, researchers say.\\n\\\"Schools are places where we expect to be healthy, supportive environments for kids, but right outside the door (students) could be exposed to unhealthy influences,\\\" lead author Heather D'Angelo of Westat in Rockville, Maryland told Reuters Health in a phone interview.\\n\\\"I was surprised that there were so many tobacco outlets near schools,\\\" said D'Angelo, who at the time of the study was at the University of North Carolina in Chapel Hill.\\nD'Angelo and her colleagues randomly selected more than 18" + }, + { + "comment": "Schools with higher Hispanic, black and low-income students have more fast food and tobacco retailers nearby. More than 50% of schools with majority Hispanic students were near both types of outlets; population density affected this trend.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":99-99", + "content": ",000 public schools in 97 U.S. 
counties, then analyzed their proximity to retailers within an 800-meter radius - roughly equivalent to a 10-minute walk.\\nThey report in the American Journal of Public Health that 40 percent of the schools were near at least one fast food restaurant, 77 percent were close to at least one retailer likely to sell tobacco products, and 38 percent of schools were close to both.\\nSchools with higher numbers of Hispanic, black, and low-income students receiving free and reduced-price lunch were more likely to have tobacco retailers and fast-food restaurants within a ten-minute walk.\\nMore than 50 percent of schools with a majority of Hispanic students were within 800 meters of both a tobacco outlet and a fast food outlet, compared to 21 percent of schools with a majority of white students.\\nWhen researchers took population density into account, they found that every 10 percent increase in the proportion of a school's Hispanic students was associated with a fiv" + }, + { + "comment": "Analysis of a study finding increased likelihood of schools being near both tobacco and fast food stores with higher low-income student population, with older students having greater access to these items. 
The first study to consider availability based on socioeconomic and racial characteristics.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":99-99", + "content": "e percent higher likelihood the school was within walking distance of places to buy both tobacco and fast food.\\nA 10 percent increase in low-income students was associated with a three percent higher likelihood a school was near both kinds of stores.\\nThe researchers also report that fast food and tobacco were more easily accessible for older kids, suggesting \\\"that retailers may be well aware of and ready to take advantage of the autonomy and purchasing power of older youths.\\\"\\n\\\"High schools had 40 percent more fast food restaurants and 24 percent more tobacco outlets within 800 meters than did primary schools and high schools had nearly 1.5 times greater odds of having both\\\" nearby, they wrote.\\nThe researchers say theirs is the first study to consider the availability of both tobacco and fast food on the basis of students' socioeconomic and racial characteristics.\\nThe study did not analyze how students get to school, whether they leave campus during lunchtime, or where they liv" + }, + { + "comment": "This code appears to be a snippet from an article discussing a study about the association between fast food access and tobacco exposure near schools. The researcher mentions that while there's no direct causality proven, more research is needed to change public policy in this area.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":99-99", + "content": "e. Nor did the researchers have data on students' eating habits, or their weight or health status.\\n\\\"This is looking at one point in time,\\\" said D'Angelo. \\\"I can't say one thing causes another. 
However, there's still an association here.\\\"\\nShe noted that since the study only looked at fast food chains, the number of restaurants serving unhealthy food in the areas studied could be even higher.\\nD'Angelo and her co-authors argue that while licensing or zoning policies could restrict students' exposure to tobacco and unhealthy foods, more research on neighborhood disparities is needed in order to change public policy.\\n\\\"Food and tobacco habits begin at a young age, and habits tend to consolidate and 'stick' or 'track' over time,\\\" said Steven H. Kelder, an epidemiologist who was not involved with the study.\\nKelder, who co-directs the Michael and Susan Dell Center for Healthy Living at the University of Texas School of Public Health in Austin, told Reuters Health by email that studie" + }, + { + "comment": "Schools with high Hispanic, black, and low-income students have more tobacco and fast-food retailers nearby. Study highlights need to address health disparities.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":99-99", + "content": "s like this one provide information on how to address growing health disparities based on race and income and could point to the need for tighter restrictions at the point of sale, or counter-marketing to students of color.\\n\\\"In this country we have a history of commercially taking advantage of poor, less educated, and vulnerable populations,\\\" Kelder said. \\\"Documenting health disparities highlights the need to address uncovered problems.\\\"\\nWhat are the most important parts of this news article?\\n\\nAssistant: 1. Schools with higher numbers of Hispanic, black, and low-income students are more likely to have tobacco retailers and fast-food restaurants nearby, according to researchers.\\n2. 
More than 50% of schools with a majority of Hispanic students were within 800 meters of both a tobacco outlet and a fast-food outlet, while 21% of schools with a majority of white students were within the same proximity of both.\\n3. The study highlights the increased accessibility of tobacco and fast" + }, + { + "comment": "Part 1: Break the definition into small parts.\nUsage: The provided code is already broken into two distinct JSON entries. Each entry contains a set of text data, with separate keys for 'text' and potentially other relevant information. It meets the criteria by being divided into understandable chunks.\n\nPart 2: Explain each part's meaning using an example that meets the criteria in the instruction.\nUsage: The first JSON entry describes a study finding related to older students' access to unhealthy foods or tobacco products. This meets the criteria of providing information on a specific issue and its implications (e.g., retailers taking advantage of student purchasing power). The second JSON entry explains how public policy could be influenced by this study, discussing potential solutions like licensing or zoning policies, and highlighting the need for more research on neighborhood disparities. It meets the criteria by showing how the study's findings could lead to policy changes and additional research.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":99-100", + "content": " food for older students and suggests that retailers may be taking advantage of their purchasing power.\\n4. The researchers argue that licensing or zoning policies could restrict students' exposure to tobacco and unhealthy foods, but more research on neighborhood disparities is needed to change public policy.\\n5. 
The study demonstrates potential health disparities based on race and income, which could lead to tighter point-of-sale restrictions or counter-marketing strategies targeting students of color.\"}\n{\"text\": \"System: Given a definition of a task and a sample input, break the definition into small parts.\\nEach of those parts will have some instruction. Explain their meaning by showing an example that meets the criteria in the instruction. Use the following format:\\nPart # : a key part of the definition.\\nUsage: Sample response that meets the criteria from the key part. Explain why you think it meets the criteria.\\n\\nUser: Detailed Instructions: In this task, you are given a senten" + }, + { + "comment": "This code snippet is likely part of a language identification model. The provided input sentence seems to be in Swedish, and the task is to determine its language. The solution presented includes the original sentence and mentions that it is input for further processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":100-100", + "content": "ce which is either in the Swedish language or English language. You task is to identify the language of input sentence. 
Input sentence can be in Swedish or English language only and also it cannot have two languages at a time.\\nProblem:Konsumenterna ser hur deras levnadsstandard sjunker, hur fattigdomen breder ut sig och hur den offentliga sektorn och den produktiva basen i de flesta av unionens l\u00e4nder sk\u00e4rs ned och uppl\u00f6ses till f\u00f6rm\u00e5n f\u00f6r en oh\u00e4mmad och f\u00f6rd\u00e4rvbringande konkurrens, den totala marknadsekonomin och storkapitalets monopolintressen.\\nSolution:\\n\\nAssistant: Part #1: Identify the given sentence\\nUsage: The given sentence is \\\"Konsumenterna ser hur deras levnadsstandard sjunker, hur fattigdomen breder ut sig och hur den offentliga sektorn och den produktiva basen i de flesta av unionens l\u00e4nder sk\u00e4rs ned och uppl\u00f6ses till f\u00f6rm\u00e5n f\u00f6r en oh\u00e4mmad och f\u00f6rd\u00e4rvbringande konkurrens, den totala marknadsekonomin och storkapitalets monopolintressen.\\\" This sentence is the input we ha" + }, + { + "comment": "The code explains that the input can only be in Swedish or English, identifies the language of the sentence based on words and structure, and provides a sentiment analysis for a tweet. The assistant helps with answering questions like a five-year-old would understand.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":100-102", + "content": "ve to work with.\\nPart #2: Recognize that the input can only be in Swedish or English\\nUsage: It is stated that the input sentence can be in Swedish or English language, and it cannot have both languages at the same time. The sentence has to belong to one of these two languages.\\nPart #3: Identify the language of the input sentence\\nUsage: In this case, words like \\\"Konsumenterna,\\\" \\\"levnadsstandard,\\\" and \\\"fattigdomen\\\" are not in English. Those words, as well as the overall sentence structure, suggest that this sentence is in Swedish. 
Therefore, the language of the input sentence is Swedish.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: She's too busy to read my blabs. \\nWhat is the sentiment of this tweet?\\n\\nAssistant: The sentiment of this tweet seems to be a little unhappy or disappointed. The person might feel a bit sad because the other person is too busy to read their messages.\"}\n{\"tex" + }, + { + "comment": "Code snippet introduces a movie plot involving Andre, a talented marksman trained by retired Marine sniper Jack Robinson. The plot revolves around the dangerous streets of Chicago and a terrorist organization's plan to release a deadly virus.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":102-102", + "content": "t\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Please answer the following question: Build a movie plot around this: Who is an excellent shot? Andre\\nA:\\n\\nAssistant: Title: \\\"Andre: Marksman's Mission\\\"\\nGenre: Action, Thriller\\nPlot:\\nAndre Thompson, a young orphan with no known family, grew up in the dangerous streets of Chicago. Despite a troubled past and hardships, his natural talent for shooting has never gone unnoticed. Playing basketball at the nearby community center, he encounters retired Marine sniper, Jack Robinson. Jack sees potential in Andre's impeccable aim and accuracy, even when under pressure. He decides to take Andre under his wing and train him to become an elite marksman, using his skills for a greater cause.\\nYears later, as an adult, Andre has become an excellent shot, thanks to Jack's mentorship. 
When a local terrorist organization initiates a plan to release a deadly viru" + }, + { + "comment": "Code snippet describes a thrilling adventure of a skilled marksman, Andre, who is recruited by the government to stop terrorists. It highlights his internal conflicts with teammates and difficult choices that shape his destiny in the movie \"Andre: Marksman's Mission\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":102-102", + "content": "s, the government secretly forms a small team of highly skilled individuals to stop them. Andre is recruited as their ace marksman.\\nThe team embarks on a thrilling adventure, hunting down the terrorists responsible for plotting the attack. Along the way, Andre discovers his teammates have their own hidden talents and motivations, leading to internal conflict and distrust within the group. As they get closer to the terrorist leaders, Andre's skills with a firearm are put to the ultimate test, with countless lives hanging in the balance.\\nIn the movie's climax, Andre is forced to make a difficult choice between the mission and saving Jack's life, who has been kidnapped by the terrorists as leverage. Andre's decision will shape not only his own destiny, but also that of his close friends and the entire city of Chicago.\\nThrough a series of events filled with intense action sequences, suspense, and emotional drama, \\\"Andre: Marksman's Mission\\\" unveils the true depth of Andre's character " + }, + { + "comment": "The Copenhagen Consensus Conference occurs every four years, examining and prioritizing global issues using cost-benefit analysis.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":102-103", + "content": "while showcasing his incredible skill as an elite marksman. 
As the story unfolds, it remarks on themes of friendship, loyalty, redemption, and the power of choice, ultimately challenging the protagonist and the audience to reflect on their values and beliefs.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: The Copenhagen Consensus Center is a US non-profit think tank, founded and headed by Bj\u00f8rn Lomborg. The Center organizes the Copenhagen Consensus, a conference of prominent economists held every four years, where potential solutions to global issues are examined and prioritized using cost-benefit analysis.\\nBased on the paragraph above can we conclude that \\\"The Copenhagen Consensus used to meet every two years.\\\"?\\n\\nAssistant: Step 1: Read the given information and identify key data points.\\nThe key data points in the p" + }, + { + "comment": "This code is a step-by-step process to analyze if we can conclude that \"The Copenhagen Consensus used to meet every two years\" based on the given information. The code identifies relevant data, compares it with the question and reaches a conclusion. 
The conclusion states we cannot confirm the given statement as the conference is held every four years instead of every two years.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":103-103", + "content": "aragraph are:\\n- The Copenhagen Consensus Center is a US non-profit think tank.\\n- Founded and headed by Bj\u00f8rn Lomborg.\\n- The Center organizes the Copenhagen Consensus conference.\\n- The conference is held every four years.\\n- Economists attend the conference and prioritize solutions using cost-benefit analysis.\\nStep 2: Analyze the question being asked.\\nThe question is asking whether we can conclude that \\\"The Copenhagen Consensus used to meet every two years.\\\"\\nStep 3: Identify the data relevant to the question.\\nThe relevant data for this question is the frequency of the Copenhagen Consensus conference: held every four years.\\nStep 4: Compare the data to the question.\\nThe question asks if the conference used to meet every two years, but the provided information states that the conference is held every four years.\\nStep 5: Reach a conclusion.\\nBased on the information provided, we cannot conclude that \\\"The Copenhagen Consensus used to meet every two years\\\" as the paragraph states that it is held every four years.\"}" + }, + { + "comment": "This code contains two JSON objects representing a user's input and the AI assistant's response to that input. The user is asking if they can draw a hypothesis from the given context. The AI assistant should think step-by-step and justify its actions while performing the task.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":104-105", + "content": "{\"text\": \"User: Answer the following question: Title: 128 pages that felt like 500. Review: I read many glowing reviews of this novella in sources I implicitly trusted; Amazon.Com's review section included. 
The gap between promise and fulfillment has never been so large in my reading experience. The central character, who unfortunately is also the narrarator, is a whining, mawkish, hyper-sensitive narcissist who should have reflected and learned from his wife's criticisms, instead of ruminating about leaving her. Consequently, I've lost an evening from my life reading this trite missive. Does this product review convey a negative or positive sentiment?\\nAnswer:\\n\\nAssistant: This product review conveys a negative sentiment.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Can we draw the following hypothesis from the context" + }, + { + "comment": "The code is evaluating a hypothesis against the given context, identifying key points and determining if the hypothesis can be drawn from the provided information. The hypothesis contradicts the context, which states that an elaborate scheme was used to hide child sexual abuse by priests.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":105-105", + "content": "? 
\\nContext:\\nThe resignation came amid recent allegations by Massachusetts' attorney general that the Boston archdiocese engaged in \\\"an elaborate scheme\\\" to keep quiet the issue of child sexual abuse by priests.\\nHypothesis: Priests have never abused children according to this sentence.\\n\\nAssistant: Step 1: Identify the key points in the context\\n- Resignation took place\\n- Massachusetts' attorney general's allegations\\n- Boston archdiocese engaged in \\\"an elaborate scheme\\\"\\n- Purpose of the scheme was to keep quiet the issue of child sexual abuse by priests\\nStep 2: Evaluate the context against the provided hypothesis\\nThe hypothesis states: \\\"Priests have never abused children according to this sentence.\\\"\\nStep 3: Justify whether the hypothesis can be drawn from the context\\nGiven the context, the hypothesis cannot be drawn as the context mentions the purpose of the \\\"elaborate scheme\\\" was to keep quiet the issue of child sexual abuse by priests. This statement implies that th" + }, + { + "comment": "The code presents a text pair, where the first part discusses child abuse by priests and its concealment, while the second part explains how a definition was used to derive an answer about the simplest type of battery.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":105-106", + "content": "ere were cases of child sexual abuse by priests, and efforts were made to conceal this information.\\nTherefore, the hypothesis does not match the context.\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: Given a scientific question and its correct answer, generate supporting facts for the answer. This supporting fact can be an explanation for the answer to the given question.\\nQuestion: The simplest type of battery contains what? 
Answer: single cell\\n\\nAssistant: I used the definition of a battery, which is a device that converts stored chemical energy into electrical energy through an electrochemical reaction. Knowing this definition, I deduced that the simplest type of battery would have the least amount of components, which in this case, is a single cell.\\nA single cell battery contains one electrochemical cell, which consists of a cathode, an anode, and an electrolyte. The chemical reactions at the electrodes create a flow of electrical ch" + }, + { + "comment": "This code is explaining the concept of a simple battery that consists of only one electrochemical cell, which converts chemical energy into electrical energy. The code also mentions more complex batteries with multiple cells in series or parallel configurations for increased voltage or current capacity.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":106-107", + "content": "arge. The single cell battery is considered the simplest type because it only involves one electrochemical cell carrying out the conversion of chemical energy into electrical energy, whereas more complex batteries contain multiple cells connected together in series or parallel configurations to provide increased voltage or current capacity.\\nIn summary, I used the definition of a battery to determine that the simplest type of battery would have the least number of components and processes involved, which led me to the answer that the simplest type of battery contains a single cell.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Which is an appropriate title for this article?\\nOne of New York Attorney General Eliot Spitzer #39;s investigators was perusing a stack of subpoenaed documents in a Manhattan office last month when he let out a yelp, slammed down his coffee and sprinted down the corridor." 
+ }, + { + "comment": "The code provides a JSON object with multiple text elements. The first is an assistant suggesting a catchy article title based on the provided context, while the second is a system instructing the AI to generate a sentence describing a restaurant's data, which it successfully accomplishes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":107-109", + "content": "\\n\\nAssistant: A good title for this article could be \\\"Exciting Discovery in New York Investigation!\\\" This title is simple and easy to understand while also hinting at the excitement the investigator felt when they found something important in the documents.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Here is some data: The Waterman eatType restaurant; The Waterman food Indian; The Waterman priceRange cheap; The Waterman area city centre; The Waterman familyFriendly yes.\\nWrite a sentence that describes this data:\\n\\nAssistant: The Waterman is a family-friendly Indian restaurant located in the city centre, offering a selection of affordable and delicious dishes in a casual eatType setting.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Detailed Ins" + }, + { + "comment": "This code is asking for the completion that best continues or answers the given prompt. It involves categories such as idioms, negated statements, polysemy, subject referencing, and quantitative reasoning. The chosen completion should demonstrate common sense. 
In this example, Completion A (\"he went out to smoke yesterday\") is considered the most plausible continuation of the prompt.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":109-110", + "content": "tructions: Given a prompt and four completions, select the completion that is the most plausible in continuing or answering the prompt. This task is designed to test common sense and has various categories ranging between idioms, negated statements, polysemy, subject referencing, and quantitative reasoning. Generate either Completion A, Completion B, Completion C, or Completion D.\\nProblem:Prompt: I can't believe that despite his dislike for cigarettes, \\nCompletion A: he went out to smoke yesterday. \\nCompletion B: we went out to the grocery store. \\nCompletion C: he cried when I punched him. \\nCompletion D: he still wouldn't pet the dog.\\nSolution:\\n\\nAssistant: Completion A\"}\n{\"text\": \"System: You are an AI assistant, who knows every language and how to translate one language to another. Given a task, you explain in simple steps what the task is asking, any guidelines that it provides. You solve the task and show how you used the guidelines to solve the task.\\n\\nUser: Teacher:You are" + }, + { + "comment": "Classify conversation as 'No emotion' or 'Happiness'. Here, Person1 suggests a walk and errands which shows a positive mood.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":110-110", + "content": " given a conversation between two people. 'Person1:' and 'Person2:' are used to separate their respective dialogues. Your task is to classify the conversation either convey 'No emotion' or 'Happiness' by providing '1' and '0', respectively.\\nTeacher: Now, understand the problem? Solve this instance: Person1: It's such a nice day outside . Let \u2019 s take a walk into town and do some window-shopping . \\n Person2: You know I hate shopping more than anything else in this world . 
\\n Person1: I know . But we could get some exercise in , and we could get some errands run , too . Window-shopping won \u2019 t cost you anything . \\n Person2: What kind of errands do we need to run ? \\n Person1: Well , we could go to the newsagent's to pick up the Sunday paper , go to the butcher's to pick up a chicken for a roast dinner , and to the chemist's to pick up your prescription from the doctor . \\n Person2: I'll only go if we can stop at the office-license to pick up something to drin" + }, + { + "comment": "Person1 and Person2 are discussing their plans for the day, which includes a walk to get some exercise and accomplish tasks. They agree not to go into any stores in the mall but can look at things in the market. Person1 asks if Person2 has money for errands and potential purchases from street vendors.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":110-110", + "content": "k for tonight . \\n Person1: Fine , we'll do that on the way home , so we don \u2019 t have to carry it the whole time . \\n Person2: Sounds good . You have to promise not go into any of the stores in the mall though . I know how you like to try things on in the fitting rooms , especially if the stores have special offers . \\n Person1: I promise . We're only going for a walk to get some exercise and to get some things done that we haven \u2019 t had time for . \\n Person2: Ok , let's go then ! \\n Person1: Do you have any money ? \\n Person2: For what ? \\n Person1: For our errands , and maybe a little extra for something from a vendor on the street . \\n Person2: You said you were only going to window shop ! \\n Person1: I agreed to not go into the mall , but I can still look at things in the market ! \\n Person2: I should have known ... 
\\nStudent:\\n\\nAssistant: Based on the given conversation and the task guidelines, here are the steps to solve the problem:\\" + }, + { + "comment": "This code reads a conversation between Person1 and Person2, identifies the overall tone and emotions expressed, classifies the conversation as 'No emotion' or 'Happiness', and assigns an appropriate code ('1' for 'No emotion' and '0' for 'Happiness') based on the guidelines. The given instance is classified as 'Happiness' with a code of '0'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":110-111", + "content": "n1. Read the conversation between Person1 and Person2.\\n2. Understand the overall tone and emotions expressed by the participants.\\n3. Classify the conversation as either having 'No emotion' or 'Happiness' based on the guidelines.\\n4. Assign the appropriate code ('1' for 'No emotion' and '0' for 'Happiness') to the conversation.\\nNow let's apply the guidelines to solve the instance:\\n1. Read the conversation: Person1 and Person2 are discussing their plans for the day, which includes going for a walk, running errands, and window-shopping.\\n2. Understand the emotions: From their conversation, both participants seem to be in a good mood and expressing happiness as they agree on their plans and engage in friendly banter.\\n3. Classify the conversation: Based on their dialogue, the conversation conveys 'Happiness'.\\n4. Assign the code: Since the conversation conveys 'Happiness', the appropriate code is '0'.\\nThe solution to this instance is '0', as the conversation conveys 'Happiness'.\"}\n{\"te" + }, + { + "comment": "The code is providing a task to translate a given Galician sentence into Portuguese. 
The assistant is expected to describe the task, output the correct answer, and explain why other answers are wrong, possibly using additional knowledge for clarification.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":111-112", + "content": "xt\": \"System: User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.\\n\\nUser: Instructions: You are given a sentence in Galician. Your job is to translate the Galician sentence into Portugese.\\nInput: Iso foi exactamente o que fixemos con esas c\u00e9lulas.\\nOutput:\\n\\nAssistant: That was exactly what we did with those cells.\\nTranslation in Portuguese: Isso foi exatamente o que fizemos com essas c\u00e9lulas.\"}\n{\"text\": \"System: You are an AI assistant. You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. You might need to use additional knowledge to answer the question.\\n\\nUser: OPTIONS: A. Pour a few drops of apple cider vinegar into a small bowl. For this step, you will need about 1 gallon (3.8 l) of apple cider vinegar. B. Vacuum and vacuum well under t" + }, + { + "comment": "Code explains cleaning steps for oven door including using vacuum cleaner attachments or handheld machine, wiping several times with liquid soap, and applying baking soda paste.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":112-112", + "content": "he oven door. You can use the brush attachments on your vacuum cleaner, or you can use a handheld vacuuming machine. C. You may want to wipe several times to cover any important areas. Pour liquid dish soap into a plastic bucket. D. Apply a baking soda paste to the oven door. 
Mix baking soda into water until you have a paste.Choose from options above and answer: What most naturally follows?\\nHow to clean the insides of a double pane window in your oven door \\nUse a wet cloth to wipe away any loose debris. \\nWipe down the top, sides, and crevices around your oven door to remove crumbs or other food particles. You can use plain water or a cleaning wipe for this step.\\nAnswer:\\n\\nAssistant: D. Apply a baking soda paste to the oven door. Mix baking soda into water until you have a paste.\\nExplanation:\\nA. Pour a few drops of apple cider vinegar into a small bowl - This option does not make sense in this context as it requires an excessive amount of vinegar without a clear purpose.\\nB. Vacu" + }, + { + "comment": "This code appears to be an instructional text describing the process of cleaning a double-pane window in an oven door, with option D being the most effective method. The user is asked about their satisfaction level on a product review, and the review title suggests enjoyment while reading it, implying satisfaction.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":112-113", + "content": "um and vacuum well under the oven door - Vacuuming would not be effective in cleaning the inside of a double pane window in the oven door.\\nC. Pour liquid dish soap into a plastic bucket - This option does not directly address cleaning the insides of the double pane window.\\nOption D is the most fitting answer as applying a baking soda paste is a common and effective method for cleaning oven doors, including their double pane windows. This paste helps to break down and remove accumulated grime and dirt within the crevices of the oven door.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Here is a review left by a customer on a product. 
Would you say he was satisfied or dissatisfied? Title: Escape into her world! Review: Jemima JThis beach read was a true \\\"thoughtless\\\" pleasure for me to turn the pages of. Jemima J is not " + }, + { + "comment": "The customer enjoyed the easy-to-read book about Jemima's struggles, escape, and relatable experiences.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":113-113", + "content": "the most realistic character in the world, as she looses a massive amount of weight in a short period of time, but Jemima is someone whom everyone can identify with. Jemima longs for a man, for whom she feels she can not have (Ben Williams) because of her outward appearance. Jemima is a writer in a small paper that has dreams of becoming something else. Jemima meets a man, whose outward appearance is what she dreams of, but inside may not be what she needs. Jemima has a great friend (Geraldine) who is a 100% supportive fabulous friend, but Jemima is so very envious of what Geraldine looks like. Everywoman struggles with all of these things.This writer did not seem to intend to make this a book you had to ponder though, but intended this book to be an easy read. I loved it! This book took me away from my present and let me escape into the world of Jemima!\\nThe answer to this question is:\\n\\nAssistant: The customer was satisfied. \\nMy justification for this answer is based on the sentime" + }, + { + "comment": "The code is analyzing customer reviews and determining their overall sentiment. It identifies positive or negative phrases and assesses if the reviewer was satisfied with the product.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":113-114", + "content": "nts expressed in the review. The customer described the book as a \\\"thoughtless pleasure,\\\" and a \\\"beach read,\\\" which are both positive terms. 
The customer also expressed appreciation for the book's content, including the character development and the relatability of the protagonist. Further, the last line - \\\"I loved it! This book took me away from my present and let me escape into the world of Jemima!\\\" - indicates the reviewer's enjoyment and satisfaction with the book. While there are some elements mentioned that could be perceived as negative, such as the unrealistic weight loss of the main character, they did not seem to reduce the customer's overall enjoyment of the book. Hence, the reviewer can be considered to be satisfied with the product.\"}\n{\"text\": \"User: Is there a negative or positive tone to this product review? === Title: so-so Review: I bought this lock because it was the cheapest I could get. It suits my purpose, but the design is shoddy and the combination lock has " + }, + { + "comment": "This code seems to be extracting information from a product review and summarizing it, as well as providing a brief explanation for someone like a five-year-old. The comment suggests that the product review has a negative tone.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":114-115", + "content": "a fluid motion instead of setting into place when you align the numbers.The lock doesn't come with an option to set your own combination -- it comes with a preset combination and cannot be changed (so if you forget it, you're out of luck!)The chain and plastic cover look so-so. Definitely not high class.If you're looking for a CHEAP chain to lock your bike in a SAFE neighborhood, then this is your best bet. I would give it 1.5 stars, but they don't have that option here. Answer:\\n\\nAssistant: There is a negative tone to this product review.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Greg, I've been helping the HR team draft this memo. 
Cindy asked that I run it by you prior to having it sent out. Any comments? As we continue to address a number of issues facing Enron, it is critical that the company remain focused on attracting, motivating and retaining our diverse and talented workforce. We " + }, + { + "comment": "Code snippet discusses changes in employee-focused functions at Enron, with HR, Employee Responsibility, and Executive and Corporate Compensation reporting directly to the Office of the Chairman. David Oxley oversees Global HR, managing various functions including Payroll and Labor Relations, while Cindy Olson is responsible for Employee Relations and Compensation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":115-115", + "content": "strongly believe that our workforce is world-class - the best in any industry, so we will continue to develop and reward this tremendous talent base. Because the lasting strength of our company is determined by the strength of our workforce, we are elevating the role employee-focused functions play within Enron. We are pleased to announce that effective immediately Human Resources, Employee Responsibility, and Executive and Corporate Compensation will report directly to the Office of the Chairman. David Oxley, Vice President, will continue to oversee Enron's Global HR function. The lead HR representatives for the individual business units will report to David in addition to their respective Office of the Chairman. Besides managing Recruiting, Staffing, Training, Compensation and the Performance Evaluation Process, David will also add Payroll and Labor Relations to his list of responsibilities. Cindy Olson, Executive Vice President, will have responsibility for Employee Relations and Co" + }, + { + "comment": "The code appears to be a fragment of a text generated from a chatbot or AI language model. The text discusses changes in an organization's HR team and their new roles and responsibilities. 
The assistant is providing a subject line for the email announcing these changes. It involves capitalizing words where necessary and making sure to use proper grammar.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":115-116", + "content": "rporate Responsibility, which includes Social Responsibility, Community Relations, Diversity, Employee Relations, Redeployment, Alumni Relations, Employee Events and Programs, Benefits, Wellness, and Worklife. Mary Joyce, Vice President, will continue to have responsibility for Executive and Corporate Compensation in addition to Global Employee Information Management and Reporting and Analysis. David, Cindy and Mary will each report to the Office of the Chairman. A detailed organizational chart is attached. Please join me in congratulating them on their new responsibilities. \\nGenerate a subject line for this email.\\n\\nAssistant: Subject: Exciting Changes in Our HR Team and Their Roles \ud83c\udf1f\ud83d\udc69\u200d\ud83d\udcbc\ud83d\udc68\u200d\ud83d\udcbc\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n\\nUser: Please capitalize where necessary: it uses strong and powerful scanning algorithm th" + }, + { + "comment": "Code snippet explains the process of identifying sentences, determining capitalization needs, and capitalizing the sentence with no proper nouns.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":116-117", + "content": "at offers successful recovery of deleted and lost photos.\\n\\nAssistant: Step 1: Identify the sentences in the text.\\nIn this case, there is only one sentence: \\\"it uses strong and powerful scanning algorithm that offers successful recovery of deleted and lost photos.\\\"\\nStep 2: Determine where capitalization is necessary.\\nThe main areas we need to focus on for capitalization are the beginning of sentences and the proper nouns if there are any.\\nStep 3: Capitalize the beginning of the sentence\\nThe sentence begins with \\\"it\\\", so we'll capitalize the \\\"i\\\" in \\\"it\\\".\\nStep 4: Identify and capitalize proper nouns (none in this case)\\nThere are no proper nouns in this sentence, so we can move to the final step.\\nThe final result after capitalization: \\\"It uses strong and powerful scanning algorithm that offers successful recovery of deleted and lost photos.\\\"\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Why c" + }, + { + "comment": "This code discusses the differences between girls and women standing to pee. The first question asks why they can't do it while the second one is about the difficulty level. 
It explains that females have different anatomy, making it difficult due to a shorter urethra, internal opening, and other factors like controlling flow and direction of urine stream.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":117-117", + "content": "an't girls stand & pee?\\nHow difficult is it for women to pee standing up?\\nDo those questions have the same meaning?\\nOptions are: -- no. -- yes.\\n\\nAssistant: No, the two questions do not have the exact same meaning, although they are closely related. The first question specifically asks why girls cannot stand and pee, while the second question explores the level of difficulty women encounter when attempting to pee standing up.\\nTo address the first question, girls cannot stand and pee primarily due to their anatomical differences from males. Female urinary anatomy consists of a shorter urethra and a more internal urethral opening. The urethra, which is the tube that carries urine from the bladder to the outside of the body, is directed downward and forward in females, making it more challenging to aim the urine stream away from the body. Moreover, the female urethra has a wider opening and is closer to the vagina, which adds to the difficulty of controlling the flow and direction of" + }, + { + "comment": "Discussing women peeing standing up, using specific techniques and sometimes special equipment like FUDs or \"pee funnels\" for convenience and hygiene.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":117-117", + "content": " the urine stream.\\nNow, let's discuss the second question. It is difficult, but not impossible, for women to pee standing up. To do so, they would need to utilize specific techniques and sometimes even special equipment. A woman must spread her legs wide, use her fingers to separate the labia, and adjust her body position to achieve the desired angle for the urine to flow away from her body. 
For most women, this method does not come naturally and may require practice to achieve the desired outcome without causing mess or discomfort.\\nThere are devices known as female urination devices (FUDs) or \\\"pee funnels\\\" available that enable women to urinate while standing. These devices are designed to fit over the female genital area and channel the urine stream away from the body. When used correctly, FUDs can be a convenient and hygienic solution for women who prefer or need to urinate while standing up. Some women opt to use them for outdoor activities such as hiking, camping, or in situat" + }, + { + "comment": "The code is discussing the challenges faced by women in urinating standing up and explaining how female urination devices could help. The two questions presented are related but not exactly the same, as anatomical differences make it inherently more difficult for women to urinate standing up.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":117-118", + "content": "ions where cleanliness of public restrooms is a concern. These devices could make it easier for women to pee standing up, although using them may still require practice and some level of dexterity.\\nIn summary, the two questions presented - the inability of girls to stand and pee and the challenges faced by women in doing so \u2013 are closely related but not exactly the same. While anatomical differences make it inherently more difficult for women to urinate standing up, it is not impossible for them to do so, especially if they utilize specific techniques or opt for female urination devices.\"}\n{\"text\": \"System: You are an AI assistant. You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. 
You might need to use additional knowledge to answer the question.\\n\\nUser: Fact: the Earth revolves around the sun\\nQuestion: The earth revolves around\\nWhat's the answer? Possible an" + }, + { + "comment": "Carson will likely want to defend himself from Jesse's attack and potentially respond with his own actions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":118-119", + "content": "swers:\\n (A). a heat source.\\n (B). the Milky Way.\\n (C). a neighboring planet.\\n (D). the moon.\\n\\nAssistant: (A). a heat source.\\nExplanation: The fact given states that the Earth revolves around the sun. Since the sun is a heat source, option (A) is correct. The other options are incorrect for the following reasons:\\n(B) While the Earth is part of the Milky Way galaxy, it individually revolves around the sun, not directly around the entire galaxy.\\n(C) The Earth doesn't revolve around a neighboring planet. Instead, both Earth and other planets in our solar system revolve around the sun.\\n(D) The moon is the one that revolves around Earth, not the other way around.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Please answer the following question: Context: Carson dug his nails into Jesse's chest when Jesse started swinging at him. Question: What will Carson want to do next? Which one of " + }, + { + "comment": "Assistant: Answer: B: subdue Jesse\nReasoning: The context suggests that Carson is defending himself from an attack by Jesse, so subduing him would be the best course of action to prevent further harm.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":119-120", + "content": "these answers best answers the question according to the context? 
A: laugh at Jesse B: subdue Jesse C: shout in pain\\nAnswer:\\n\\nAssistant: Answer: B: subdue Jesse\\nAccording to the context provided, Carson dug his nails into Jesse's chest as a defensive response to Jesse starting to swing at him. This indicates that Carson is likely attempting to protect himself from the attack. In this situation, the best course of action for Carson would be to subdue Jesse in order to prevent further aggression and harm. So, answer B: subdue Jesse best aligns with the context given.\"}\n{\"text\": \"User: Extract the answer to the following question from the movie plot. If the question isn't answerable, please output \\\"Can't answer\\\". Question: Where was Captain Benson sent? Title: Saturn 3 Movie plot: In the distant future, an overcrowded Earth relies on research conducted by scientists in remote stations across the solar system. One of these, a small experimental hydroponics research station on Satur" + }, + { + "comment": "The code describes a scene where Captain Benson arrives at Saturn 3, a remote moon inhabited by Adam and Alex. Benson brings a robotic Demigod Series named Hector, which will likely render one of the humans obsolete as part of a government project. However, Benson is a homicidal sociopath, unbeknownst to Adam and Alex.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":120-120", + "content": "n's third moon, is run solely by Adam (Douglas) and his colleague and lover Alex (Fawcett). Adam, the younger Alex and their dog Sally enjoy their isolation, far from an overcrowded and troubled Earth. The couple have been on Saturn 3 for three years, but Alex has spent all her life in space. Alex and Adam's idyll is broken by the arrival of Captain Benson (Keitel), who has been sent to Saturn 3 as part of a project to replace at least one of the moon's scientists with a robot. Benson assembles the robot and names him Hector. 
One of the first of its kind, a \\\"Demigod Series\\\", Hector relies on \\\"pure brain tissue\\\" extracted from human fetuses and programmed using a direct link to Benson's brain. Benson states that it will render one of the crew \\\"obsolete\\\", most likely Major Adam who tells his partner that he is close to \\\"abort time\\\" according to the government on Earth. Unknown to both Alex and Adam, Benson is a homicidal sociopath who murdered the captain originally assigned to S" + }, + { + "comment": "Code appears to be extracting information about a story involving Saturn 3, a robot named Hector, and characters Benson and Alex. The robot acquires the homicidal nature of its programmer, leading to conflicts and danger for other characters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":120-121", + "content": "aturn 3 and took his place on the mission. Benson had actually failed a crucial test of psychological stability. As Benson uses the link to his brain to program the robot, Hector acquires Benson's homicidal nature and his lust for Alex. Hector initially kills Sally the dog and then assaults Benson, having learned about Benson being a murderer during the reprogramming process. Adam and Alex manage to disable the robot while it is recharging. Believing the danger over, Adam accuses Benson of gross incompetence and orders him to dismantle the robot and return to Earth when an eclipse ends. Benson obliges, but Hector is soon reassembled by the base's older robots. Resuscitated, Hector murders Benson while he is dragging Alex to his ship. Hector then blows up Benson's spacecraft before Adam and Alex can escape in it, trapping them all on...\\nAnswer:\\n\\nAssistant: Saturn 3\"}\n{\"text\": \"User: Title: Ditto Review: Ditto on the hate the Makita batteries! 
I was looking into buying some batteries b" + }, + { + "comment": "If Jim increased the speed of water flowing out of his bathroom tap, the amount of energy being transferred by the water would increase. This follows the information provided in the paragraph which states that faster-moving water has more energy.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":121-122", + "content": "ut had a bad taste of how long they last and have been debating on whether I should make a switch, they do have some great tools. But the batteries make it so that you don't want to buy anymore tools. Wake up Makita!! Is the review positive or negative?\\nA:\\n\\nAssistant: Negative\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Answer the following question: Use information from the paragraph to answer the question. Question: If Jim increased the speed of water flowing out of his bathroom tap, what would happen to the amount of energy being transferred by the water would increase or would decrease? Paragraph : Faster-moving water has more energy.\\nAnswer:\\n\\nAssistant: Step 1: Identify the key information in the question and paragraph.\\nIn the question, we see that Jim increased the speed of water flowing out of his b" + }, + { + "comment": "The code demonstrates a method for explaining the concept of increased water energy based on its speed, using a step-by-step approach. The first step involves understanding the information in the paragraph, the second is relating it to the question, and the third is answering the question. 
For the task options, the more practical choice would be \"Use clothespins or paperclips,\" as they can effectively hold the bag closed without damaging it, while using a needle and thread may not be as easy or safe for the chip bag.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":122-123", + "content": "athroom tap. In the paragraph, the key information is that faster-moving water has more energy.\\nStep 2: Relate the key information between the question and paragraph.\\nSince the water speed has increased, according to the information from the paragraph, the moving water would have more energy.\\nStep 3: Answer the question.\\nIf Jim increased the speed of water flowing out of his bathroom tap, the amount of energy being transferred by the water would increase.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Here is a goal: Keep a bag of chips closed without bag clips.\\nWhich way makes more sense to accomplish this goal?\\nChoose your answer from:\\n - Use clothespins or paperclips\\n - Use a needle and thread.\\n\\nAssistant: To accomplish the goal of keeping a bag of chips closed without bag clips, let's compare the two options " + }, + { + "comment": "The code provides two options for sealing a chip bag - using clothespins or paperclips and using a needle and thread. Clothespins and paperclips are more accessible, easy to use, reusable, and efficient while needle and thread may not be readily available and can leave holes in the bag.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":123-123", + "content": "step-by-step and justify each step:\\nOption 1 - Use clothespins or paperclips:\\n1. 
Availability: Clothespins and paperclips are more commonly found in households and offices than needle and thread, making them more accessible.\\n2. Ease of use: Clothespins and paperclips can be easily clipped onto the folded or rolled edge of the chip bag, effectively sealing it.\\n3. Reusability: Both clothespins and paperclips can be reused multiple times on different bags or for other purposes.\\n4. Time and effort: Using clothespins or paperclips is a quick and easy process that requires minimal time and effort.\\nOption 2 - Use a needle and thread:\\n1. Availability: Needle and thread may not be readily available in all households, and using it for closing a chip bag can be seen as a less practical option.\\n2. Ease of use: Using a needle and thread involves threading the needle, puncturing the bag, and sewing it closed. This may leave tiny holes in the bag and has a higher chance of spilling the chips." + }, + { + "comment": "This code provides a comparison between using a needle and thread versus clothespins or paperclips to close a bag of chips, discussing reusability, time, effort, and ease of use. It concludes that clothespins or paperclips make more sense for this purpose. The text following the code is in Turkish, mentioning an ongoing investigation by British, Russian, and German police and Interpol joining to speed up information flow.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":123-124", + "content": "\\n3. Reusability: While needle and thread can be reused, the process of opening and closing the bag each time can be tedious.\\n4. Time and effort: Using a needle and thread requires more time and effort than simply clipping something onto the bag.\\nBased on this comparison, it makes more sense to accomplish the goal of keeping a bag of chips closed without bag clips by using clothespins or paperclips. 
They are more accessible, easier to use, reusable, and require less time and effort than using a needle and thread.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Ge\u00e7ti\u011fimiz ay Londra'da radyasyon zehirlenmesinden \u00f6len eski Rus istihbarat ajan\u0131 Alexander Litvinenko cinayetini soru\u015fturan \u0130ngiliz, Rus ve Alman polisi aras\u0131nda bilgi ak\u0131\u015f\u0131n\u0131 h\u0131zland\u0131rmak amac\u0131yla Interpol'\u00fcn de devreye girmesi bekleniyor.\\nWhich language is this?\\n\\nAssistant: This language is Turkish. It's a language spoken mainly in T" + }, + { + "comment": "Code chunk 1: Describing Turkish language and its Latin alphabet usage, along with a text about Alexander Litvinenko's case and cooperation between police forces.\n\nCode chunk 2: Explaining the translation of \"Everyone's gone\" to Czech as \"V\u0161ichni ode\u0161li,\" breaking down word meanings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":124-126", + "content": "urkey and Cyprus. Turkish belongs to the Turkic language family, and it's written in the Latin alphabet. This text is about a former Russian intelligence agent, Alexander Litvinenko, who died from radiation poisoning, and the cooperation between British, Russian, and German police in the investigation, with the involvement of Interpol being expected to speed up the information exchange.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. 
Think like you are answering to a five year old.\\n\\nUser: Translate the following sentence to Czech:\\nEveryone's gone.\\n\\nAssistant: When we say \\\"Everyone's gone\\\" in Czech, we say \\\"V\u0161ichni ode\u0161li.\\\" Here's what the words mean:\\n\\\"V\u0161ichni\\\" = \\\"Everyone\\\", meaning all the people.\\n\\\"ode\u0161li\\\" = \\\"gone\\\", meaning they left or went away.\\nSo, the whole sentence \\\"V\u0161ichni ode\u0161li\\\" is telling us that all the people have left or went to another place.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. You" + }, + { + "comment": "The code discusses a situation where England manager Roy Hodgson's choice to play Raheem Sterling despite his fatigue has led to scrutiny and questions. The author argues that the full picture is not being considered when evaluating Hodgson's decision, as Sterling's performance against San Marino was impressive.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": "r goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: This could have been avoided. If Roy Hodgson had said he wanted to give Adam Lallana the chance to build on his eye-catching efforts against San Marino, we would have been none the wiser about Raheem Sterling\u2019s fatigue. Every word the England manager says is scrutinised. I can see where Hodgson was coming from but an issue has been created that both men could have done without. Had Hodgson explained he wanted to use Sterling for an impact \u2013 which he had, by winning the decisive free-kick against Estonia \u2013 there would be no quarrel. 
Now, though, Sterling is in the spotlight and contending with questions such as \u2018how can a 19-year-old be tired?\u2019 but those who believe he has made a mistake saying he wasn\u2019t in the right condition to play for England are not looking at the full picture. VIDEO Scroll down to watch Roy Hodgson reveal Raheem Sterling was too ti" + }, + { + "comment": "Sterling substitutes for Henderson in the 64th minute, discusses fatigue with Hodgson; among top England players in minutes played this season.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": "red to start. England's Raheem Sterling looks quite relaxed on the substitutes' bench during the first half against Estonia. Jordan Henderson makes way for Sterling in the 64th minute in Tallinn as England went on to beat Estonia 1-0. Sterling told England manager Roy Hodgson that he was not 100 per cent and that he was too tired to start. Of the players England used against Estonia, only Gary Cahill has played more minutes this season than Raheem Sterling. 1164 -\u00a0Gary Cahill. 1079 - Raheem Sterling. 1064 - Jack Wilshere. 1035 - Jordan Henderson. 995 - Calum Chambers. 990 - Joe Hart. 990 - Leighton Baines. 919 - Phil Jagielka. 836 - Danny Welbeck. 758 - Wayne Rooney. 741 - Fabian Delph. 538 - Adam Lallana. It doesn\u2019t matter whether you are 19 or 29, if you play nine games in a month, like Raheem did in September, it will leave a mark on you physically. So, in some respects, he has been brave to confront the issue and explain his concerns to Hodgson. Would I have ever done it? No. I fea" + }, + { + "comment": "Code discusses a player's experience and decisions, including playing injured, masking injuries, and comparing their game to another player's high-intensity style.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": "red if I missed a game, someone would come in and take my place. 
I also never wanted to give managers an excuse to leave me out, so there were times when I played in games I shouldn\u2019t have done. Was it the right thing to do? No. Injuries are what I would mask but I remember a game at Bolton in September 2006 when I was shattered, having been to the World Cup in Germany. We lost 2-0, one of the goals came from a cross into the box. I tried to jump for a header but had nothing in my legs. Had I been right, I know I would have got my head to the ball first. As a central defender, you can find a way of getting through a game, you don\u2019t have to operate at the rapier pace that Raheem moves with. His game is based around high intensity sprinting and he needs sharpness to go past opponents, with or without the ball. The England substitutes watch on as 19-year-old Sterling (second right) was rested from the starting line-up. 32\u00a0- Jagielka, Lambert. 29 - Baines. 28 - Rooney, Cahill. 27 - Hart. 2" + }, + { + "comment": "The code appears to be a text from a JSON file, discussing the management of young football players and comparing their progression, mentioning Michael Owen as an example.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": "6 - Lallana. 24 - Henderson,\u00a0Delph. 23 - Welbeck. 22 - Wilshere. 21 - Oxlade-Chamberlain. 19 - Chambers, Sterling. We want Raheem to thrill us, to be an exciting figure for many years to come. We don\u2019t want him to get to 25 and his best form be a thing of the past. If he is managed properly now and looks after himself, he won\u2019t suffer burn out like another former Liverpool and England teenage prodigy. A few weeks ago, I spent some time with Sir Alex Ferguson. During our conversation, he remembered how, in 1997, Michael Owen and I had played at the Under-20 World Cup in Malaysia. Michael came back and went straight into Liverpool\u2019s first team at the start of the season. 
By contrast, the United players in that England squad \u2013 Ronnie Wallwork and John Curtis \u2013 were both given a month off to recover but Michael was on a rapid upward curve and at the end of that campaign, he set the World Cup finals in France alight. Michael wanted to play in every game for club and country but, in October " + }, + { + "comment": "Michael Owen played too much for Liverpool and England early in his career, never getting a summer break. This likely contributed to his serious hamstring injury in 1999. Raheem Sterling is showing similar signs of fatigue from his heavy workload.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": "1998, Gerard Houllier and Roy Evans \u2013 in their brief reign as Liverpool\u2019s joint managers \u2013 decided to rest him for a UEFA Cup game against Valencia. Michael wasn\u2019t happy and said in an interview \u201cI\u2019ll rest when I\u2019m 40\u201d. Former Liverpool and Englad striker Michael Owen wanted to play every game for club and country. Sterling told Hodgson he was feeling tired and not in the right mental state during training on Saturday evening. VIDEO Sterling defends England tiredness. That comment went down well with supporters and ex-players but was he right? No. Michael played far too much for Liverpool and England too early, never getting a summer off to recharge physically or mentally. Was it a coincidence that he suffered a serious hamstring injury in April 1999? Raheem is becoming like Michael, with stature and his pace and his importance to club and country at such an early stage of his career but there have been signs over the last few weeks that such a punishing spell is beginning to catch up" + }, + { + "comment": "Player Raheem Sterling's increased game time and training schedule took a toll on his performance at the 2014 World Cup, and he was substituted in one match. 
Managers Hodgson and Rodgers will discuss managing his workload going forward.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": " with him. He may have only played two games at the World Cup but Sterling was training every day in stifling conditions, having just come off a season in which he had been a key figure in Liverpool\u2019s title charge. After three weeks off, he then went to the USA for Liverpool\u2019s pre-season tour. Don\u2019t underestimate what an impact that schedule will have had on him, physically and mentally. Figures back that up. Between May 30, 2013 and October 13, 2013, Raheem had played 615 minutes of football. In the same period this year, that figure had shot up to 1682 minutes. Sterling may have only started two games at the 2014 World Cup but the England winger was in training daily. Sterling came on in the 64th minute but still received criticism from those on Twitter for his decision. Sterling, warming up during the first half, complained of feeling tired in training ahead of the Euro 2016 qualifier. Hodgson and Brendan Rodgers will talk about how to handle him going forward but, naturally, they a" + }, + { + "comment": "Code discusses a football player's performance, his importance to the team, and how he needs time to adjust to playing consistently under pressure.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-126", + "content": "re going to want to use him when they can as he is a match-winner. That\u2019s why he played against Middlesbrough in the League Cup. Liverpool\u2019s poor start in the league meant Rodgers needed his quality. Liverpool could not have afforded to be on the end of a giant-killing that night, so the plan would have been to get the game won and get Sterling off early. As it was, things never went to plan and he subjected to 120 minutes. Since then, his performances have been notably lacking sparkle. 
Sterling retweeted those criticising him online before writing: 'Now listening to... Excuse me for being human' His importance at Anfield is vital and that is why, at the end of last season, Rodgers called him \u201cthe best young player in Europe\u201d. He was dazzling in that spell but that was a spell when Liverpool were only playing one game a week. This is the first time he has played consistently in games of this pressure and will take some adjustment. But If he is become one of the top players in the world" + }, + { + "comment": "This code appears to be related to a news article discussing Raheem Sterling's schedule and its impact on his performance. The assistant is providing highlights for the article, emphasizing the importance of managing Sterling's schedule, ensuring proper rest, and communication between managers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":126-127", + "content": ", he will ultimately have to get used to an unrelenting schedule.\\nWrite highlights for this article.\\n\\nAssistant: 1. Raheem Sterling faced controversy over expressing fatigue and not being in the right condition to play for England.\\n2. Sterling's game requires high intensity sprinting and sharpness to go past opponents.\\n3. Managing Sterling's schedule and ensuring proper rest can prevent burnout and long-term injuries.\\n4. Sterling's importance to both Liverpool and England at an early stage in his career puts more pressure on him.\\n5. Communication between England manager Roy Hodgson and Liverpool manager Brendan Rodgers is crucial to handle Sterling's development and progress.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: By. Thomas Burrows for MailOnline. 
Cameraman Huw Walters allegedly forged his wife's signature " + }, + { + "comment": "Award-winning cameraman, Huw Talfryn Walters, allegedly forged his wife's signature to pay off \u00a359,000 in debts and then divorced her, spending the money on camera equipment. The case was presented to a new jury due to an earlier failed majority verdict.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":127-127", + "content": "to pay off \u00a359,000 in debts. He denies three counts of fraud at Cardiff Crown Court. An award-winning cameraman forged his wife's signature to pay off \u00a359,000 in debts, before divorcing her, a court heard. Huw Talfryn Walters, 50, allegedly spent the money on buying expensive camera and photographic equipment. But the court heard the freelance cameraman kept his spiralling money problems secret from his wife, Helen Bowen. The case was brought before a new jury after the original jury failed to reach a majority verdict in March this year. Cardiff Crown Court heard yesterday how Walters forged Miss Bowen's signature three times to take \u00a359,000 from their joint mortgage on their marital home in Rhiwbina, Cardiff. During one application, he wrote: 'We need these funds urgently, so your speed in these matters would be welcome.' He then ended their 14-year marriage after telling his wife he needed to put their house up for sale. Walters allegedly told Miss Bowers: 'I'm divorcing you because " + }, + { + "comment": "Code snippet describes a court case where the defendant, a freelance cameraman, is accused of increasing his mortgage by \u00a359,000 using a forged signature without his then-wife's knowledge.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":127-127", + "content": "if I don't I'm going to go bankrupt.' Prosecutor John Davies said: 'Miss Bowen made a career change around 2005 and after that left the mortgage and bills to her husband who she trusted. 
'They separated in December 2011 and not long after that she became aware the mortgage had increased without her agreement. 'She contacted the bank to obtain copies of the three withdrawals which increased the size of the mortgage by \u00a359,000. 'She realised the signatures on them didn't belong to her - they were written by her husband without her knowledge while they still lived together.' Freelance cameraman Walters has worked on programmes including the BBC Proms, Doctor Who and Channel 4's history show Time Team. He has won two Welsh Baftas for his cinematography and camera work in recent years. The court heard Walters spent thousands of pounds on expensive camera equipment. Mr. Davies told the jury: 'A total of \u00a359,000 was acquired using the fake. signature of his then wife. He entered each signatur" + }, + { + "comment": "The code is extracting information from the \"RWKV-LM/RWKV-v5/demo.jsonl\" file at line 127-127, where it talks about a court case involving Huw Talfryn Walters who forged his wife's signature to take \u00a359,000 from their joint mortgage and denies three charges of fraud.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":127-127", + "content": "e having not. consulted her or asked to consent.' Scroll down for video. Cardiff Crown Court (pictured) heard how Walters forged Miss Bowen's signature three times to take \u00a359,000 from their joint mortgage on their marital home in Rhiwbina, Cardiff. The court heard from a forensic handwriting expert, who proved Helen Bowen's signature had not been written by her - and was most likely written by her husband. Miss Bowers, a professional reflexologist, said she only discovered her husband had forged her signature a few months after their separation - when she was contacted by her mortgage provider. When asked if she agreed to the mortgage withdrawals, she replied: 'No, not at all. If I had been asked I would not have agreed to them.' 
Walters, now of Maesteg, in Bridgend, denies three charges of fraud. The trial continues. Sorry we are not currently accepting comments on this article.\\nHere is a summary of the highlights for this article:\\n\\nAssistant: Cameraman Huw Talfryn Walters is accu" + }, + { + "comment": "This code is a JSON file containing text information about various subjects, likely for data storage or processing purposes. Each entry in the file represents a separate piece of information, with the \"text\" field providing specific details for each entry.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":127-128", + "content": "sed of forging his wife's signature to pay off \u00a359,000 in debts before divorcing her. Walters allegedly spent the money on expensive camera equipment and kept his financial issues a secret from his wife, Helen Bowen. The case is being heard at Cardiff Crown Court after the original jury failed to reach a majority verdict in March. Walters is accused of forging Bowen's signature three times to take money from their joint mortgage. He denies three charges of fraud, and the trial is ongoing.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Staff is always attentive and friendly and on occasion the owner stops by to chit chat and see how service and food is, which is nice to see! Rose and Brandon are always a pleasure to be around and give great service and conversation! Food is excellent as usual, great menu for a bar! Friday h" + }, + { + "comment": "The code is analyzing a review to determine if it's positive or negative by identifying key points and sentiments. It found the review contains positive feedback, so the answer is II. 
positive.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":128-129", + "content": "appy hour is outstanding, and the bar is not crazy loud even though it is very busy!\\nIs this review positive or negative?\\nPick your answer from:\\nI. negative.\\nII. positive.\\nAnswer:\\n\\nAssistant: To determine if the review is positive or negative, I'll analyze the key points mentioned in the review.\\n1. \\\"Staff is always attentive and friendly\\\" - Positive\\n2. \\\"the owner stops by to chit chat and see how service and food is\\\" - Positive\\n3. \\\"Rose and Brandon are always a pleasure to be around and give great service and conversation!\\\" - Positive\\n4. \\\"Food is excellent as usual, great menu for a bar!\\\" - Positive\\n5. \\\"Friday happy hour is outstanding\\\" - Positive\\n6. \\\"the bar is not crazy loud even though it is very busy\\\" - Positive\\nBased on the analysis, the review contains positive feedback and sentiments.\\nAnswer: II. positive.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Le message" + }, + { + "comment": "The code snippet appears to be a part of a text file containing various texts in different languages, including French and English. It also includes a multiple-choice question related to the movement of a toddler on different surfaces with varying resistance levels.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":129-131", + "content": " de l'orateur au Gouvernement espagnol, c'est que Gibraltar ne fera jamais partie de l'Espagne, que la d\u00e9colonisation du territoire aura lieu conform\u00e9ment \u00e0 la volont\u00e9 de son peuple et qu'il n'y a rien que l'Espagne puisse faire \u00e0 cet \u00e9gard.\\nWhich language is this?\\n\\nAssistant: This language is French. 
It's a beautiful language spoken mostly in France, but also in many other countries like Canada, Belgium, and some African countries. The message talks about Gibraltar, which is a special place, and how it won't be part of Spain.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Choose between \\\"sandbox\\\" and \\\"patchy grass\\\". Question: The toddler moved faster in the sandbox than in the patch of grass because it had (A) more resistance (B) less resistance\\n\\nAssistant: The toddler moved faster in the sandbox than in the patch of grass because it had (B) less resistance.\"}\n{\"text\": \"System: You are an AI assistant. You" + }, + { + "comment": "Analyzing the sentiment of a tweet requires considering context and interpreting text accurately, which may involve corrections for errors. This includes examining surrounding context, the user's profile, and recent events to provide an accurate analysis.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":131-131", + "content": " will be given a task. You must generate a detailed and long answer.\\n\\nUser: How would one describe the sentiment of this tweet?\\n@clairemsp tians alot\\n\\nAssistant: To provide an in-depth description of the sentiment of the tweet \\\"@clairemsp tians alot\\\", one needs to first dissect and analyze the various elements present in it. There are a few important assumptions we have to make in order to analyze the tweet, as the text has some errors and is therefore open to interpretation. \\n1. Context: It is crucial to consider the context surrounding the tweet, such as the subject matter, any previous conversation, the Twitter user's profile, and recent related events. A sentiment analysis based solely on the text may provide a skewed or misleading interpretation.\\n2. Text interpretation: Taking the given tweet at face value, it appears to contain a typing error. 
We can assume that the correct version could be \\\"@clairemsp means alot\\\" or \\\"@clairemsp thanks a lot\\\", with the latter being a" + }, + { + "comment": "Code analyzes a tweet and determines that it expresses gratitude (thanks) with a positive sentiment due to the user addressing \"clairemsp\" and stating something means a lot or they are thankful. The tone of the tweet is appreciative, though lacking specific details.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":131-131", + "content": " more plausible interpretation as 'tians' could possibly be a misspelled or typo version of 'thanks'. This interpretation suggests a message of gratitude. Semantically, this phrase signifies a positive sentiment.\\n3. User's Tone: Analyzing the specific tweet, we can observe that the user is directly addressing \\\"clairemsp\\\" and sharing that something \\\"means a lot\\\" or they are thankful. This could imply that clairemsp has contributed to something valuable or helpful; however, the tweet lacks specific details. Despite that, overall, the tone of the tweet appears to be appreciative and possibly positive.\\nConsidering these elements, the sentiment of the tweet can be described as predominantly positive. Here, the author is expressing gratitude to the person they've mentioned in the tweet and is likely appreciative of their help, contribution, or impact. It is essential to note that understanding the broader context and tone of the conversation or topic would provide better insight and ac" + }, + { + "comment": "The code is generating a response to determine if a hypothesis is true based on the given paragraph. It involves reading and understanding the provided text, determining its relevance to the hypothesis, and then responding whether it's true or not.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":131-132", + "content": "curacy in determining the tweet sentiment. 
However, based on the available information, it can be concluded that the sentiment expressed in this tweet appears to be positive.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Read the following paragraph and determine if the hypothesis is true:\\nIn mathematics, the Vitali\u2013Carath\u00e9odory theorem is a result in real analysis that shows that, under the conditions stated below, integrable functions can be approximated in L from above and below by lower- and upper-semicontinuous functions, respectively. It is named after Giuseppe Vitali and Constantin Carath\u00e9odory.\\nHypothesis: Vitali\u2013Carath\u00e9odory theorem involves mathematics\\n\\nAssistant: After reading the paragraph provided, it can be inferred that the hypothesis is true. The Vitali\u2013Carath\u00e9odory theorem involves mathematics, particularly in the field of real analysis. Real analysis is a branch of mathematics that deal" + }, + { + "comment": "This code discusses the Giuseppe Vitali and Constantin Carath\u00e9odory theorem in mathematics. It explains that under specific conditions, integrable functions can be approximated by lower- and upper-semicontinuous functions from above and below, respectively. Lower-semicontinuous functions have limit inferior values while upper-semicontinuous functions have limit superior values.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":132-132", + "content": "s with the study of real numbers, their properties, and the properties of functions and sequences defined on the set of real numbers.\\nThe theorem is named after Giuseppe Vitali and Constantin Carath\u00e9odory, two mathematicians who studied and made significant contributions to this area of mathematics. 
The theorem states that under certain conditions, integrable functions (i.e., functions whose integrals or areas under the curve exist) can be approximated in the L space (a function space in mathematics that deals with the study of certain types of functions; specifically, functions that are measurable and have finite norms) from above and below by lower- and upper-semicontinuous functions, respectively.\\nLower-semicontinuous functions are functions that have the property that the limit inferior (or greatest lower bound) of the function at a point is less than or equal to the value of the function at that point. Upper-semicontinuous functions, on the other hand, have the property that the" + }, + { + "comment": "This code snippet describes the Vitali\u2013Carath\u00e9odory theorem, an important result in mathematics related to integrable functions and their approximations using lower- and upper-semicontinuous functions. The hypothesis of this theorem is true, as it is a part of mathematical studies involving real numbers and real-valued functions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":132-133", + "content": " limit superior (or least upper bound) of the function at a point is greater than or equal to the value of the function at that point.\\nIn summary, the Vitali\u2013Carath\u00e9odory theorem is an important result in mathematics dealing with the properties of integrable functions, their approximations using lower- and upper-semicontinuous functions, and their applications to the study of real numbers and real-valued functions. As such, the hypothesis is true: the theorem is indeed a part of mathematics.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: One of the best places to stay in Vegas!!! Love the location, the decor, the fountain with the synchronicity of the music that you can hear from the room. 
The shops are great and the shows that they offer and to me the casino is the best compare to the others. The pool area is huge with different areas and cabanas are great and spacious. Great restaurants and among all of th" + }, + { + "comment": "This code is a JSON text chunk from the RWKV-LM/RWKV-v5/demo.jsonl file. It presents a user review of a hotel and asks whether it's positive or negative, followed by a task for an AI assistant to continue writing a given text about Mary healing from her fallout with Tanya.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":133-134", + "content": "e hotels that I have been I think it is family oriented without kids bothering .... strange but true!!! Love the hotel!!!\\nIs this review positive or negative?\\nSelect from:\\n(i) negative\\n(ii) positive\\nAnswer:\\n\\nAssistant: (ii) positive\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Continue writing the following text.\\nIt took a long time for Mary to heal from her fallout with Tanya as\\n\\nAssistant: the depth of their friendship had made the betrayal even more painful. Mary could still remember the day when Tanya revealed her true colors, leaving her in utter shock. The first step in moving forward was for Mary to fully grasp the situation and come to terms with the fact that Tanya was not the person she thought she knew.\\nStep 1: Acceptance\\nIn order to heal, Mary had to accept that Tanya was a completely different pe" + }, + { + "comment": "Mary goes through the process of acceptance, emotional expression, and seeking support after discovering her friend Tanya's betrayal.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":134-134", + "content": "rson from the loyal friend she had trusted for so long. 
It was challenging as she went over the countless memories they had shared, wondering if Tanya had been pretending the entire time. The process of acceptance was slow and difficult, but it was essential in order for Mary to let go and find closure.\\nStep 2: Emotional Expression\\nMary found it crucial to express and release the hurt, anger, and sadness that she felt as a result of Tanya's actions. She chose to write in a journal, where she could vent her feelings honestly and privately, without the fear of judgment from others. This outlet allowed her to process her emotions and gain a clearer understanding of her thoughts and experiences.\\nStep 3: Seeking Support\\nAs Mary worked through her feelings, she reached out to her friends and family for support. They provided her with the much-needed love, encouragement, and a sense of belonging as she faced her challenging emotional journey. Their presence acted as a reminder that despit" + }, + { + "comment": "This code seems to be a narrative describing the process of healing and forgiveness following the loss of a friendship. It outlines steps such as acknowledging the pain, seeking support from others, and eventually finding the ability to forgive before moving forward and embracing personal growth and lessons learned from the experience.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":134-134", + "content": "e the loss of one friendship, she still had others who cared for her deeply.\\nStep 4: Forgiveness\\nAlthough it took a considerable amount of time, Mary eventually reached a point where she was able to forgive Tanya. Forgiveness didn't mean that she condoned Tanya's actions, but rather, she let go of the resentment and negative emotions she had been holding onto. 
Forgiving Tanya allowed Mary to free herself from the hold that the betrayal had on her life, and in turn, regain her emotional well-being.\\nStep 5: Moving Forward\\nWith forgiveness achieved, Mary began to rebuild her life without Tanya. She focused on strengthening her relationships with her existing support network, and even forged new connections along the way. While the betrayal was painful, it ultimately pushed Mary towards personal growth and lessons that she would carry with her throughout her life.\\nIn the end, the long and intertwining road towards healing taught Mary several valuable lessons about trust, friendship, a" + }, + { + "comment": "The code is parsing two contexts. The first one describes a 1986 film named \"Billy Galvin\" and the actors involved, while the second one talks about a playground at Disney's Hollywood Studios based on the movie \"Honey, I Shrunk the Kids.\" There are no questions to answer nor is there any generated answer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":134-135", + "content": "nd resilience. As she closed the chapter on her fallout with Tanya, Mary emerged stronger and more self-aware, ready to embrace new experiences and authentic connections with those who truly deserved her trust.\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: Definition: In this task, you are given a set of context paragraph and some supporting facts to answer a question. Your task is to generate answer for given question based on set of context paragraphs and supporting facts.\\nInput: Context_1 : Billy Galvin is a 1986 film directed by John Gray. It stars Karl Malden and Lenny von Dohlen. Context_2 : Honey, I Shrunk the Kids: Movie Set Adventure was a playground at Disney's Hollywood Studios in Orlando, Florida. It was located in the Streets of America area and was based on Disney's 1989 film, \\\"Honey, I Shrunk the Kids\\\". 
The playground, which opened almost a year after the film, featured 30 ft blades of grass, and was themed as a movie set " + }, + { + "comment": "This code appears to contain plot summaries and information about several movies, potentially from a movie database or similar resource. The code seems to be structured as a series of contexts, each providing details about a specific film including its genre, setting, main cast, and storyline. These contexts could be used for various purposes such as generating movie descriptions, recommendations, or providing information to users in a conversation setting.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":135-135", + "content": "for the giant backyard scenes from the film. It was closed in April 2016 to make room for the construction of on the same site. Context_3 : Movement and Location (also called Movement + Location) is an American science fiction movie set in modern-day Brooklyn, directed by Alexis Boling. It stars Bodine Boling, Catherine Missal, Brendan Griffin, Anna Margaret Hollyman, David Andrew Macdonald and John Dapolito. \\\"Movement and Location\\\" tells the story of Kim Getty, an immigrant from 400 years in the future who is sent back in time to live an easier life. It premiered at the 2014 Brooklyn Film Festival where it won the Audience Award, Best Screenplay and Best Original Score. Context_4 : Dracula\u2019s Widow is a 1988 vampire thriller film directed by Christopher Coppola , written by Tom Blomquist as Kathryn Ann Thomas, and starring Sylvia Kristel, Josef Sommer and Lenny von Dohlen. 
Kristel, in the titular role, goes on a killing spree in a seedy 1980s Hollywood, while in search of her husband" + }, + { + "comment": "This code snippet contains eight contextual descriptions, including a film synopsis (\"Entertaining Angels: the Dorothy Day Story\"), actor bios (Lenny Von Dohlen and Moira Kelly), director information (Scott Fivelson), and a movie summary (\"Electric Dreams\").", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":135-135", + "content": ". Context_5 : Entertaining Angels: the Dorothy Day Story is a 1996 independent film about the life of Dorothy Day, the journalist turned social activist and founder of the \\\"Catholic Worker\\\" newspaper. The film stars Moira Kelly as Day, Heather Graham, Lenny Von Dohlen and Martin Sheen. Context_6 : Lenny Von Dohlen (born December 22, 1958) is an American film, television and stage actor, known for his performance as architect Miles Harding in the film \\\"Electric Dreams\\\" and as the orchid-growing agoraphobic Harold Smith in the television series \\\"Twin Peaks\\\". Von Dohlen currently resides in New York and Los Angeles. Context_7 : Scott Fivelson is the writer/director of \\\"Near Myth: The Oskar Knight Story\\\", a Hollywood biopic starring Lenny Von Dohlen as legendary director, \\\"Oskar Knight\\\". The film won Scott Fivelson the \\\"Breakthrough Director Spotlight\\\" award at the 2016 Hollywood Reel Independent Film Festival. Context_8 : Electric Dreams is a 1984 American-British science fict" + }, + { + "comment": "Code describes a romantic comedy-drama film set in San Francisco, featuring a love triangle involving a man, a woman, and a personal computer. 
The story follows high school friends, Reggie's unrequited love, and the relationships of Diego and Bonnie.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":135-135", + "content": "ion romantic comedy-drama film set in San Francisco, California, that depicts a love triangle between a man, a woman and a personal computer. It stars Lenny Von Dohlen, Virginia Madsen, Maxwell Caulfield, and the voice of Bud Cort and was directed by Steve Barron. It was the first film released by the Virgin Films production company. Context_9 : The film concerns a group of high school buddies before moving the action forward to reveal the characters two decades later. Reggie (Adam Lamberg) is the wild card of this group of friends. Perpetually single and lacking ambition, he is prone to making inappropriate comments to girls. When the object of his affection breaks up with her bad-boyfriend, Reggie clumsily asks for a date. Things almost happen, but not quite, and the promise of a kiss still haunts Reggie (Lenny Von Dohlen) 20 years later. On the other hand, Diego (Giancarlo Carmona) and Bonnie (Caker Folley), dated all through high school. But when Diego went to Mexico to help his fa" + }, + { + "comment": "This code contains various facts and context about characters, movie locations, and actors. It mentions a lawyer named Bonnie, her relationship issues, and a movie set called Alamo Village in Texas. Additionally, it provides information about actor Lenny Von Dohlen and his roles in \"Electric Dreams\" and \"Twin Peaks\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":135-135", + "content": "mily, Bonnie (Cynthia Gibb) lost whatever faith she had in men. Now she's a tough lawyer with a young son and a husband she can't trust (John Schneider). When Diego (Eddie Velez) returns and unexpected emotions surface. 
Finally, Morgan and Dennice were high school sweethearts who married as teens and are still happy together, but secrets suddenly overwhelm them. Context_10 : Alamo Village is a movie set and tourist attraction north of Brackettville, Texas, United States. It was the first movie location built in Texas, originally constructed for and best known as the setting for \\\"The Alamo\\\" (1960), directed by John Wayne and starring Wayne, Richard Widmark, Laurence Harvey and Frankie Avalon. fact_1 : Lenny Von Dohlen (born December 22, 1958) is an American film, television and stage actor, known for his performance as architect Miles Harding in the film \\\"Electric Dreams\\\" and as the orchid-growing agoraphobic Harold Smith in the television series \\\"Twin Peaks\\\". fact_2 : Electric Dr" + }, + { + "comment": "Code snippet describes a movie set in San Francisco, California where Lenny Von Dohlen plays an architect role in the 1984 film \"Electric Dreams\". The user has to generate a title for this information.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":135-136", + "content": "eams is a 1984 American-British science fiction romantic comedy-drama film set in San Francisco, California, that depicts a love triangle between a man, a woman and a personal computer. Question: Lenny Von Dohlen was an architect in the 1984 movie set in what location?\\nOutput:\\n\\nAssistant: San Francisco, California\\nI used the definition to understand that I need to find the movie in which Lenny Von Dohlen played an architect role in a 1984 movie. I found this information in Context_8, where \\\"Electric Dreams\\\" is mentioned as a 1984 movie set in San Francisco, California, and Lenny Von Dohlen is mentioned as playing an architect.\"}\n{\"text\": \"System: User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. 
While answering think step-by-step and justify your answer.\\n\\nUser: You will be given a definition of a task first, then some input of the task.\\nGenerate an appropriate title for the given text. The generated title must be " + }, + { + "comment": "Code snippet discusses possible Google acquisition of Fitbit, with no deal confirmed yet. Reuters reports negotiations, but neither company has officially commented.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":136-136", + "content": "short and include the main topic of the text. The preferred titles are under fifteen words.\\nDave LeeNorth America technology reporter Reuters reported that the companies were in negotiations but no deal had been agreed. The move would enable Google to produce its own smartwatches. It already develops a version of Android for wearables. Fitbit's share price increased by 19% before being suspended. Google has not commented. A spokeswoman for Fitbit said it \\\"does not comment on rumours or speculation\\\". Reuters did not say what sum had been offered. It emerged in September that Fitbit, currently valued by the markets at around $1.4bn (\u00a31.1bn) after Monday's surge, was touting itself for a sale. Fitbit, which in its latest quarterly earnings posted a loss of $68.5m, is looking precarious as a standalone entity since bigger firms have gained a strong footing in wearable technology. \\\"A key tipping point is likely to have been Apple's decision to price its Series 3 Apple Watch at $199/\u00a3199" + }, + { + "comment": "Leo Gebbie from CCS Insight comments on Fitbit's pressure from low-cost rivals, mentioning the company's attempt to diversify into sleep monitoring and diabetes management. 
The deal with Google could boost Wear OS platform if confirmed, and the market is predicted to sell 142 million devices this year at $17.1 billion value.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":136-136", + "content": ",\\\" said Leo Gebbie, from the consultancy CCS Insight. \\\"That will have put immense pressure on Fitbit's own products, which are already feeling the challenge from low-cost rivals such as [China's] Xiaomi.\\\" In an attempt to diversify its income, Fitbit has expanded into other areas - including monitoring users' breathing during sleep, and diabetes management. But the firm trails Huawei, Apple and Xiaomi in fitness tech sales. For Google the deal, if confirmed, would represent a renewed effort to develop its Wear OS platform. \\\"Although Google has been lukewarm in its commitment to wearables recently, this indicates it is serious about the segment and could provide a strong boost to its ambitions,\\\" Mr Gebbie added. CCS Insight predicted 142 million wearable devices would be sold worldwide this year, at a value of $17.1bn. Meanwhile, Google's parent company Alphabet released its third quarter results which showed a drop in net profit to $7bn in the three months to 30 September. Analyst" + }, + { + "comment": "This code appears to be a JSON snippet containing text and output related to Google financial data and potential acquisition news, along with a user task of translating a statement about human trafficking and social issues in the European Union.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":136-137", + "content": "s had been expecting profit of $8.8bn. It fell from $9bn in same period last year when changes to US tax law had boosted Alphabet's bottom line. Total third quarter revenue rose by 20% to $40.5bn.\\nOutput:\\n\\nAssistant: \\\"Google in Talks to Acquire Fitbit for Wearable Tech Expansion\\\"\"}\n{\"text\": \"System: You are an AI assistant. 
User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Translate to Finnish:\\nTrafficking in human beings and prostitution continue to reach alarming levels in the European Union, and further socio-economic policies, high-quality public services and effective measures are required to solve the problems of poverty and social exclusion.\\n\\nAssistant: Step 1: Identify key phrases and terms to be translated.\\n- Trafficking in human beings\\n- Prostitution\\n- Alarming levels\\n- European Union\\n- Socio-economic policies\\n- High-quality public services\\n- Ef" + }, + { + "comment": "This code is translating key phrases and terms related to human trafficking, prostitution, socio-economic policies, and public services. It then constructs a coherent Finnish sentence using the translated phrases and maintains accurate grammar. The translation highlights that human trafficking and prostitution are at alarming levels in the European Union, requiring more social and economic policies, high-quality public services, and effective measures to combat poverty and social exclusion.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":137-137", + "content": "fective measures\\n- Poverty\\n- Social exclusion\\nStep 2: Translate each key phrase and term to Finnish\\n- Trafficking in human beings: ihmiskauppa\\n- Prostitution: prostituutio\\n- Alarming levels: h\u00e4lytt\u00e4v\u00e4t tasot\\n- European Union: Euroopan unioni\\n- Socio-economic policies: sosioekonomiset politiikat\\n- High-quality public services: korkealaatuiset julkiset palvelut\\n- Effective measures: tehokkaat toimenpiteet\\n- Poverty: k\u00f6yhyys\\n- Social exclusion: sosiaalinen syrj\u00e4ytyminen\\nStep 3: Build a Finnish sentence using the translated phrases and terms, maintaining a coherent structure and accurate grammar.\\nIhmiskauppa ja prostituutio jatkavat 
h\u00e4lytt\u00e4vien tasojen saavuttamista Euroopan unionissa, ja ongelman ratkaisemiseksi k\u00f6yhyyden ja sosiaalisen syrj\u00e4ytymisen osalta tarvitaan lis\u00e4\u00e4 sosioekonomisia politiikkoja, korkealaatuisia julkisia palveluja ja tehokkaita toimenpiteit\u00e4.\\nTranslation: Ihmiskauppa ja prostituutio jatkavat h\u00e4lytt\u00e4vien tasojen saavuttamista Euroopan unionissa, ja ong" + }, + { + "comment": "Code snippet describes negative feedback on a television show \"Cavemen\", mentioning poor quality, lack of storyline and unsatisfactory cast selection. User gives it a 1 out of 10 rating, expressing disappointment.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":137-138", + "content": "elman ratkaisemiseksi k\u00f6yhyyden ja sosiaalisen syrj\u00e4ytymisen osalta tarvitaan lis\u00e4\u00e4 sosioekonomisia politiikkoja, korkealaatuisia julkisia palveluja ja tehokkaita toimenpiteit\u00e4.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Cavemen was by far the biggest load of crap I have ever wasted my time watching. This show based on the Geico commercials is less entertaining then an actual 30 sec ad for Geico. The makeup was half ass-ed to say the least, hard to imagine a caveman with prefect white teeth even after going to the dentist. This show could of had potential for a funny series if they could of gotten the cast from the commercials, that in it self makes for a lousy show. Perhaps if the writers were the same from the Geico ads this may of had a chance, instead the pilot lacked a good story line. I give this show a 1 out of 10, I would of liked to put a zero out of 10 but that was not an option. I pray for a qu" + }, + { + "comment": "The provided code appears to contain various chunks of text, with the first segment being a negative review for a TV show and the second segment being a translation task. 
The sentiment analysis is conducted on the first chunk where it determines that the sentiment of the review is negative, while in the second segment, the AI is tasked to translate a Romanian sentence into English.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":138-140", + "content": "ick death to this show, I'd give it less then 5 episodes before it dies a deserving death.\\nIs the sentiment of this review positive or negative?\\n\\nAssistant: The sentiment of this review is clearly negative. The reviewer expresses strong dissatisfaction with the show, stating it was a waste of time, criticizing the makeup and casting choices, and wishing for it to quickly end. They also give it a low rating, indicating their overall disapproval of the show.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: \u00cen acest scop - dup\u0103 cum a\u0163i propus - trebuie s\u0103 \u00eencepem negocierile cu pre\u015fedin\u0163ia Consiliului c\u00e2t mai repede posibil, pentru a hot\u0103r\u00ee o serie de modific\u0103ri.\\nCould you please translate this to English?\\n\\nAssistant: For this purpose - as you have suggested - we must begin negotiations with the presidency of the Council as soon as possible, in order to decide on a series of changes.\"}\n{\"text\": \"System: You a" + }, + { + "comment": "AI assistant is given a task to answer a question based on the provided context. It reads and understands the context, identifies key points, matches relevant information to answer the question, and forms an answer accordingly.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":140-140", + "content": "re an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. 
While performing the task think step-by-step and justify your steps.\\n\\nUser: Answer the following question: Question: \\\"Sassou and the Soviet union signed a pact based on what?\\\" Context: \\\"Sassou Nguesso aligned the country with the Eastern Bloc and signed a twenty-year friendship pact with the Soviet Union. Over the years, Sassou had to rely more on political repression and less on patronage to maintain his dictatorship.\\\" Answer:\\nAnswer:\\n\\nAssistant: Sassou and the Soviet Union signed a pact based on a twenty-year friendship and alignment with the Eastern Bloc.\\nStep 1: Read and understand the given context.\\nStep 2: Identify the key points - Sassou Nguesso, Eastern Bloc alignment, twenty-year friendship, Soviet Union.\\nStep 3: Determine the main focus of the question - wanting to know what the pact was based on.\\nStep 4: Match the relevant key points from the context " + }, + { + "comment": "The code represents a step-by-step process of analyzing a user's query and evaluating the suggested answer to determine if it is valid for the given question about where statutes are formed.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":140-141", + "content": "with the question's focus.\\nStep 5: Formulate a concise answer using the matched key points from the context.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Options are:\\n- No;\\n- Yes;\\nI am verifying the answers generated by an automatic system to the following question: where are statutes formed Suggested answer: This word is used in contradistinction to the common law . Should I validate this answer?\\nAnswer:\\n\\nAssistant: Step 1: Analyzing the user's query\\nThe user has provided two options - 'No' and 'Yes.' They are asking if they should validate the given answer to a question. 
The question is about where statutes are formed, and the suggested answer is related to the common law.\\nStep 2: Evaluating the suggested answer\\nThe suggested answer does not specifically address where statutes are formed; it only provides a c" + }, + { + "comment": "Code describes a multiple-choice question about where statutes are formed, providing the correct answer but also explaining why the suggested answer is insufficient. The code also instructs to explain the answer like speaking to a five-year-old.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":141-142", + "content": "omparative aspect with the common law. Thus, it is insufficient to answer the question correctly.\\nStep 3: Conclusion\\nSince the suggested answer does not adequately address the question, the appropriate response should be 'No.' It is recommended not to validate this answer because it does not fully answer the question about where statutes are formed.\"}\n{\"text\": \"System: You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. Think like you are answering to a five year old.\\n\\nUser: Context: He doubled up in her arms , then went limp . His body rolled over , his face wedging in the roots of a drift stump . Rin struggled to turn him over again ; small pebbles stuck to the bared front of his body . One hand still clamped over the broad smear of blood .\\nQuestion Why was the hand still clamped over the broad smear of blood ?\\nPossible answers:\\nSelect from the following.\\" + }, + { + "comment": "Code description: This code snippet is part of a multiple-choice question where the AI has to find the correct answer based on the given passage. It then explains why the other choices are incorrect. In this case, the correct answer is 'c). 
It was painful', as it aligns with the information provided in the passage about holding onto the wound due to pain. Choices 'a' and 'b' are incorrect because there is no mention of water temperature or embarrassment in the passage related to the situation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":142-143", + "content": "na). The water was cold on the wound and it hurt ..\\nb). he was embarrased because he was bleeding ..\\nc). It was painful ..\\nd). None of the above choices ..\\nThe answer:\\n\\nAssistant: c). It was painful.\\nExplanation: The hand was still clamped over the broad smear of blood because it was painful. When someone is injured and bleeding, they often hold onto the wound to try to alleviate the pain or to stop the bleeding. \\na) is wrong because the passage does not mention anything about the water temperature affecting the wound.\\nb) is also wrong because there is no evidence in the passage that the person would be embarrassed about bleeding.\\nd) is incorrect because one of the choices, in this case, choice c, is the correct answer.\"}\n{\"text\": \"System: You are an AI assistant. You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). Then explain why other answers are wrong. You might need to use additional knowled" + }, + { + "comment": "The code is asking for the sentiment of a review and provides two options, negative or positive. The review talks about a disappointing dining experience at a restaurant, with issues related to food quality and service. Therefore, the correct answer is A. negative.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":143-143", + "content": "ge to answer the question.\\n\\nUser: Had dinner last night at Talavera. Very disappointed. Restaurant is beautiful with high end touches. Poor marks for food quality and service. 
My medium rare buffalo tenderloin was significantly overcooked. No pink was visible. The veggies, on the contrary were undercooked. The carrots were warm on the outside and cold in the middle. I never send food back but was ready to make an exception in this case. However, the waitstaff was nowhere to be found. After receiving my entree 40 min elapsed before anyone followed up to see how we were doing. I don't necessarily blame the servers as they seemed to be significantly understaffed. I expect much, much more from an expensive restaurant at a Four Seasons resort. Extremely disappointing.\\nChoose the sentiment of this review?\\nPick your answer from: A. negative. B. positive.\\n\\nAssistant: A. negative\\nThe sentiment of this review is negative because the reviewer expresses disappointme" + }, + { + "comment": "This code represents a user review of Talavera, expressing dissatisfaction with various aspects such as food quality and service. It also mentions expectations from an expensive restaurant at a Four Seasons resort.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":143-144", + "content": "nt in multiple aspects of their experience at Talavera, such as poor food quality, overcooked and undercooked dishes, and slow service. They also mention that they expect more from an expensive restaurant at a Four Seasons resort, further emphasizing their dissatisfaction.\"}\n{\"text\": \"System: You are an AI assistant, who knows every language and how to translate one language to another. Given a task, you explain in simple steps what the task is asking, any guidelines that it provides. 
You solve the task and show how you used the guidelines to solve the task.\\n\\nUser: Detailed Instructions: In this task, we can are given an english sentence and the goal is to generate a text in 'turkish' language that is a faithful translation of the input sentence\\nQ: CHP\\nA:\\n\\nAssistant: Task Explanation: The task is to translate an English sentence into Turkish, ensuring that the translation is accurate and stays true to the original sentence's meaning.\\nGiven English Sentence: \\\"CHP\\\"\\nGuidelines:\\n" + }, + { + "comment": "This code appears to be extracting information from a JSONL file (RWKV-LM/RWKV-v5/demo.jsonl) and translating English sentences into Turkish while identifying abbreviations related to the Turkish context. It also uses guidelines to determine if translation is necessary or not, and provides recommendations based on given reviews.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":144-146", + "content": "1. Understand the context of the given English sentence or abbreviation.\\n2. Translate the English sentence into Turkish.\\nSolution:\\nIn this case, \\\"CHP\\\" is an abbreviation for the \\\"Cumhuriyet Halk Partisi\\\" which is a political party in Turkey. It is not an English sentence, but we can still provide the translation in Turkish since it is already related to the Turkish context.\\nUsing Guidelines:\\n1. I identified that \\\"CHP\\\" is an abbreviation for the Turkish political party \\\"Cumhuriyet Halk Partisi\\\".\\n2. As it is already in the Turkish context, there is no need to translate. So, the answer is:\\nA: Cumhuriyet Halk Partisi\"}\n{\"text\": \"User: Given the question: Given this review: \\\"Latest update is making this a battery hog. Used to be out of my Top 10.\\\" Would you recommend this app to a friend? Not at all, No, Maybe, Yes, or Definitely?\\nThe answer is:\\n\\nAssistant: No\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. 
Your goal is to complete the task as f" + }, + { + "comment": "The code provides step-by-step instructions for solving a linear equation with the variable 'o', combining like terms, isolating the variable term, and finally solving for 'o'. The solution to the equation is found to be o = -21.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":146-146", + "content": "aithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Solve -40*o + 63 = -17*o - 26*o for o.\\n\\nAssistant: To solve the equation -40*o + 63 = -17*o - 26*o for o, follow these steps:\\nStep 1: Combine like terms on both sides of the equation.\\nThe equation currently has two terms with the variable o on the right side, so combine them:\\n-40*o + 63 = (-17*o - 26*o)\\n-40*o + 63 = -43*o\\nStep 2: Isolate the variable term on one side of the equation.\\nIn this case, we'll want to get all the terms with the variable o on the left side of the equation and all the constant terms on the right side. To do this, add 43*o to both sides to eliminate the -43*o term from the right side:\\n(-40*o + 43*o) + 63 = -43*o + 43*o\\n3*o + 63 = 0\\nStep 3: Isolate the variable by moving the constant term to the other side.\\nSubtract 63 from both sides to isolate the 3*o term:\\n3*o + 63 - 63 = 0 - 63\\n3*o = -63\\nStep 4: Solve for the variable.\\nDivide both sides by 3 to find the value of o:\\n3*o / 3 = -63 / 3\\no = -21\\nThe solution is o = -21.\"}" + }, + { + "comment": "This code is providing a detailed explanation of how the given text follows the preceding context. It explains Tim's snoring issue, attempts to solve it, and ultimately, his need for medicine to breathe better. The last sentence summarizes Tim as a snorer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":147-147", + "content": "{\"text\": \"System: You are an AI assistant. You will be given a task. 
You must generate a detailed and long answer.\\n\\nUser: Loud Snoring
Tim started to snore as he got older. It frustrated his wife. They tried different solutions. None seem to work. Eventually Tim had to take medicine to breath better.\\nDoes this next sentence follow, given the preceding text?\\nTim was a snorer.\\n\\nAssistant: Yes, the next sentence, \\\"Tim was a snorer,\\\" does follow given the preceding text as it summarizes the main issue explained previously.\\nThe preceding text provides context and background about Tim's snoring issue. As Tim got older, he began to snore loudly, which caused frustration for his wife. They attempted various solutions to address this issue, but none of them were effective. Ultimately, Tim had to take medicine to improve his breathing.\\nThe summary statement, \\\"Tim was a snorer,\\\" is an appropriate follow-up to the preceding text as it concisely encapsulates the central problem discu" + }, + { + "comment": "The code provided consists of three JSON objects containing different text snippets. The first text discusses Tim's snoring problem and his wife's well-being, while the second asks if \"book their driving test\" is a valid answer for a question about Lee's actions before a test. The third text presents a task asking to determine if two sentences have the same meaning related to researchers finding empty cavities and scar tissue instead of tumors.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":147-149", + "content": "ssed. It may not encompass all the specific details mentioned earlier (such as trying different solutions or taking medication), but it accurately represents the overarching theme of Tim's snoring and its impact on his life and his wife's well-being.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Lee had their driving test coming up so Lee booked more lessons and practiced harder. 
Given the question \\\"What does Lee need to do before this?\\\", is \\\"book their driving test\\\" a valid answer?\\nChoose from:\\n + Yes;\\n + No;\\nAnswer:\\n\\nAssistant: No;\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Do these sentences have the same meaning?\\nThe researchers found only empty cavities and scar tissue where the tumors had been .\\nNo tumors were detected ; rather , empty cavities and scar tissue were found in their place .\\n\\nAssistant: Yes, these two" + }, + { + "comment": "Sentences 1 and 2 both convey the same meaning: tumors are no longer present and have been replaced by empty cavities and scar tissue. Sentence 1 emphasizes this through the use of \"only,\" while sentence 2 is more explicit in stating that no tumors were detected.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":149-149", + "content": " sentences essentially convey the same meaning, although they are structured differently and use somewhat different phrasing. Let's break down the meaning of each sentence to better understand their similarities.\\nSentence 1: \\\"The researchers found only empty cavities and scar tissue where the tumors had been.\\\"\\nIn this sentence, the focus is on what the researchers found in the location where tumors previously existed. The use of \\\"only\\\" emphasizes that instead of finding any tumors, the researchers discovered \\\"empty cavities and scar tissue.\\\" This implies that the tumors are no longer present, and what remains in their place is the aftermath of their removal or disappearance (i.e., the empty cavities and scar tissue).\\nSentence 2: \\\"No tumors were detected; rather, empty cavities and scar tissue were found in their place.\\\"\\nThis sentence is more explicit about the absence of tumors. 
The first phrase, \\\"No tumors were detected,\\\" clearly states that the researchers did not find " + }, + { + "comment": "Code snippet describes the discovery of empty cavities and scar tissue in place of expected tumors, with both sentences conveying the same meaning despite different phrasing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":149-150", + "content": "any tumors. The second part of the sentence, following the semicolon, elaborates on this finding by using the word \\\"rather\\\" to indicate that instead of tumors, they found \\\"empty cavities and scar tissue\\\" in the location where tumors would be expected. This also implies that the tumors are no longer there and have been replaced by these empty cavities and scar tissue.\\nBoth sentences express the idea that the researchers were looking for tumors but discovered empty cavities and scar tissue instead. Consequently, the two sentences convey the same meaning, even though they use different wording and structure.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Determine the topic of the question-answer pair. Question: \\\"how many presidents of the us?\\\"; Answer: \\\"There have been 43 people sworn into office, and 44 presidencies, as Grover Cleveland served two non-consecutive terms and is counted " + }, + { + "comment": "Code snippet describes a conversation where the user asks the assistant to read an article and answer a question related to it. The topic is about U.S. Presidency and presidents, specifically asking if there was a president who served two non-consecutive terms.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":150-151", + "content": "chronologically as both the 22nd and 24th president.\\\"? Topic:\\nA:\\n\\nAssistant: The U.S. Presidency or U.S. 
Presidents\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Read the following article and answer the question. Article: It is easy to say \\\"to be Best Friends Forever (BFF)\\\". However, a true BFF helps out when the other is in trouble, not just once or twice, but for a lifetime. Milo is six years old. He has become Eddie's BFF. They are special because they are not humans, but dogs. Their owner, Angie Baker, brought them up. They spent many years playing together. However, last year, Eddie suddenly started walking into dustbins and walls. Soon he lost his eyesight. When Milo felt something was wrong, he decided to act as Eddie's \\\"guide\\\" dog. He leads Eddie through the house. When they go for walks, he holds Eddie's leash . Once, Milo had to spend a few nights at the hospital himself. Poor Eddie often bumped" + }, + { + "comment": "Code snippet discusses a situation involving Eddie, his friend Milo, and their interaction with objects such as walls and furniture. The code seems to be part of an AI text generation process or chatbot conversation based on the context provided.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":151-152", + "content": " into walls and furniture. Ever worse, he got lost during walks. Angie realized how much Eddie depended on his friend. Luckily, Milo is now fine and back at his BFF's side, guiding him through life again. Question: Eddie's BFF is _ . Answer:\\nAnswer:\\n\\nAssistant: Eddie's BFF is Milo.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: In this week's <I>TravelWatch</I> column, <I>National Geographic Traveler</I> geotourism editor Jonathan B. 
Tourtellot examines a stewardship program that keeps Maine's ocean islands untrampled.\\nWhat best summarizes the content of the above article?\\npick from the following.\\n (1). World.\\n (2). Sports.\\n (3). Business.\\n (4). Science/Tech.\\n\\nAssistant: To summarize the content of the article, I will go through it step by step:\\n1. The article is from the \\\"TravelWa" + }, + { + "comment": "The code appears to be extracting information from a JSON file named \"demo.jsonl\" and seems to be focused on understanding the content of an article related to travel, specifically mentioning National Geographic Traveler, a geotourism editor's focus on a stewardship program in Maine, and the purpose of protecting ocean islands from being trampled. The code also mentions that the most suitable summary topic for this content would be \"World.\" Additionally, there seems to be another unrelated text block mentioning a jobless people attending courses run by David Cameron's 'back to work' tsar Emma Harrison, and their alleged issues with signing blank timesheets and wasting hours doing nothing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":152-153", + "content": "tch\\\" column, indicating it is related to travel.\\n2. It is part of \\\"National Geographic Traveler,\\\" which is a reputable source for travel and exploration content.\\n3. The geotourism editor's focus in the article is on a stewardship program in Maine.\\n4. The purpose of the program is to protect Maine's ocean islands from being trampled on.\\nFrom these points, the most suitable summary topic for the content of the article would be:\\n(1). World.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: By. Sam Greenhill. and Daniel Martin. Created:. 19:52 EST, 20 February 2012. 
Jobless people attending courses run by David Cameron's 'back to work' tsar\u00a0 Emma Harrison were ordered to sign blank timesheets, it was alleged yesterday. Others complained of being forced to waste hours sitting doing nothing instead\u00a0 of being allowed to look" + }, + { + "comment": "Code snippet contains information about allegations against Mrs. Harrison's employment agency, A4e, and the involvement of police in a fraud probe. The Prime Minister is facing demands to suspend Whitehall contracts with A4e due to concerns over potential systematic fraud.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-153", + "content": " for gainful employment. A damning dossier of allegations about Mrs Harrison's employment agency A4e has been handed to the Comptroller and Auditor General by former minister Fiona\u00a0 Mactaggart. She is the Labour MP for Slough, Berkshire, where last week A4e's offices were\u00a0 visited by police as part of a fraud probe. The Prime Minister is facing demands to suspend lucrative Whitehall contracts\u00a0 which A4e won to help the long-term unemployed find jobs. 'Posh commune': A4e boss Mrs Harrison's \u00a35million pile has 16 bedrooms, 100 acres of land, a pool, spa, a bar, a nightclub and a long dining table for banquets. Millionairess Mrs Harrison, 48 - made. families tsar by Mr Cameron after she\u00a0 boasted she could help problem. families find work - triggered outrage when she\u00a0 paid herself. \u00a38.6million of mainly taxpayers' cash. Ministers. have sought urgent reassurance there is no 'systematic fraud' of the. public purse, after Thames Valley Police were called in by the. Department of\u00a0 Work and Pen" + }, + { + "comment": "This code is demonstrating the accusations made by constituents who were forced to attend an A4e 'back to work' course as a condition of receiving state benefits. 
The individuals claim that they were asked to fill out blank timesheets, modules, and sign blanks reviews, leading them to believe that fraudulent claims were being made to meet government-set criteria. Additionally, the long-term unemployed must attend such courses for continued benefit payments, and companies like A4e charge taxpayers millions to run these courses despite lacking proper training and organization.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-153", + "content": "sions. Ms. Mactaggart's dossier contains accusations from constituents who were. made to\u00a0 attend an A4e 'back to work' course as a condition of receiving. state benefits. One claimed: 'Every Friday, the \\\"clients\\\" have to sign a blank timesheet and\u00a0 the times are filled in later. 'Members of my group were told to fill in a blank piece of paper outlining any\u00a0 \\\"modules\\\"...and to sign a blank \\\"review\\\". 'I would guess this was to fraudulently claim that all \\\"criteria\\\" as laid down\u00a0 by the government was adhered to.' Another. complainant, a construction worker, said he had spent 13 weeks on an. A4e course supposedly to renew a forklift operator's licence yet had. 'received\u00a0 no training at all', despite repeatedly asking for some. Attendance. on a back-to-work course is mandatory for the long-term unemployed\u00a0 if. they want to keep being paid benefits, and companies such as A4e charge. taxpayers millions of pounds to run them. But a third person described a 'chaotic, unorganised' A4e c" + }, + { + "comment": "Code snippet seems to describe a situation where an individual is criticizing a work program that forces people to learn menial tasks and sign blank time sheets. 
The person involved has a prior work history but finds the program ridiculous and worrying, leading to calls for the Prime Minister's action regarding the 'back to work' tsar.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-153", + "content": "ourse in which 'we\u00a0 would mostly be shunted in and out of rooms, staring at walls, long silences\u00a0 with none of the staff telling the group what was happening'. She told the MP her group was forced to 'learn' menial tasks such as how to\u00a0 write a letter on a computer. She said: 'I did not learn anything that I did\u00a0 not know already, and it is ridiculous and worrying that people like myself\u00a0 should be forced into this position where I have a proven work history.' Last night Ms Mactaggart, a former Home Office minister and a member of the\u00a0 influential Public Accounts Committee, said: 'I think there is a series of what\u00a0 appears to me on the face of it to be fraudulent actions - expecting people to\u00a0 sign blank time sheets and so on. Prime Minister David Cameron is facing calls to sack his 'back to work' tsar after a fraud probe was launched. 'It is in itself wrong to get vulnerable unemployed people to sign blank time\u00a0 sheets, because this company is in the position where it has the power to" + }, + { + "comment": "Code highlights an MP's concerns about A4e, a government contractor, not delivering value in their contracts and calls for a government investigation. The MP also criticizes the large dividend paid to company founder Mrs Harrison.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-153", + "content": " put\u00a0 people in breach of their Jobseekers' Allowance conditions. 'In my view, that makes A4e not fit to receive a government contract. The\u00a0 government should absolutely consider suspending the contracts. 
'The concerns that my constituents have raised are sufficient for me to believe\u00a0 that they are not a proper company to get taxpayers' money and to be dealing\u00a0 with vulnerable people.' The MP forwarded the complaints to Sir Amyas Morse, the Comptroller and Auditor\u00a0 General - the head of the National Audit Office which scrutinises the accounts\u00a0 of government departments and agencies. She demanded an investigation, telling him the cases seemed to be examples of\u00a0 'A4e taking money from the Government without actually delivering anything\u00a0 worthwhile'. Other MPs have attacked Mrs Harrison's vast dividend, which she paid herself\u00a0 after A4e recorded a \u00a3234million turnover. A4e - which used to be called Action\u00a0 For Employment - makes all its UK income from state contracts. Mrs Harrison, worth " + }, + { + "comment": "Code snippet describes a situation involving a company, its contracts being questioned due to allegations of fraud. The company is visited by police but no documents were taken. Investigation ongoing and at an early stage.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-153", + "content": "an estimated \u00a370million, lives in a \u00a37million country\u00a0 mansion in Derbyshire with her husband Jim, their four children, and a group of\u00a0 11 close friends and their six children in a 'posh commune'. Labour's Margaret Hodge, who is chairman of the Public Accounts Committee, has led calls for Mrs Harrison's company to have its state contracts suspended. Margaret Hodge, the chairwoman of the Public Accounts Committee, yesterday\u00a0 tabled a parliamentary question asking whether the Government would suspend the\u00a0 company's contracts pending inquiries. A Thames Valley Police spokesman said: 'Officers visited the offices of A4e in\u00a0 Slough as part of an allegation of fraud, which was referred to the Force by\u00a0 the Department for Work and Pensions. 
The investigation is ongoing and at an\u00a0 early stage, therefore we are currently unable to comment any further.' A4e said the police had only come in 'for a meeting' and had not taken away any\u00a0 documents. It said the fraud probe concerned an isolated incide" + }, + { + "comment": "The code discusses an accusation against David Cameron's 'back to work' tsar Emma Harrison's employment agency, A4e. The agency is accused of ordering jobless people to sign blank timesheets which led to a fraud investigation. However, A4e states that the concerns raised by Ms. Mactaggart have been resolved, and the allegations are not linked to the current police investigation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-153", + "content": "nt two years ago\u00a0 relating to four employees who had since left the company, and that A4e had discovered the alleged fraud itself and reported it to the Department for Work\u00a0 and Pensions within 48 hours. Last night an A4e spokesman insisted the concerns raised by Ms Mactaggart had\u00a0 been resolved, saying: 'Under the new Work Programme, customers are no longer\u00a0 required to fill out timesheets. 'All our staff have clear guidelines on how to complete claims made under Work\u00a0 Programme and these claims are independently verified by the employer and by\u00a0 JobCentre Plus. 'A4e has a zero tolerance policy towards any form of behaviour that might lead\u00a0 to allegations of fraud. 
'The concerns raised by Fiona Mactaggart's constituents are not linked to the\u00a0 current Thames Valley Police investigation.'\\nHere is a summary of the highlights for this article:\\n\\nAssistant: - David Cameron's 'back to work' tsar Emma Harrison's employment agency A4e has been accused of ordering jobless people to sign blank" + }, + { + "comment": "Cocum is a coffee shop offering Japanese cuisine in the \u00a320-25 price range, highly rated by customers but not family-friendly.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":153-154", + "content": " timesheets.\\n- Former minister Fiona Mactaggart has handed a damning dossier of allegations about A4e to the Comptroller and Auditor General.\\n- A4e's offices in Slough were visited by police as part of a fraud probe.\\n- The Prime Minister is facing demands to suspend lucrative Whitehall contracts awarded to A4e.\\n- A4e has denied any systematic fraud, and claims that the police visit was for a meeting, not to seize documents.\\n- Emma Harrison has previously faced controversy for paying herself \u00a38.6 million in taxpayer-funded dividends.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Here is some data: Cocum eatType coffee shop; Cocum food Japanese; Cocum priceRange \u00a320-25; Cocum customer rating high; Cocum familyFriendly no.\\nWrite a sentence that describes this data:\\n\\nAssistant: Cocum is a coffee shop that specializes in serving Japanese cuisine, with a price range of \u00a320-25 per person, highly appreciated by customers for their delightful taste, but not particularly considered family-friendly.\"}" + }, + { + "comment": "The code is describing an auction of personal property belonging to Bernie Madoff and his wife, which raised $500,000. 
The funds will be used to compensate the victims of their Ponzi scheme, as part of a wider effort to recover assets worth around $24 million in property sales and $80 million in cash assets from the Madoffs.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":155-155", + "content": "{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: The last remaining personal property belonging to Ponzi schemer Bernie Madoff and his wife, Ruth, netted nearly $500,000 in an auction in Miami Beach, Florida, the U.S. Marshals Service said Tuesday. Proceeds from the June 4 auction go to the U.S. Department of Justice Assets Forfeiture fund and will be used to compensate \\\"the many victims of this crime of historical proportions,\\\" said Neil DeSousa, U.S. marshal for the Southern District of Florida. \\\"Rest assured that the U.S. Marshals Service's top priority in this case was restitution toward the victims.\\\" To date, about $24 million in property sales and $80 million in cash assets from the Madoffs have been recovered, the Marshals Service said. Art, antiques, furniture and clothing from the Madoff residences in New York an" + }, + { + "comment": "Final auction of Bernie Madoff's personal property raised nearly $500k, proceeds to U.S. Department of Justice Assets Forfeiture fund, helping compensate Ponzi scheme victims; 275 lots included art and other items.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":155-155", + "content": "d Palm Beach, Florida, were featured in the 275 lots included in the final auction. Madoff, the mastermind of an estimated $50 billion Ponzi scheme, pleaded guilty in March 2009 to 11 felony charges, including money laundering, perjury, false filings with the SEC and fraud. 
He was sentenced to 150 years in prison.\\nWhat are the important parts of this article?\\n\\nAssistant: 1. The last remaining personal property of Bernie Madoff and his wife, Ruth, was auctioned off in Miami Beach, Florida, and raised nearly $500,000.\\n2. The U.S. Marshals Service announced that the proceeds from the June 4 auction will go to the U.S. Department of Justice Assets Forfeiture fund.\\n3. The funds will be used to compensate the victims of Madoff's Ponzi scheme, which was a crime of historical proportions.\\n4. To date, about $24 million in property sales and $80 million in cash assets from the Madoffs have been recovered by the U.S. Marshals Service.\\n5. The final auction featured 275 lots, including art, " + }, + { + "comment": "The code describes the seizure of antiques, furniture, and clothing from Madoff's residences in New York and Florida. Bernie Madoff is identified as a mastermind of an estimated $50 billion Ponzi scheme, pleading guilty to 11 felony charges in March 2009 and receiving a 150-year prison sentence.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":155-156", + "content": "antiques, furniture, and clothing from the Madoff residences in New York and Palm Beach, Florida.\\n6. Bernie Madoff was the mastermind of an estimated $50 billion Ponzi scheme and pleaded guilty in March 2009 to 11 felony charges. He was sentenced to 150 years in prison.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: This is some data: Alan Bean, almaMater, \\\"UT Austin, B.S. 1955\\\"; Alan Bean, timeInSpace, \\\"100305.0\\\"(minutes); Alan Bean, selectedByNasa, 1963.\\nGenerate a detailed description of this data.\\nSentence:\\n\\nAssistant: The given data provides valuable information about Alan Bean, a notable astronaut. 
Bean attended the University of Texas at Austin, where he graduated with a Bachelor of Science degree in 1955, as indicated by the almaMater attribute. His time in space amounts to an impressive 100,305.0 minutes, which reveals his extensive experience and dedication to space exploration. Lastly, we " + }, + { + "comment": "The code is breaking down a given text into individual words by identifying word boundaries and separating them with spaces.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":156-157", + "content": "learn that he was selected by NASA in 1963, marking the beginning of his career as an astronaut and his significant contributions to expanding human knowledge of outer space.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Add spaces between the words in the following text: SolomonruledoverallthekingdomsfromtheRivertothelandofthePhilistines,andtotheborderofEgypt:theybroughttribute,andservedSolomonallthedaysofhislife.\\n\\nAssistant: 1. Identify the text provided: \\\"SolomonruledoverallthekingdomsfromtheRivertothelandofthePhilistines,andtotheborderofEgypt:theybroughttribute,andservedSolomonallthedaysofhislife.\\\"\\n2. Locate possible word boundaries by recognizing common words, proper nouns, and prefixes/suffixes.\\n3. Begin separating words with spaces:\\n - \\\"Solomon ruled overall the kingdoms from the River to the land of the " + }, + { + "comment": "Code snippet is explaining the process of correcting word boundaries in a text. The given text has been separated into two words \"over all\" to make it more readable and finalized with proper word boundaries. 
This task involves following instructions faithfully, thinking step-by-step and justifying answers related to sorting a list of dictionaries.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":157-158", + "content": "Philistines, and to the border of Egypt: they brought tribute, and served Solomon all the days of his life.\\\"\\n4. Reassess the text and verify if all word boundaries have been addressed:\\n - The word \\\"overall\\\" should be separated into two words \\\"over all.\\\"\\n5. Finalize the text with correct word boundaries:\\n - \\\"Solomon ruled over all the kingdoms from the River to the land of the Philistines, and to the border of Egypt: they brought tribute, and served Solomon all the days of his life.\\\"\\n6. Task complete: The words in the provided text have been separated with spaces to make it more readable.\"}\n{\"text\": \"System: User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step-by-step and justify your answer.\\n\\nUser: Teacher:In this task you will be given a list of dictionaries. A dictionary is a set of key-value pairs, where each key is unique and has a value associated with that key. You should sor" + }, + { + "comment": "Sorting the list of dictionaries by 'first' key, then by 'second' key for dictionaries with equal 'first' values. Negative numbers should come before positive ones.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":158-158", + "content": "t the list of dictionaries from smallest to largest by their 'first' key. If there is two dictionaries with the same 'first' value then sort them by their 'second' key. Negative numbers should come before positive numbers.\\nTeacher: Now, understand the problem? 
Solve this instance: [{'first': 12, 'second': 17}, {'first': -88, 'second': 99}, {'first': -90, 'second': -3}, {'first': 64, 'second': -24}, {'first': -23, 'second': -4}, {'first': -6, 'second': -62}, {'first': 2, 'second': 0}]\\nStudent:\\n\\nAssistant: First, we need to sort the dictionaries by their 'first' key value. We will use the built-in Python 'sorted' function which provides a way to sort elements by providing a function to compute a sort key from an element. In this case, the key function will be a lambda that extracts the 'first' key value from the dictionary.\\nSecond, in case where two dictionaries have the same 'first' value, they should be sorted by 'second' key. The 'sorted' function provides a way to specify multip" + }, + { + "comment": "Sorts the input list of dictionaries based on 'first' and 'second' key values. Uses Python's sorted function with a lambda key function that returns a tuple containing both values for each dictionary in the input. Sorted first by 'first' (ascending) and then by 'second' (ascending) within same 'first' value ties. Result is a list of dictionaries sorted based on these criteria.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":158-158", + "content": "le sort keys by returning a tuple from the key function. 
Python will sort the tuples primarily by the first element, then the second element in case of ties, and so on.\\nLet's solve:\\ninput = [{'first': 12, 'second': 17}, {'first': -88, 'second': 99}, {'first': -90, 'second': -3}, {'first': 64, 'second': -24}, {'first': -23, 'second': -4}, {'first': -6, 'second': -62}, {'first': 2, 'second': 0}]\\nWe will use the 'sorted' function with key function that returns tuple with 'first' and 'second' values of dictionary:\\nresult = sorted(input, key=lambda d: (d['first'], d['second']))\\nThe result should be the list sorted by 'first' key from smallest to largest, and by 'second' key in case of same 'first' values:\\n[{'first': -90, 'second': -3}, {'first': -88, 'second': 99}, {'first': -23, 'second': -4}, {'first': -6, 'second': -62}, {'first': 2, 'second': 0}, {'first': 12, 'second': 17}, {'first': 64, 'second': -24}] \\nThis result list satisfies all the conditions provided in the task - dictio" + }, + { + "comment": "Step 1: Identify information corresponding to each category in the sentence. Price: \"prices less than 20\". Location: \"located near xnear in the city centre\". FamilyFriendly: \"The coffee shop, xname, is family-friendly\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":158-159", + "content": "naries with smaller 'first' key come before ones with larger 'first' key, and in case of a tie the 'second' key is used for ordering. Also, negative values come before positive ones.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Detailed Instructions: You will be given a sentence that describes a restaurant. You will also be given a few categories of information regarding that sentence. 
Your task is to fill each of the categories with the appropriate information from the sentenece.\\nProblem:Sentence: The coffee shop, xname, is family-friendly and has prices less than 20. It is located near xnear in the city centre. ; Categories: price, location, familyFriendly\\nSolution:\\n\\nAssistant: Step 1: Identify the information corresponding to each category in the sentence.\\nPrice: The sentence mentions \\\"prices less than 20.\\\"\\nLo" + }, + { + "comment": "Code is extracting and categorizing information from a given sentence, then presenting it in a structured format. The categories include Price, Location, and Family-friendly status.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":159-160", + "content": "cation: The sentence mentions \\\"located near xnear in the city centre.\\\"\\nFamily-friendly: The sentence mentions the coffee shop is \\\"family-friendly.\\\"\\nStep 2: Fill in the categories with the appropriate information found in Step 1.\\nPrice: Less than 20\\nLocation: Near xnear in the city centre\\nFamily-friendly: Yes\\nStep 3: Present the completed information in the given categories format.\\nSolution:\\nCategories:\\nPrice: Less than 20\\nLocation: Near xnear in the city centre\\nFamily-friendly: Yes\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Six Nations rugby in Dublin with our experts\\nDepart from your preferred airport \u2013 either London Heathrow or Manchester \u2013 bound for Dublin. On arrival you will be met by your Gullivers Sports Travel tour manager and transferred into central Dublin. Check in to your four-star hotel \u2013 likely to be the Doubletree by Hilton The Burlington \u2013 for two nights on a b" + }, + { + "comment": "Description of a St. 
Patrick's Day celebration in Dublin with a rugby event for Telegraph readers, led by correspondent Gavin Mairs and featuring former England players Ugo Monye and Simon Shaw.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":160-160", + "content": "ed and breakfast basis.\\nYou have this afternoon and the evening free to soak up the once-in-a-lifetime experience of spending St Patrick\u2019s Day in Dublin. This used to be a relatively low-key occasion in the Irish capital but is now celebrated with aplomb throughout the city. There is a huge amount going on today, including the parade, which threads through Dublin\u2019s streets, and as convivial an atmosphere as you could wish for in the city\u2019s pubs and bars. You may wish to pack something emerald green into your baggage!\\nToday you will enjoy an event put on exclusively for Telegraph readers, led by our rugby news correspondent Gavin Mairs. Over a Saturday brunch \u2013 to allow for a gentle start for those who have enjoyed their share of local hospitality the evening before \u2013 Gavin will give his insights into the way today\u2019s match is shaping up, as well as leading a Q&A with two former England players (provisionally Ugo Monye and Simon Shaw). 
You will hear from a player\u2019s perspective about bu" + }, + { + "comment": "Code describes a special trip for attending a Six Nations rugby game in Dublin, including hotel stay, exclusive event, and match at Aviva Stadium before flying home.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":160-160", + "content": "ilding up for a big match, and what the approach of the two teams is likely to be \u2013 in short, the perfect way to whet your appetite for the fixture.\\nYou will make your own way to Aviva Stadium, where you will take your seats and enjoy the build-up before watching what could well be the deciding match of the Six Nations championship.\\nAfter taking breakfast in your hotel, you have a small amount of time to make the most of your final hours in Dublin before being transferred to the airport by luxury coach and your flight home.\\nWhat was this article about?\\n\\nAssistant: This article is about a special trip for enjoying the Six Nations rugby game in Dublin. The trip includes flying from London Heathrow or Manchester to Dublin, staying in a four-star hotel, and celebrating St Patrick's Day in Dublin. You will also attend an exclusive event with rugby experts and former players who will share their insights and answer questions. Finally, you will watch a Six Nations rugby match at the Aviva Stadium before flying back home.\"}" + }, + { + "comment": "This code is a multiple-choice question asking the user to choose the most appropriate option that describes a scenario where a man lifts a heavy weight using his strength at a professional sports event. The correct answer and explanation of why other options are incorrect is required.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":161-161", + "content": "{\"text\": \"System: You are an AI assistant. You should describe the task and explain your answer. While answering a multiple choice question, first output the correct answer(s). 
Then explain why other answers are wrong. You might need to use additional knowledge to answer the question.\\n\\nUser: Options are:\\n (a). operates off a reception stage in front a yellow curtain on a college field while spectators standon wople in the stands and applaud..\\n (b). approaches a two sided weight as a woman watches from the side near a door..\\n (c). performs with upper arm and shoulders strength, front and back with hand and head and legs, about eighty five degree backs and, arms and feet, extend forward, turn backwards, then grab and hold the weight daily..\\n (d). , engage in arm wrestling with one another, usually in pairs..Choose from options above and answer: What most naturally follows?\\nA man at a professional sports event, lifts a large and heavy weight using knees, back and arm strength while" + }, + { + "comment": "Option (b) is the most likely correct follow up to the scenario, as it maintains consistency with the original context by involving a weight and a woman watching from the side. Options (a), (c), and (d) are incorrect due to inconsistent settings, excessive description, and deviation from the narrative style respectively.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":161-161", + "content": " a woman in a suit watches from the sidelines. a man wearing a \\\" usa \\\" shirt, or team uniform\\nAnswer:\\n\\nAssistant: (b). approaches a two sided weight as a woman watches from the side near a door..\\nOption (b) is most likely the correct follow up to the scenario described. The reasoning behind this is that it mentions a man approaching a weight (which could be the \\\"large and heavy weight\\\" from the initial scenario), and a woman watching from the side (which could be the \\\"woman in a suit\\\").\\nOption (a) is incorrect because the scenario is taking place on a college field, while in the original context, the event was described as a professional sports event. 
The settings are inconsistent.\\nOption (c) is incorrect because it describes the physical actions of the athlete in an excessive and detailed manner that does not align with the succinct and focused narrative style seen in the original scenario.\\nOption (d) is incorrect because it describes an entirely different scenario of arm" + }, + { + "comment": "Code snippet describes a review of a nail and spa business, mentioning positive past experiences but recent unpleasant visits.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":161-162", + "content": " wrestling. Instead, the original scenario denotes weightlifting, making this option unfit to naturally follow given the context.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: I would like to start this review by saying that I have been coming to Ken's Nails and Spa for a few years now for several different services and - in the past - have always had great service and results. I have always been greeted immediately and have never had to wait more than a couple minutes before I am taken back for services. It truly has been my go-to in the past and I am super sad that I am going to stop frequenting this business. I am not exactly sure what has changed but my last few visits have been anything but pleasant. First, a couple weeks ago I came in for my normal eyebrow wax. Usually I am in and out in a couple minutes with perfectly manicured brows. This time I left with my eyebrows on FIRE. 
I went to" + }, + { + "comment": "This code describes a negative experience with an eyebrow wax and mani-pedi at Albertson's, where the customer faced discomfort, pain, and poor communication with the technician.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":162-162", + "content": " the Albertson's following my appointment and bought some cooling aloe gel for relief but nothing helped. The area ended up breaking out in blisters that last at least a week. I had several people ask me if I had suffered a burn. Yikes! it was not good and extremely painful. I let the bad eyebrow wax slide and thought maybe it was a fluke accident and went back last night for a mani-pedi. I feel bad because I am not sure the tech who was working on me really understood some of the things I was trying to communicate to her. At one point I asked her to file my nails slightly shorter and she said \\\"ok\\\" but then just moved on. That wasn't really the worst part though. During both the mani and pedi she repeatedly hurt me while removing my cuticles. I can be a wuss so I know sometimes it is just me being a baby but this was ridiculous. At one point I gasped and pulled my foot away and the nail tech next to her actually had to scold her and tell her to take it easy. Needless to say I left " + }, + { + "comment": "The code snippet seems to be a combination of different tasks with mixed language inputs, translations, and sentiment analysis. The first part is related to understanding the sentiment of a review text, while the second part involves translating Portuguese sentences into Arabic.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":162-164", + "content": "with one nail bleeding which didn't stop for over an hour and is still painful to the touch.\\nChoose the sentiment of this review?\\npick from the following. I. negative. II. 
positive.\\n\\nAssistant: The sentiment of this review is negative.\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Detailed Instructions: You are given a sentence in Portuguese. Your job is to translate the Portuguese sentence into Arabic.\\nProblem:Nessa \u00e9poca eu n\u00e3o podia trabalhar na bolsa de valores, n\u00e3o podia conduzir um autocarro nem pilotar um avi\u00e3o.\\nSolution:\\n\\nAssistant: \u0641\u064a \u062a\u0644\u0643 \u0627\u0644\u0641\u062a\u0631\u0629 \u0644\u0645 \u0623\u0633\u062a\u0637\u0639 \u0627\u0644\u0639\u0645\u0644 \u0641\u064a \u0633\u0648\u0642 \u0627\u0644\u0623\u0648\u0631\u0627\u0642 \u0627\u0644\u0645\u0627\u0644\u064a\u0629\u060c \u0648\u0644\u0645 \u0623\u0633\u062a\u0637\u0639 \u0642\u064a\u0627\u062f\u0629 \u062d\u0627\u0641\u0644\u0629 \u0623\u0648 \u0642\u064a\u0627\u062f\u0629 \u0637\u0627\u0626\u0631\u0629.\\nIn this period, I couldn't work in the stock market, I couldn't drive a bus or fly a plane.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task thin" + }, + { + "comment": "The code is prompting the user to generate a short summary of the given paragraph. 
It then provides an example of such a summary and follows up with a multiple-choice question where the user must select the best ending for the context from four options (A, B, C, or D).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":164-165", + "content": "k step-by-step and justify your steps.\\n\\nUser: less than a week after the florida supreme court tossed out a massive class-action lawsuit against big tobacco , lawyers across the state are gearing up for battle on behalf of individuals and families hurt by smoking .\\nCan you generate a short summary of the above paragraph?\\n\\nAssistant: Following the Florida Supreme Court's dismissal of a major class-action lawsuit against big tobacco, attorneys throughout the state are preparing to represent individuals and families affected by smoking.\"}\n{\"text\": \"System: You are an AI assistant that follows instruction extremely well. Help as much as you can.\\n\\nUser: Detailed Instructions: In this task, you are given a context and four options. Each option is a suggested ending for the context. You should read the context and pick the best ending for the context. Please answer with \\\"A\\\", \\\"B\\\", \\\"C\\\", and \\\"D\\\". \\nQ: A group of men on horses talk and as they ride away, the clown restarts his job i" + }, + { + "comment": "The code appears to be describing a scene involving clowns in a circus, with different actions taking place. The user is asking for an analysis of a product review for a book on a medical situation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":165-166", + "content": "n the ring. the clown(A) smolders up at the man in yellow who claps in response. (B) continues dancing on the horse and then flips the horse and wrists in mid air. (C) starts the advertisement and an introduction comes onto the screen for riding a horse. 
(D) gets injured again as he jumps over a bulls head and the other clowns help him up.\\nA:\\n\\nAssistant: \\\"D\\\"\"}\n{\"text\": \"System: You are an AI assistant. Provide a detailed answer so user don\u2019t need to search outside to understand the answer.\\n\\nUser: Answer the following question: Title: Surprisingly better than expected Amazon book Product review: The first part of the book to me was more juvenile, about the children, didn't have any idea where it was taking me .. but as you read on, there was a definite shift and a message for everyone whose ever been in this medical situation when you see how the advice given to Katie was understood, taken to heart and followed by her, still a very young girl, and how it affected her siblin" + }, + { + "comment": "This review depicts the product in a mostly flattering light, as it highlights an initially juvenile first part but praises the story's shift and its delightful nature after overcoming that initial impression. The reviewer is ultimately glad they continued reading past the first few chapters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":166-167", + "content": "gs and her entire family. She was to be admired. Turned out I discovered it became a delightful story that I am glad I took the time to get past the first few chapters. Would you say this review depicts the product in a flattering or unflattering light?\\nAnswer:\\n\\nAssistant: This review depicts the product in a mostly flattering light. The reviewer begins with a slightly negative comment about the book's juvenile first part but goes on to praise the shift in the story, its message, and the impact on the characters as the book progresses. The reviewer ultimately describes the book as a delightful story worth reading and is glad they continued past the first few chapters. 
While the review starts with a mild criticism, the overall sentiment is positive and flattering towards the book.\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Answer the following question: Title: only the beginning Review: Warlock is the f" + }, + { + "comment": "The code provides a product review for the book \"Warlock\" by author Oakley Hall, praising his writing and universal human truths. The reviewer mentions that Cormac McCarthy was influenced by Hall's novels and suggests readers discover this excellent writer. The review is not negative.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":167-168", + "content": "irst in a trilogy by author Oakley Hall, the second novel in the trilogy being Badlands, followed by Apaches. I was simply awed by the writing of Mr Hall, and the universal human truths he reminds the reader of. I can see that more than a few writers must have read Oakley Hall's novels, most especially Cormac Mccarthy. Warlock was published in 1958, and Badlands was at least 10 yrs later, followed by Apaches, which was at least another decade later. Mr Hall also does the fine Ambrose Bierce series of novels, and with a career spanning 5 decades, he is still underated and underapreciated by the general public. do yourself a favor and discover this most excellent writer. Is this product review negative?\\nOptions:\\n + Yes\\n + No\\nAnswer:\\n\\nAssistant: No\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: What is a question about this article? If the question is unanswerable, say \\\"unanswerable\\\".\\nIsrael, and The US" + }, + { + "comment": "Israel, US Air Force, and NATO members have collaborated to develop tactics for air defense suppression by creating advanced electronics intelligence and electronic countermeasures platforms. 
As better jamming, countermeasures, and anti-radiation weapons emerge, so do enhanced SAM systems with ECCM capabilities and the ability to shoot down anti-radiation missiles and other munitions aimed at them or the targets they defend.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":168-169", + "content": " Air Force, in conjunction with the members of NATO, has developed significant tactics for air defence suppression. Dedicated weapons such as anti-radiation missiles and advanced electronics intelligence and electronic countermeasures platforms seek to suppress or negate the effectiveness of an opposing air-defence system. It is an arms race; as better jamming, countermeasures and anti-radiation weapons are developed, so are better SAM systems with ECCM capabilities and the ability to shoot down anti-radiation missiles and other munitions aimed at them or the targets they are defending.\\nIn agreement with NATO members, the US Air Force and which country has created tactics for air defence suppression?\\n\\nAssistant: Israel, in conjunction with the US Air Force and members of NATO, has created tactics for air defense suppression.\"}\n{\"text\": \"System: User will you give you a task with some instruction. Your job is follow the instructions as faithfully as you can. While answering think step" + }, + { + "comment": "This code checks if a given Bahasa Indonesia sentence is the translation of a Japanese sentence. If the translations align, it generates a \"Yes\" label; otherwise, it outputs \"No.\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":169-169", + "content": "-by-step and justify your answer.\\n\\nUser: Q: Given a sentence in the Japanese and Indonesian(Bahasa variant) language. Your task is check if the Bahasa Indonesia sentence is translation of Japanese. 
if the translation is correct than generate label \\\"Yes\\\", otherwise generate label \\\"No\\\".\\nJapanese: \u5f7c\u306f\u300c\u3053\u3053\u306b\u96c6\u307e\u3063\u3066\u3044\u308b\u5fe0\u5b9f\u306a\u4fe1\u8005\u306b\u3001\u884c\u65b9\u4e0d\u660e\u8005\u306e\u7121\u4e8b\u306a\u5e30\u9084\u3001\u6551\u52a9\u306b\u547d\u3092\u61f8\u3051\u305f\u7537\u5973\u3084\u3001\u3053\u306e\u60b2\u5287\u7684\u306a\u4e8b\u6545\u3067\u5931\u308f\u308c\u305f\u9b42\u306e\u305f\u3081\u306b\u7948\u308b\u3053\u3068\u3092\u6c42\u3081\u308b\u300d\u3068\u8a00\u3063\u305f\u3002 \\n Bahasa Indonesia: Dia banyak menegaskan posisi yang sama dalam percakapan telepon dengan Presiden Amerika Serikat, George W. Bush, hari ini.\\nA:\\n\\nAssistant: To check if the Bahasa Indonesia sentence is a translation of the Japanese sentence, we need to have a general understanding of the meaning of both sentences and compare them.\\nJapanese: \u5f7c\u306f\u300c\u3053\u3053\u306b\u96c6\u307e\u3063\u3066\u3044\u308b\u5fe0\u5b9f\u306a\u4fe1\u8005\u306b\u3001\u884c\u65b9\u4e0d\u660e\u8005\u306e\u7121\u4e8b\u306a\u5e30\u9084\u3001\u6551\u52a9\u306b\u547d\u3092\u61f8\u3051\u305f\u7537\u5973\u3084\u3001\u3053\u306e\u60b2\u5287\u7684\u306a\u4e8b\u6545\u3067\u5931\u308f\u308c\u305f\u9b42\u306e\u305f\u3081\u306b\u7948\u308b\u3053\u3068\u3092\u6c42\u3081\u308b\u300d\u3068\u8a00\u3063\u305f\u3002 \\nTranslation (in English): He said, \\\"I ask the faithful believers gathered here to pray for the safe return of the missing, for the men and women who risked their lives in the rescue, and for th" + }, + { + "comment": "Code explains the process of comparing the meaning of two sentences, one in Japanese and the other in Bahasa Indonesia, and concludes that they are not translations of each other. 
It also demonstrates a translation task where an English sentence is translated into Spanish using knowledge of both languages.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":169-170", + "content": "e souls lost in this tragic accident.\\\"\\nBahasa Indonesia: Dia banyak menegaskan posisi yang sama dalam percakapan telepon dengan Presiden Amerika Serikat, George W. Bush, hari ini.\\nTranslation (in English): He strongly emphasized the same position in a phone conversation with the President of the United States, George W. Bush, today.\\nComparing the meaning of both sentences, it is clear that the Bahasa Indonesia sentence is not a translation of the Japanese sentence.\\nLabel: No\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: Q: You are given a sentence in English. Your job is to translate the English sentence into Spanish.\\nAnd last year I spoke at his memorial service and wished him well in this life.\\nA:\\n\\nAssistant: Y el a\u00f1o pasado habl\u00e9 en su servicio conmemorativo y le dese\u00e9 lo mejor en esta vida.\\nIn order to come up with the answer, I used the definition and knowledge of both languages. First, I identified the main parts and ideas o" + }, + { + "comment": "This code is translating a given sentence from English to Spanish, maintaining the meaning and context. It first splits the sentence into parts, then translates each part individually before combining them back together for the final translation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":170-171", + "content": "f the sentence: \\\"last year,\\\" \\\"I spoke,\\\" \\\"his memorial service,\\\" \\\"wished him well,\\\" and \\\"in this life.\\\" Then, I translated each part of the sentence into Spanish, while ensuring that the grammar and structure are correct. 
\\n- \\\"last year\\\" -> \\\"el a\u00f1o pasado\\\"\\n- \\\"I spoke\\\" -> \\\"habl\u00e9\\\"\\n- \\\"his memorial service\\\" -> \\\"su servicio conmemorativo\\\"\\n- \\\"wished him well\\\" -> \\\"le dese\u00e9 lo mejor\\\"\\n- \\\"in this life\\\" -> \\\"en esta vida\\\"\\nFinally, I combined these translations into a coherent sentence in Spanish, keeping the original meaning and context of the English sentence. So the final translation is: \\\"Y el a\u00f1o pasado habl\u00e9 en su servicio conmemorativo y le dese\u00e9 lo mejor en esta vida.\\\"\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and long answer.\\n\\nUser: Here is a premise:\\nThe Symposium has been the forum for debate and discussion of a wide range of equal justice issues, from the mission of the civil legal assistance " + }, + { + "comment": "Code discusses a Symposium as a forum for debate and discussion of equal justice issues, then presents a hypothesis about whether or not the issues have been debated.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":171-171", + "content": "delivery system to the design and implementation of a number of specific initiatives aimed at unifying, expanding and improving the state's equal justice community.\\nHere is a hypothesis:\\nA range of equal justice issues have been debated.\\nHere are the options: pick from the following. -- yes -- it is not possible to tell -- no\\nIs it possible to conclude that if the premise is true, then so is the hypothesis?\\n\\nAssistant: Yes, it is possible to conclude that if the premise is true, then so is the hypothesis. \\nThe premise states that The Symposium has been the forum for debate and discussion of a wide range of equal justice issues. This implies that there is a variety of topics covered under the umbrella of equal justice, and they have been discussed at this particular forum. 
\\nThe hypothesis states that a range of equal justice issues have been debated. The implication here is that various equal justice topics have been discussed and debated, which aligns with the information provi" + }, + { + "comment": "The code is providing a definition and explanation on how it uses the given information to come up with an answer. It discusses checking if a Hindi sentence is a translation of an English sentence and generating a \"Yes\" or \"No\" label accordingly.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":171-172", + "content": "ded in the premise.\\nSince the premise and hypothesis are aligned and the information in the hypothesis is derived from the premise, we can conclude that if the premise is true, then the hypothesis is also true.\"}\n{\"text\": \"System: Explain how you used the definition to come up with the answer.\\n\\nUser: Given the task definition and input, reply with output. In this task, you are given a sentence in the English and Hindi language. Your task is check if the Hindi sentence is translation of English. if the translation is correct than generate label \\\"Yes\\\", otherwise generate label \\\"No\\\".\\nEnglish: \\\"The typical incentive programs simply do not resonate in today's economy,\\\" Jesse Toprak, an analyst for Edmunds. 
\\n Hindi: \u090f\u0921\u092e\u0902\u0921\u094d\u0938 \u0915\u0947 \u090f\u0915 \u0935\u093f\u0936\u094d\u0932\u0947\u0937\u0915 \u091c\u0947\u0938\u0940 \u091f\u094b\u092a\u094d\u0930\u093e\u0915 \u0928\u0947 \u0915\u0939\u093e, \\\"\u0938\u093e\u092e\u093e\u0928\u094d\u092f \u092a\u094d\u0930\u094b\u0924\u094d\u0938\u093e\u0939\u0928 \u0915\u093e\u0930\u094d\u092f\u0915\u094d\u0930\u092e \u0915\u0947\u0935\u0932 \u0906\u091c \u0915\u0940 \u0905\u0930\u094d\u0925\u0935\u094d\u092f\u0935\u0938\u094d\u0925\u093e \u092e\u0947\u0902 \u092a\u094d\u0930\u0924\u093f\u0927\u094d\u0935\u0928\u093f\u0924 \u0928\u0939\u0940\u0902 \u0939\u094b\u0924\u0947 \u0939\u0948\u0902\u0964\\\"\\n\\nAssistant: To determine if the Hindi sentence is a translation of the English sentence, I would first analyze and compare the context a" + }, + { + "comment": "Both sentences mention the lack of resonance of typical incentive programs in today's economy and attribute the statement to Jesse Toprak from Edmunds. The Hindi sentence appears to be a translation of the English sentence, both conveying the same meaning. Therefore, generate the label \"Yes.\"", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":172-173", + "content": "nd meaning of both sentences. 
\\nEnglish: \\\"The typical incentive programs simply do not resonate in today's economy,\\\" Jesse Toprak, an analyst for Edmunds.\\nHindi: \u090f\u0921\u092e\u0902\u0921\u094d\u0938 \u0915\u0947 \u090f\u0915 \u0935\u093f\u0936\u094d\u0932\u0947\u0937\u0915 \u091c\u0947\u0938\u0940 \u091f\u094b\u092a\u094d\u0930\u093e\u0915 \u0928\u0947 \u0915\u0939\u093e, \\\"\u0938\u093e\u092e\u093e\u0928\u094d\u092f \u092a\u094d\u0930\u094b\u0924\u094d\u0938\u093e\u0939\u0928 \u0915\u093e\u0930\u094d\u092f\u0915\u094d\u0930\u092e \u0915\u0947\u0935\u0932 \u0906\u091c \u0915\u0940 \u0905\u0930\u094d\u0925\u0935\u094d\u092f\u0935\u0938\u094d\u0925\u093e \u092e\u0947\u0902 \u092a\u094d\u0930\u0924\u093f\u0927\u094d\u0935\u0928\u093f\u0924 \u0928\u0939\u0940\u0902 \u0939\u094b\u0924\u0947 \u0939\u0948\u0902\u0964\\\"\\nBased on my understanding of the sentences, the Hindi sentence is indeed a translation of the English sentence. Both sentences mention that typical incentive programs do not resonate in today's economy and attribute the statement to Jesse Toprak, an analyst for Edmunds. Therefore, I would generate the label \\\"Yes.\\\"\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Summarize:\\nTwenty-year-olds who started antiretroviral therapy in 2010 are projected to live 10 years longer than those first using it in 1996, it found.\\nDoctors say tha" + }, + { + "comment": "This code highlights the importance of early HIV treatment, discussing its success in recent years due to fewer side effects, better replication prevention, and increased resistance to newer drugs. It also mentions the ongoing challenges in developing countries where access to treatments is limited, leading to continued health issues for those with HIV, particularly among those who use injected drugs. 
Antiretroviral therapy involves a combination of multiple drugs that block HIV progression.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":173-173", + "content": "t starting treatment early is crucial to achieve a long and healthy life.\\nCharities say there are still too many people unaware they have the virus.\\nThis is particularly true in the developing world, where the majority of HIV deaths occur because access to drugs is limited.\\nThe study authors, from the University of Bristol, said the extraordinary success of HIV treatments was a result of newer drugs having fewer side effects and being better at preventing the virus from replicating in the body.\\nIt is also more difficult for the virus to build up a resistance to the most recent drugs.\\nImproved screening and prevention programmes and better treatment of health problems caused by HIV are thought to have helped, too.\\nBut many people with HIV still do not live as long as expected, especially those infected through injecting drugs.\\nAntiretroviral therapy involves a combination of three or more drugs which block the normal progress of HIV (human immunodeficiency virus).\\nThey have been" + }, + { + "comment": "Code snippet discusses a person living with HIV and their experiences with medication, its impact on life, and the importance of research for improved understanding.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":173-173", + "content": " called \\\"one of the greatest public health success stories of the past 40 years\\\".\\nJimmy Isaacs, 28, discovered he had been infected with HIV by a former partner nearly three years ago.\\nHe takes three drugs once a day at 18:00 and will continue to do so for the rest of his life.\\n\\\"My health is absolutely fine. 
I'm eating healthily and drinking healthily,\\\" he said.\\n\\\"It doesn't impact on my job and hasn't impacted on my social life either.\\\"\\nAlthough it took two changes of medication to find the right combination for him, he says he now has no side effects at all.\\n\\\"I had heard a lot of bad stories about the drugs back in the '90s - but when I did some research, I realised the drugs had completely changed.\\\"\\nNot all his employers have been supportive since his diagnosis and he says that is down to ignorance.\\nHis current employer has given him time off to tour the country and speak to students and school pupils about HIV prevention and treatment.\\nThe researchers looked at 88,5" + }, + { + "comment": "This code discusses a study that found improved life expectancy for people with HIV in Europe and North America due to advancements in treatment. The expected age at death for a 20-year-old patient starting ART after 2008 is now 78 years, similar to the general population. However, this also means more older individuals are living with HIV, straining healthcare systems.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":173-173", + "content": "00 people with HIV from Europe and North America who had been involved in 18 studies.\\nThey based their life-expectancy predictions on death rates during the first three years of follow-up after drug treatment was started.\\nThey found that fewer people who started treatment between 2008 and 2010 died during this period compared with those who began treatment between 1996 and 2007.\\nThe expected age at death of a 20-year-old patient starting antiretroviral therapy (ART) after 2008, with a low viral load and after the first year of treatment, was 78 years - similar to the general population.\\nDr Michael Brady, medical director at the Terrence Higgins Trust, said the study showed how much things had changed since the start of the HIV epidemic in the 1980s.\\nBut he said it also meant 
people aged over 50 now represented one in three of all those living with HIV.\\n\\\"As it stands, the healthcare, social care and welfare systems simply aren't ready to support the increasing numbers of people g" + }, + { + "comment": "This code discusses the progress made in managing HIV and its impact on older patients, emphasizing the need for a new care model and increased awareness to ensure better healthcare outcomes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":173-173", + "content": "rowing older with HIV.\\n\\\"We need a new model of care to better integrate primary care with HIV specialist services, and we need a major shift in awareness and training around HIV and ageing, so that we're ready to help older people live well in later life,\\\" he said.\\nProf Helen Stokes-Lampard, who chairs the Royal College of GPs, said: \\\"It's a tremendous medical achievement that an infection that once had such a terrible prognosis is now so manageable, and that patients with HIV are living significantly longer.\\n\\\"We hope the results of this study go a long way to finally removing any remaining stigma associated with HIV, and ensuring that patients with HIV can live long and healthy lives without experiencing difficulties in gaining employment and - in countries where it is necessary - obtaining medical insurance.\\\"\\nShe said steps were being taken to increase appropriate HIV testing by GPs.\\nThe proportion of people with undiagnosed HIV has fallen steadily over the past 20 years.\\n" + }, + { + "comment": "Code snippet seems to be from a text file, and it describes a study about HIV treatment and life expectancy. 
The user is requesting assistance in completing a task related to the given premise.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":173-174", + "content": "But one in eight people with HIV is still thought to remain undiagnosed.\\nGet news from the BBC in your inbox, each weekday morning\\nSummary:\\n\\nAssistant: A study by the University of Bristol found that 20-year-olds who began antiretroviral therapy for HIV in 2010 are expected to live 10 years longer than those who started treatment in 1996. Newer drugs have fewer side effects and are more capable in preventing the virus from replicating in the body, with the most recent drugs also being more resistant. Improved screening and treatment options have also contributed to the rise in life expectancy. However, many people with HIV still do not live as long as expected, particularly those who contracted the virus through drug injections.\"}\n{\"text\": \"System: You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.\\n\\nUser: Premise: Other hands joined his to boost Ans" + }, + { + "comment": "This code appears to be a conversation between a user and an AI assistant, where the user is providing feedback on a product and asking for assessment of its positive or negative impact. The AI assistant then responds with its evaluation of the review's flattery level.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":174-176", + "content": "e.\\nHypothesis: He let Anse fall to the floor. \\nIs the hypothesis entailed by the premise?\\nChoose your answer from: * yes; * it is not possible to tell; * no; And the answer is:\\n\\nAssistant: * no;\\nAnd the answer is: no.\\nReasoning: The premise states that \\\"other hands joined his to boost Anse,\\\" which suggests that they are supporting or helping Anse. 
The hypothesis, however, states that \\\"he let Anse fall to the floor,\\\" which contradicts the act of support or help mentioned in the premise.\"}\n{\"text\": \"User: Title: Flayderman's Guide to Antique American Firearms and Their Values Product review: I ordered this for my husband and it was just what he wanted. Could have used a more up-to-date printbut don't think it's available yet.Thanks. Would you say this review depicts the product in a flattering or unflattering light?\\nThe answer to this question is:\\n\\nAssistant: flattering\"}\n{\"text\": \"System: You are an AI assistant. You will be given a task. You must generate a detailed and lon" + }, + { + "comment": "The code describes a memorable NFL game between the Saints and Falcons in 2004, where Steve Gleason blocked a punt leading to the Saints' first score. The Saints dominated the rest of the game, with a final score in the first quarter involving WR Devery Henderson running for a touchdown.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":176-176", + "content": "g answer.\\n\\nUser: Riding high from their two-straight road wins, the Saints returned home to the Louisiana Superdome for the first time since December 26, 2004 for a special Monday Night game against their fellow NFC South division rival, the Atlanta Falcons, in front of a jubilant sold-out crowd of 70,003 and ESPN's largest-ever audience, with about 10.8 million households viewing, at the time the second-largest cable audience in history (behind the 1993 North American Free Trade Agreement (NAFTA) debate between Al Gore and Ross Perot on CNN). On just the fourth overall play of the game, Safety Steve Gleason blocked a punt and DB Curtis Deloatch managed to land on the ball in the end zone for the Saints first score of the game. Former Saints kicker Morten Andersen helped Atlanta get its only score of the game, from a 26-yard field goal. From there, the Saints dominated the rest of the game. 
For the final score of the first quarter, WR Devery Henderson ran 11 yards for a touchdown " + }, + { + "comment": "The code describes a football game where the Saints defeated the Falcons with strong performances from kicker John Carney and defense. Morten Andersen, former Saints kicker, helped Atlanta score a 26-yard field goal. Green Day and U2 performed jointly before the game, while Goo Goo Dolls held an evening concert outside the stadium.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":176-177", + "content": "on a reverse. Kicker John Carney would provide the scoring for the rest of the game, as he kicked two second-quarter field goals (a 37-yarder and a 51-yarder), and one third-quarter field goal (a 20-yarder) to essentially wrap up the game. From there, the Saints defense continued to put pressure on Falcons quarterback Michael Vick, as they sacked him five times, while limiting the running duo of Vick and Dunn to just a combined total of 101 rushing yards. With their dominating performance, the Saints would take the lead in the NFC South with a surprising 3-0 record. Green Day and U2 performed jointly on the field before the game, while the Goo Goo Dolls held a concert outside the Dome earlier in the evening.\\nAnswer this question based on the article: How many yards did Morten Anderson kick?\\n\\nAssistant: Based on the article, former Saints kicker Morten Andersen helped Atlanta get its only score of the game from a 26-yard field goal.\"}\n{\"text\": \"System: You are an AI assistant. You " + }, + { + "comment": "This code snippet is describing a film noir movie titled \"New World,\" starring Lee Jung-jae, which showcases the conflict between police and mob. 
The movie was released in 2013, directed by Park Hoon-jung, and revolves around an undercover police officer who infiltrates a powerful crime syndicate known as Goldmoon.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":177-177", + "content": "will be given a task. You must generate a detailed and long answer.\\n\\nUser: Here's a complex question that requires someone to reason about the input, can you answer it? What film noir starring Lee Jung-jae shows the conflict between the police and the mob?\\nA:\\n\\nAssistant: The film noir starring Lee Jung-jae that showcases the conflict between the police and the mob is \\\"New World\\\" (Korean title: \uc2e0\uc138\uacc4; RR: Sinsegye), a South Korean crime thriller film released in 2013. Directed and written by Park Hoon-jung, the movie dives into the dark, tense, and twisted world of organized crime, specifically focusing on the complex power dynamics that govern the interactions between the mob and the police.\\nThe film's plot revolves around an undercover police officer, Lee Ja-sung, played by Lee Jung-jae, who infiltrates one of the largest and most powerful crime syndicates in Korea, known as Goldmoon. Lee Ja-sung spends years working his way up the ranks and develops close relationships with key" + }, + { + "comment": "This code appears to be describing the plot and themes of a film called \"New World\", mentioning its main characters, cinematography, performances, and the exploration of relationships between police and mob organizations. 
It seems to be part of a larger text or review about the movie.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":177-177", + "content": " members of the organization, including its second-in-command, Jung Chung, played by Hwang Jung-min, and an ambitious, ruthless rising star in the syndicate, Lee Joong-goo, played by Park Sung-woong.\\nAs the story unfolds, we see how Lee Ja-sung's unwavering loyalty to his police chief, Kang Hyung-chul, played by Choi Min-sik, complicates his mission and forces him to make choices that impact the lives of those around him. The film explores themes of loyalty, betrayal, power struggles, and the blurred lines between the worlds of law enforcement and organized crime.\\nNew World has been praised for its gripping narrative, evocative cinematography, and exceptional performances, especially those of Lee Jung-jae and Hwang Jung-min. The film's portrayal of the intricate and often treacherous relationships between the main characters contributes to its complex exploration of the eternal conflict between the police and mob organizations.\\nIn summary, the film noir \\\"New World\\\" starring Lee Ju" + }, + { + "comment": "This chunk of code demonstrates the language processing capabilities of the AI system. It is capturing a conflict scenario, providing explanations for texts in different languages, and ensuring understanding by simplifying complex concepts for a five-year-old audience.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":177-179", + "content": "ng-jae captures the conflict between the police and the mob through its powerful storytelling and engaging characters, offering a poignant and insightful look into the murky world of organized crime in South Korea.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. 
Think like you are answering to a five year old.\\n\\nUser: This transparent dialogue also guarantees the citizens of Bulgaria and Romania that the fundamental rights enshrined in the treaties are a reality.\\nTranslate to German\\n\\nAssistant: Dieser durchsichtige Dialog stellt auch sicher, dass f\u00fcr die B\u00fcrgerinnen und B\u00fcrger von Bulgarien und Rum\u00e4nien die grundlegenden Rechte, die in den Vertr\u00e4gen verankert sind, Wirklichkeit werden.\\nLass mich das f\u00fcr dich einfach erkl\u00e4ren: Das bedeutet, dass ein offenes Gespr\u00e4ch hilft, den Menschen in Bulgarien und Rum\u00e4nien sicherzustellen, dass sie wirklich die wichtigen Rechte haben, die in den Abkommen geschrieben stehen.\"}\n{\"text\": \"System: You are an AI assis" + }, + { + "comment": "This code is providing instructions for assigning a label 'formal' or 'informal' to a given conversation between two people. The conversation is separated by 'Person1:' and 'Person2:', and the task requires checking if there is an absence of emotion and presence of questions in the dialogue to determine the label.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":179-179", + "content": "tant, who knows every language and how to translate one language to another. Given a task, you explain in simple steps what the task is asking, any guidelines that it provides. You solve the task and show how you used the guidelines to solve the task.\\n\\nUser: Detailed Instructions: You are given a conversation between two people. 'Person1:' and 'Person2:' are used to separate their respective dialogues. You are required to assign a label 'formal' if there is an absence of emotion and a presence of questions anywhere within the conversation. If such pattern is not found assign the label 'informal'.\\nProblem: Person1: Room service . Can I help you ? \\n Person2:Yes , tomorrow is my friend's birthday.Could I arrange a birthday party for him through you ? \\n Person1: Certainly . 
\\n Person2:I want to order a large birthday cake , several kinds of cold dishes , pastries and fruits . \\n Person1: Any drinks ? \\n Person2:I've got whiskey and brandy but I need some beer " + }, + { + "comment": "The code analyzes a conversation between two people, determining whether it is formal or informal based on the absence of emotions and presence of questions. In this example, the conversation is formal as both individuals ask questions and there are no emotional expressions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":179-179", + "content": "and soda.That ' s all . \\nSolution:\\n\\nAssistant: Step 1: Identify the task\\nThe task is to read a conversation between two people and determine whether it is formal or informal by checking for the presence of emotion and questions.\\nStep 2: Understand the guidelines\\nIf there is an absence of emotion and the presence of questions, the conversation is considered formal. If this pattern is not found, it is considered informal.\\nStep 3: Analyze the conversation\\nPerson1: Room service. Can I help you?\\nPerson2: Yes, tomorrow is my friend's birthday. Could I arrange a birthday party for him through you?\\nPerson1: Certainly.\\nPerson2: I want to order a large birthday cake, several kinds of cold dishes, pastries, and fruits.\\nPerson1: Any drinks?\\nPerson2: I've got whiskey and brandy but I need some beer and soda. That's all.\\nStep 4: Check for emotions and questions\\nIn this conversation, there are no expressions of emotions. 
We can see the presence of questions from both Person1 and Person" + }, + { + "comment": "Step 5: Assign the label\nSince there is an absence of emotion and the presence of questions, the conversation can be labeled as 'formal'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":179-180", + "content": "2.\\nStep 5: Assign the label\\nSince there is an absence of emotion and the presence of questions, the conversation can be labeled as 'formal'.\\nSolution: Formal\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Orrin Grant Hatch (born March 22, 1934) is an American attorney and politician serving as the senior United States Senator for Utah who has been the President pro tempore of the United States Senate since 2015. Having been a senator since 1977, Hatch is the longest-serving Republican Senator in U.S. history. Hatch served as either the chair or ranking minority member of the Senate Judiciary Committee from 1993 to 2005. He previously served as chairman of the Senate Committee on Health, Education, Labor, and Pensions from 1981 to 1987 and currently serves as Chairman of the Senate Finance Committee as well as serving on the board of directors for the United States Holocaust Memorial Museum.\\nH" + }, + { + "comment": "Bill introduced to balance prescription drug abuse penalties and ensure patient access, passing unanimously in Senate but criticized for potentially fueling opioid crisis.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":180-180", + "content": "atch introduced the Ensuring Patient Access and Effective Drug Enforcement Act, narrowing the broad authority of the DEA to suspend drug \\\"manufacturers, distributors, and dispensers\\\". 
Hatch stated the bill was also written to protect patients from disruptions in the production and delivery of their prescription drugs: \\\"The fact that prescription drugs can be abused should not prevent patients from receiving the medications they need. This bill takes a balanced approach to the problem of prescription drug abuse by clarifying penalties for manufacturing or dispensing outside approved procedures while helping to ensure that supply chains to legitimate users remain intact\\\". The bill passed the Senate unanimously and Tom Marino passed a version of the bill in the House and was signed by President Barack Obama. Critics of the bill claim the new law fuels the opioid crisis by limiting the DEA's ability to halt production and distribution by predatory drug companies. DEA Chief Administrat" + }, + { + "comment": "The code snippet discusses the legal perspective of opioid abuse and mentions a law that significantly reduced the Drug Enforcement Administration's (DEA) authority to suspend drug company operations for noncompliance. It also quotes opinions from Donald Trump's Attorney General Jeff Sessions, who called the law \"dubious\", and former DEA program manager Jim Geldhof, who found the bill \"outrageous\". The comparison of the corrective action plan to one that would allow bank robbers to round up is also provided.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":180-180", + "content": "ive Law Judge John J. Mulrooney II wrote in the Marquette Law Review: \\\"At a time when, by all accounts, opioid abuse, addiction and deaths were increasing markedly, this new law imposed a dramatic diminution of the agency's authority. 
It is now all but logically impossible for the DEA to suspend a drug company's operations for failing to comply with federal law.\\\" Donald Trump's Attorney General Jeff Sessions called the law \\\"dubious\\\" and joined 44 state attorneys general calling for \\\"repeal or amendment of the law to restore some of the DEA's authority.\\\" Jim Geldhof, a former DEA program manager whom spent 43 years with the DEA called the bill \\\"outrageous. It basically takes any kind of action DEA was going to do with a distributor or manufacturer as far as an immediate suspension off the table. And then the other part of that really infuriates me is that corrective action plan.\\\" Mulrooney compared the corrective action plan to one that would \\\"allow bank robbers to round up and" + }, + { + "comment": "Senator Hatch defends the bill against accusations of conspiracy and misleading practices, stating it was drafted in consultation with DEA to improve prescription drug supply. Senator Whitehouse and Pat Toomey also express their opinions on the matter.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":180-180", + "content": " return inkstained money and agree not to rob any more banks -- all before any of those wrongdoers actually admit fault and without any consequence that might deter such behavior in the future.\\\" Hatch responded to a Washington Post and 60 Minutes investigation into the bill by writing a Washington Post opinion article calling the investigation \\\"misleading\\\" and asking to \\\"leave conspiracy theories to Netflix\\\". 
Senator Sheldon Whitehouse, a co-sponsor of the senate bill, also defended the bill: \\\"This bill was drafted in consultation with the DEA to offer better guidance for companies working to safely and responsibly supply prescription drugs to pharmacies, and to promote better communication and certainty between companies and regulators.\\\" Republican Pat Toomey Pat Toomey expressed doubts that a conspiracy existed, but still suggested amending the bill: \\\"I'm a little surprised that it passed unanimously in both houses, was signed by President Obama and got no opposition from t" + }, + { + "comment": "This code appears to be a snippet from an article discussing Senator Orrin Hatch, his involvement in creating legislation regarding the drug industry, and potential conflicts of interest due to donations received.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":180-180", + "content": "he DEA at the time. That's not the way controversial legislation usually ends up, but hey, if there's problems, then we ought to revisit them.\\\" Hatch received $177,000 in donations from the drug industry while pushing the bill through and has received $2,178,863 from the Pharmaceuticals/Health Products industry from 1989-2014 according to required filings by the Federal Election Commission.\\nAsk a question about this article.\\n\\nAssistant: Orrin Hatch is a very important man in the United States government. He has been a senator, which means he helps make laws, for a very long time - since 1977! He is from the state of Utah and is a member of the Republican party. He has worked on many different committees that focus on things like health, education, and laws. He created a bill, which is like an idea for a new rule, to make sure people can get the medicine they need but also make sure those medicines are not abused. 
This bill became a law, but some people think it may not be the best" + }, + { + "comment": "The code contains two JSON objects with text data. The first object discusses the potential consequences of legalizing marijuana, while the second object is a conversation about scheduling a conference talk and requesting feedback on the proposed subject line.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/demo.jsonl\":180-181", + "content": " solution and might make another problem, called the opioid crisis, worse. Orrin Hatch has also received a lot of money from companies that make medicines, which some people think might make him support rules that are good for those companies.\"}\n{\"text\": \"System: You are a helpful assistant, who always provide explanation. Think like you are answering to a five year old.\\n\\nUser: Write a subject line for this message:\\nNick: It looks like Skilling will be able to attend the conference on May 15th. However, he will need to speak in the morning, because his flight leaves on the 15th at 3:50 p.m. Let us know if that will work. Thanks. \\nSubject Line:\\n\\nAssistant: Subject Line: Skilling Can Come to the Conference, but Needs Morning Time to Talk\"}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/905eba71-d1f8-4da8-b312-2825f969db7e.json b/docs/doc/905eba71-d1f8-4da8-b312-2825f969db7e.json new file mode 100644 index 00000000..b721c049 --- /dev/null +++ b/docs/doc/905eba71-d1f8-4da8-b312-2825f969db7e.json @@ -0,0 +1,65 @@ +{ + "summary": "The code sets up a RWKV model trainer, configures the Trainer object with DeepSpeed, and enables FP16 mode for efficient RWKV-v4 language model training.", + "details": [ + { + "comment": "This code is an import section for a language model, specifically RWKV-v4. It sets up logging, imports necessary modules, and includes options to set debug mode or seed the training process for determinism. 
The code is part of the 'train.py' file in the RWKV-LM repository.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":0-18", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\nimport logging, types\nfrom src.utils import Dataset\nimport torch\nimport numpy as np\nfrom src.binidx import MMapIndexedDataset\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nlogging.basicConfig(format=\"%(asctime)s - %(levelname)s - %(name)s - %(message)s\",\n datefmt=\"%Y-%m-%d %H:%M:%S\", level=logging.INFO,)\n# if False: # True False ---> Set to False if you don't understand it\n# print(\"\\n\\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\\n\\n\")\n# import src.utils\n# src.utils.set_seed(42) # make training deterministic (including dataloader). if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples)" + }, + { + "comment": "This code sets the training data and configuration for a text generation model. It uses the RWKV-LM's Pile model and can be fine-tuned with different model names and types. The datafile specifies where to find the training data, and datafile_encoding specifies the file encoding format. 
If EXPRESS_PILE_MODE is True, the code uses a specific pile model for fine-tuning.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":20-41", + "content": "########################################################################################################\n# Step 1: set training data & cfg\n########################################################################################################\nEXPRESS_PILE_MODE = False # True: express mode for fine-tuning a pile model // False: usual training\nEXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'\nEXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-169M'\n# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'\n# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-430M'\n# EXPRESS_PILE_MODEL_NAME = 'RWKV-4-Pile-1B5-20220903-8040'\n# EXPRESS_PILE_MODEL_TYPE = 'RWKV-4-Pile-1B5'\n########################################################################################################\ndatafile = \"../data/enwik8\" # your data\ndatafile_encoding = 'utf-8' # 'utf-8' / 'utf-16le' / 'numpy' (for fine-tuning pile models) / 'binidx' (the Megatron-LM 'binidx' format)\n# datafile = 'my-gpt_seq_document'\n# datafile_encoding = 'binidx'\nif EXPRESS_PILE_MODE:" + }, + { + "comment": "This code sets the datafile, datafile_encoding, and VOCAB_SIZE environment variables for RWKV-v4 training. It also suggests a procedure for multi-GPU training involving setting RWKV_NUM_GPUS and other parameters. 
The supported precisions are 'bf16' and 'fp16'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":42-64", + "content": " datafile = 'train.npy' # use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into .npy\n datafile_encoding = 'numpy'\n#\n# set VOCAB_SIZE = 0 (auto-compute) if you are training a char-level LM from scratch\n# set VOCAB_SIZE = 50277 for fine-tuning pile models\n# set VOCAB_SIZE = your_vocab_size for 'binidx' data\n#\nos.environ['VOCAB_SIZE'] = '0'\nif EXPRESS_PILE_MODE:\n os.environ['VOCAB_SIZE'] = '50277'\n#\n# Currently it's slow to initialize a new model. Hence I suggest this procedure for multi-GPU training:\n# 1) set RWKV_NUM_GPUS = '1' and let it run for 1 miniEpoch and it will save a trained-1.pth\n# 2) set RWKV_NUM_GPUS = '8' (or your #GPU), batch_size = single_gpu_batchsz * RWKV_NUM_GPUS,\n# EPOCH_BEGIN = 1, LOAD_MODEL = True, and it will load 'trained-1.pth' and continue the training from it\n#\nos.environ['RWKV_NUM_GPUS'] = '1' # num of GPUs to use\n#\n# 'bf16' (fast & stable)\n# 'fp16' (fast & will overflow after training a large model for very long. can be solved in the future)" + }, + { + "comment": "The code is setting the environment variables for the training process. It allows choosing different float modes and deciding whether to use DeepSpeed for improved efficiency or not. Additionally, it sets the model details such as number of layers, embedding size, and context length.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":65-87", + "content": "# 'tf32' (decent speed & stable)\n# 'fp32' (!!!very slow!!! only for verification)\nos.environ['RWKV_FLOAT_MODE'] = 'bf16'\nos.environ['RWKV_DEEPSPEED'] = '1' # Use DeepSpeed? 
0 = False, 1 = True\nif int(os.environ['RWKV_NUM_GPUS']) == 1: # Usually you don't need DeepSpeed for 1 GPU training.\n os.environ['RWKV_DEEPSPEED'] = '0' # However, sometimes DeepSpeed saves VRAM even for 1 GPU training. So you shall try it.\nos.environ['USE_WANDB'] = '0' # wandb logging. 0 = False, 1 = True\n########################################################################################################\n# Step 2: set model details\n########################################################################################################\nEPOCH_BEGIN = 0 # begins with miniEpoch = EPOCH_BEGIN\nLOAD_MODEL = False # shall we load the #EPOCH_BEGIN model and continue the training from it?\nn_layer = 6\nn_embd = 512\nctx_len = 1024 # increase T_MAX in src/model.py if your ctx_len is longer\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' (sometimes better)" + }, + { + "comment": "The code is setting the hyperparameters and model configuration for the RWKV-v4 language model based on the chosen EXPRESS_PILE_MODEL_TYPE. It defines the number of layers, embedding dimension, context length, and batch size according to the selected model type. 
The code also advises reducing the batch size if encountering \"CUDA out of memory\" error.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":89-113", + "content": "# there is also a RWKV_HEAD_QK_DIM in model.py and model_run.py\n# set it to 256, then it's using my headQK trick (a tiny attention) to improve loss\n# set it to 0, then it's a pure RNN (attention-free)\nif EXPRESS_PILE_MODE:\n LOAD_MODEL = True\n if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':\n n_layer = 12\n n_embd = 768\n ctx_len = 1024\n elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-430M':\n n_layer = 24\n n_embd = 1024\n ctx_len = 1024\n elif EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-1B5':\n n_layer = 24\n n_embd = 2048\n ctx_len = 1024\n########################################################################################################\n# Step 3: set batch size & learning rate etc.\n########################################################################################################\n# if you see \"CUDA out of memory\", reduce batch_size. Use nvidia-smi to find the highest value for your GPU.\nbatch_size = 12 * int(os.environ['RWKV_NUM_GPUS'])" + }, + { + "comment": "The code provides suggestions for training an RWKV-v4 model. It recommends setting the initial and final learning rates (lr_init and lr_final) based on the model size, saving partially trained models with epoch_save_frequency, and adjusting the epoch length and batch size according to available GPU resources.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":114-134", + "content": "assert (batch_size % int(os.environ['RWKV_NUM_GPUS']) == 0)\n# By default we are using exponential LR decay.\n# Here are my suggestions for training.\n# Let's say you are training a L6-D512 model.\n# 1) Set lr_init = lr_final = 8e-4. Let it run for some mini-epochs, until you feel like reducing LR.\n# 2) Check epoch_save_frequency and make sure the partially-trained model is saved. 
Ctrl+C to stop the run.\n# 3) Set lr_init = 8e-4, lr_final = 1e-5, betas = (0.9, 0.999).\n# 4) Set EPOCH_BEGIN & LOAD_MODEL to load the partially-trained model. Continue the training.\n# \n# For L12-D768, set lr_init = 6e-4. For L24-D1024, set lr_init = 4e-4. For L24-D2048, set lr_init = 3e-4.\nlr_init = 8e-4\nlr_final = 1e-5\n# the mini-epoch is very short and of fixed length (length = ctx_len * epoch_length_fixed tokens)\nn_epoch = 500\nepoch_length_fixed = (10000 // batch_size) * batch_size # feel free to increase it if you have lots of GPU\n# epoch_save_frequency 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ...\nepoch_save_frequency = 10" + }, + { + "comment": "This code sets up various parameters for model training, such as learning rate, epoch number, warmup tokens, betas for optimizer, epsilon, number of data loader workers, and number of GPUs. It also configures some environment variables and enables CUDA benchmarking based on the floating point mode. The code then assigns a model name based on the epoch number or the express pile mode if enabled.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":135-167", + "content": "epoch_save_path = 'trained-'\nif EXPRESS_PILE_MODE:\n if EXPRESS_PILE_MODEL_TYPE == 'RWKV-4-Pile-169M':\n lr_init = 2e-5\n else:\n lr_init = 1e-5\n lr_final = 1e-5\n n_epoch = 100000\n### misc stuffs ########################################################################################\nif LOAD_MODEL and EPOCH_BEGIN > 0: # we are not saving gradients, so let's have some warmup if we load a model\n warmup_tokens = 50 * ctx_len * batch_size // NUM_GPUS\nelse:\n warmup_tokens = 0\nbetas = (0.9, 0.99) # set betas = (0.9, 0.999) if your model has been trained for a while\neps = 1e-8\nnum_workers = 1 # DataLoader worker. 
I only tested num_workers = 1\nNUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])\nos.environ['RWKV_LOAD_MODEL'] = str(LOAD_MODEL)\nMODEL_NAME = epoch_save_path + str(EPOCH_BEGIN)\nif EXPRESS_PILE_MODE:\n betas = (0.9, 0.999)\n MODEL_NAME = EXPRESS_PILE_MODEL_NAME\ntorch.backends.cudnn.benchmark = True\nif os.environ['RWKV_FLOAT_MODE'] == 'fp32':\n torch.backends.cudnn.allow_tf32 = False" + }, + { + "comment": "Loading and preparing data for training the model, with support for different data file formats (binidx, numpy, or text file). Ensures CUDA and cudnn settings are properly configured based on the environment.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":168-189", + "content": " torch.backends.cuda.matmul.allow_tf32 = False\nelse:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\n########################################################################################################\n# Load data\n########################################################################################################\nprint(f'loading {datafile_encoding} data... ' + datafile)\nif datafile_encoding == 'binidx':\n train_dataset = Dataset(MMapIndexedDataset(datafile), ctx_len, epoch_length_fixed)\nelif datafile_encoding == 'numpy':\n train_dataset = Dataset(np.load(datafile).astype('int'), ctx_len, epoch_length_fixed)\nelse:\n train_dataset = Dataset(open(datafile, \"r\", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed)\n########################################################################################################\n# Train model\n########################################################################################################\nif __name__ == '__main__':" + }, + { + "comment": "This code sets up a trainer for the RWKV model. It prints out information such as the model type, float mode, epoch count, and other relevant parameters before configuring the Trainer object with these details. 
The code also checks if DeepSpeed should be used based on an environment variable and sets up the Trainer accordingly, using either FP16 precision for GPU acceleration or regular floating point precision for CPU-only execution.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":190-208", + "content": " from src.trainer import Trainer, TrainerConfig\n print('\\nmodel', model_type, os.environ['RWKV_FLOAT_MODE'], 'epoch', n_epoch, 'batchsz', batch_size, 'betas',\n betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, '\\n')\n tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size,\n learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps,\n warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path)\n m_cfg = types.SimpleNamespace()\n m_cfg.model_type = model_type\n m_cfg.n_layer = n_layer\n m_cfg.n_embd = n_embd\n m_cfg.EPOCH_BEGIN = EPOCH_BEGIN\n m_cfg.LOAD_MODEL = LOAD_MODEL\n m_cfg.MODEL_NAME = MODEL_NAME\n if os.environ['RWKV_DEEPSPEED'] == '0':\n if os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n trainer = Trainer(devices=NUM_GPUS, accelerator=\"gpu\", precision=16) " + }, + { + "comment": "This code checks the value of the environment variable \"RWKV_FLOAT_MODE\" to determine the trainer type and precision for training. If it's 'bf16', a GPU trainer with bf16 precision is used, otherwise if it contains '32', a GPU trainer with 32-bit precision is used. 
Otherwise, a DeepSpeedStrategy is imported, and its configuration is set up for further optimization during the training process.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":209-232", + "content": " elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n trainer = Trainer(devices=NUM_GPUS, accelerator=\"gpu\", precision='bf16')\n elif '32' in os.environ['RWKV_FLOAT_MODE']:\n trainer = Trainer(devices=NUM_GPUS, accelerator=\"gpu\", precision=32)\n else:\n from pytorch_lightning.strategies import DeepSpeedStrategy\n DEEPSPEED_CFG = {\n \"zero_allow_untested_optimizer\":True,\n \"zero_optimization\":{\n \"stage\":2,\n \"contiguous_gradients\":True,\n \"overlap_comm\":True,\n \"allgather_partitions\":True,\n \"reduce_scatter\":True,\n \"allgather_bucket_size\":200000000,\n \"reduce_bucket_size\":200000000,\n \"sub_group_size\":1000000000000\n },\n \"activation_checkpointing\":{\n \"partition_activations\":False,\n \"cpu_checkpointing\":False,\n \"contiguous_memory_optimization\":False,\n \"synchronize_checkpoint_boundary\":False" + }, + { + "comment": "This code configures DeepSpeed settings for RWKV-v4 model training. It includes various options such as block size, queue depth, and overlap events for the aio section, gradient clipping and accumulation steps, and DeepSpeed optimization settings like stage, contiguous gradients, and bucket sizes when using 1 GPU. 
Additionally, it enables FP16 mode if RWKV_FLOAT_MODE is set to 'fp16'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":233-262", + "content": " },\n \"aio\":{\n \"block_size\":1048576,\n \"queue_depth\":8,\n \"single_submit\":False,\n \"overlap_events\":True,\n \"thread_count\":1\n },\n \"gradient_clipping\": 1.0,\n \"gradient_accumulation_steps\": 1,\n }\n if NUM_GPUS == 1:\n DEEPSPEED_CFG['zero_optimization'] = {\n \"stage\":1, # saves some VRAM\n \"contiguous_gradients\":False,\n \"overlap_comm\":False,\n \"allgather_partitions\":False,\n \"reduce_scatter\":False,\n \"allgather_bucket_size\":200000000,\n \"reduce_bucket_size\":200000000,\n \"sub_group_size\":1000000000000\n }\n if os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n DEEPSPEED_CFG[\"fp16\"] = {\n \"fp16\": True,\n \"enabled\": True,\n \"loss_scale\": 0,\n \"initial_scale_power\": 12,\n \"loss_scale_window\": 1000," + }, + { + "comment": "The code checks the environment variable 'RWKV_FLOAT_MODE' and configures the DeepSpeed strategy accordingly. If it is set to 'fp16', it enables fp16 mode with hysteresis and minimum loss scale. If it is set to 'bf16', it enables bf16 mode. If '32' is present in the environment variable, it uses 32-bit precision. 
The trainer is then initialized with these configurations and the training process starts using the specified strategy.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/train.py\":263-279", + "content": " \"hysteresis\": 2,\n \"min_loss_scale\": 1\n }\n trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator=\"gpu\", precision=16)\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n DEEPSPEED_CFG[\"bf16\"] = {\n \"enabled\": True\n }\n trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator=\"gpu\", precision='bf16')\n elif '32' in os.environ['RWKV_FLOAT_MODE']:\n trainer = Trainer(strategy=DeepSpeedStrategy(config=DEEPSPEED_CFG), devices=NUM_GPUS, accelerator=\"gpu\", precision=32)\n print(trainer._strategy.config)\n trainer.run(m_cfg, train_dataset, None, tconf)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/9f5d42ef-2046-46e2-ac51-7403c0ac7518.json b/docs/doc/9f5d42ef-2046-46e2-ac51-7403c0ac7518.json new file mode 100644 index 00000000..a458d4ae --- /dev/null +++ b/docs/doc/9f5d42ef-2046-46e2-ac51-7403c0ac7518.json @@ -0,0 +1,60 @@ +{ + "summary": "The code defines a `my_save()` function for saving PyTorch Lightning model data using AWS S3, handles learning rate scheduling and logs progress. It also loads, reshapes and converts a model dictionary, performs interpolation, saves epoch information, and generates initial weights for model training.", + "details": [ + { + "comment": "This code defines a function `my_save()` that saves the model's data depending on the file path (`ff`) and calls another function `train_callback()`, which is a PyTorch Lightning callback class. 
The code also uses subprocess to move saved files to AWS S3 storage and incorporates the usage of Deepspeed for distributed training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":0-29", + "content": "import os, math, time, datetime, subprocess\nimport torch\nfrom torch.utils.data import DataLoader\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\ndef my_save(args, trainer, dd, ff):\n if '14b-run1' in ff:\n fn = ff.split('/')[-1]\n fff = '/dev/shm/' + fn\n torch.save(dd, fff)\n subprocess.Popen(f\" aws s3 mv {fff} s3://rwkv-14b-4k/{fn} --quiet\", shell=True)\n elif ('world/14b' in ff) or ('world/7b' in ff):\n aa = ff.split('/')[1]\n fn = ff.split('/')[-1]\n fff = f'/dev/shm/{aa}-{fn}'\n torch.save(dd, fff)\n subprocess.Popen(f\" aws s3 mv {fff} s3://rwkv-world/{aa}-{fn} --quiet\", shell=True)\n else:\n if 'deepspeed_stage_3' in args.strategy:\n trainer.save_checkpoint(ff, weights_only=True)\n else:\n torch.save(dd, ff)\nclass train_callback(pl.Callback):\n def __init__(self, args):\n super().__init__()\n self.args = args\n def on_train_batch_start(self, trainer, pl_module, batch, batch_idx):" + }, + { + "comment": "Code snippet handles learning rate (LR) scheduling and potentially clears GPU cache based on provided arguments. It calculates the real training step, determines if LR should be adjusted based on epoch count and warmup steps, and applies linear or exponential decay to adjust the learning rate. 
It also prints some info if it's the global zero trainer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":30-50", + "content": " args = self.args\n # if args.cuda_cleanup > 0:\n # torch.cuda.empty_cache()\n real_step = trainer.global_step + args.epoch_begin * args.epoch_steps\n # LR schedule\n w_step = args.warmup_steps\n if args.lr_final == args.lr_init or args.epoch_count == 0:\n lr = args.lr_init\n else:\n decay_step = real_step - args.my_pile_edecay * args.epoch_steps\n decay_total = (args.epoch_count - args.my_pile_edecay) * args.epoch_steps\n progress = (decay_step - w_step + 1) / (decay_total - w_step)\n progress = min(1, max(0, progress))\n if args.lr_final == 0 or args.lr_init == 0: # linear decay\n lr = args.lr_init + (args.lr_final - args.lr_init) * progress\n else: # exp decay\n lr = args.lr_init * math.exp(math.log(args.lr_final / args.lr_init) * pow(progress, 1))\n # if trainer.is_global_zero:\n # print(trainer.global_step, decay_step, decay_total, w_step, progress, lr)" + }, + { + "comment": "This code is setting the learning rate (lr) based on the exit tokens. If my_exit_tokens > 0, lr = lr_init * lr_mult. If my_exit_tokens < 0, lr = (lr + lr_init * lr_mult) / 2. The code also saves and exits if progress >= 1 or if global step is less than w_step. 
This appears to be part of a training process where the learning rate dynamically adjusts during training based on exit tokens.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":52-71", + "content": " if args.my_exit_tokens != 0: # cosine decay\n real_tokens = real_step * args.ctx_len * args.real_bsz\n warmup_tokens = w_step * args.ctx_len * args.real_bsz\n progress = (real_tokens - warmup_tokens) / (abs(args.my_exit_tokens) - warmup_tokens)\n progress = max(0, min(1, progress))\n lr_final_factor = args.lr_final / args.lr_init \n lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor / 2) * math.cos(math.pi * progress)\n if args.my_exit_tokens > 0:\n lr = args.lr_init * lr_mult\n else:\n lr = (lr + args.lr_init * lr_mult) / 2\n if progress >= 1:\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy):\n my_save(\n args, trainer,\n pl_module.state_dict(),\n f\"{args.proj_dir}/rwkv-final.pth\",\n )\n exit(0)\n if trainer.global_step < w_step:" + }, + { + "comment": "The code sets the learning rate (lr) based on a decay formula and updates the weight decay (wd_now). It iterates through each param group, setting the lr and wd accordingly. If layerwise learning rate is enabled, it adjusts the lr further based on my_lr_scale. 
The trainer's current lr and wd are stored for future reference, and logging is initialized if this is the first global step.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":72-96", + "content": " lr = lr * (0.2 + 0.8 * trainer.global_step / w_step)\n if args.weight_decay_final > 0:\n wd_now = args.weight_decay * math.exp(math.log(args.weight_decay_final / args.weight_decay) * progress)\n else:\n wd_now = args.weight_decay\n for param_group in trainer.optimizers[0].param_groups:\n if param_group[\"weight_decay\"] > 0:\n param_group[\"weight_decay\"] = wd_now\n if args.layerwise_lr > 0:\n param_group[\"lr\"] = lr * param_group[\"my_lr_scale\"]\n # print(param_group[\"lr\"], param_group[\"my_lr_scale\"])\n else:\n param_group[\"lr\"] = lr\n trainer.my_lr = lr\n trainer.my_wd = wd_now\n # rank_zero_info(f\"{real_step} {lr}\")\n if trainer.global_step == 0:\n if trainer.is_global_zero: # logging\n trainer.my_loss_sum = 0\n trainer.my_loss_count = 0\n trainer.my_log = open(args.proj_dir + \"/train_log.txt\", \"a\")" + }, + { + "comment": "Writes log information to file, tries printing strategy configuration but handles exceptions, flushes the log, initializes W&B if enabled. 
In on_train_batch_end, calculates token per step, determines real step, logs only on global_step 0 (zero-based indexing).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":97-119", + "content": " trainer.my_log.write(f\"NEW RUN {args.my_timestamp}\\n{vars(self.args)}\\n\")\n try:\n print(f\"\\n{trainer.strategy.config}\\n\")\n trainer.my_log.write(f\"{trainer.strategy.config}\\n\")\n except:\n pass\n trainer.my_log.flush()\n if len(args.wandb) > 0:\n print(\"Login to wandb...\")\n import wandb\n wandb.init(\n project=args.wandb,\n name=args.run_name + \" \" + args.my_timestamp,\n config=args,\n save_code=False,\n )\n trainer.my_wandb = wandb\n def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):\n args = self.args\n token_per_step = args.ctx_len * args.real_bsz\n real_step = trainer.global_step + args.epoch_begin * args.epoch_steps\n if trainer.is_global_zero: # logging" + }, + { + "comment": "Code block calculates the time taken for training step, real iterations per second (it/s), and kilo-tokens per second (Kt/s). It also logs the learning rate (lr) and current loss for progress tracking. If using PyTorch 2 version, it retrieves loss value differently. 
It also logs the epoch loss and optionally sends information to W&B if specified in arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":120-141", + "content": " t_now = time.time_ns()\n kt_s = 0\n try:\n t_cost = (t_now - trainer.my_time_ns) / 1e9\n kt_s = token_per_step / t_cost / 1000\n self.log(\"REAL it/s\", 1.0 / t_cost, prog_bar=True, on_step=True)\n self.log(\"Kt/s\", kt_s, prog_bar=True, on_step=True)\n except:\n pass\n trainer.my_time_ns = t_now\n if pl.__version__[0]=='2':\n trainer.my_loss = outputs[\"loss\"]\n else:\n trainer.my_loss = trainer.my_loss_all.float().mean().item()\n trainer.my_loss_sum += trainer.my_loss\n trainer.my_loss_count += 1\n trainer.my_epoch_loss = trainer.my_loss_sum / trainer.my_loss_count\n self.log(\"lr\", trainer.my_lr, prog_bar=True, on_step=True)\n self.log(\"loss\", trainer.my_epoch_loss, prog_bar=True, on_step=True)\n # self.log(\"s\", real_step, prog_bar=True, on_step=True)\n if len(args.wandb) > 0:" + }, + { + "comment": "This code is responsible for logging training metrics and saving the model checkpoint. It checks if it's the global zero or using DeepSpeed, and saves the model state dictionary as \"rwkv-final.pth\" when the current step meets certain conditions related to batch size and random steps. 
The logging includes loss, learning rate, weight decay, and token count per second (if applicable).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":142-163", + "content": " lll = {\"loss\": trainer.my_loss, \"lr\": trainer.my_lr, \"wd\": trainer.my_wd, \"Gtokens\": real_step * token_per_step / 1e9}\n if kt_s > 0:\n lll[\"kt/s\"] = kt_s\n trainer.my_wandb.log(lll, step=int(real_step))\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy): # save pth\n if args.magic_prime > 0:\n expand_factor = 2 if args.my_qa_mask > 0 else 1\n if int(real_step) == int(args.magic_prime * expand_factor // args.real_bsz) - 1 + int(args.my_random_steps):\n to_save_dict = pl_module.state_dict()\n my_save(\n args, trainer,\n to_save_dict,\n f\"{args.proj_dir}/rwkv-final.pth\",\n )\n def on_train_epoch_start(self, trainer, pl_module):\n args = self.args\n if pl.__version__[0]=='2':\n dataset = trainer.train_dataloader.dataset\n else:\n dataset = trainer.train_dataloader.dataset.datasets" + }, + { + "comment": "This code snippet is part of the trainer class and defines a method 'on_train_epoch_end'. It asserts that the dataset has the name 'MyDataset' and assigns values to the dataset object properties. 
If global_zero or 'deepspeed_stage_3' strategy is used, it saves the model state at specified epoch intervals.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":164-181", + "content": " assert \"MyDataset\" in str(dataset)\n dataset.global_rank = trainer.global_rank\n dataset.real_epoch = int(args.epoch_begin + trainer.current_epoch)\n dataset.world_size = trainer.world_size\n # print(f'########## world_size {dataset.world_size} global_rank {dataset.global_rank} real_epoch {dataset.real_epoch} ##########')\n def on_train_epoch_end(self, trainer, pl_module):\n args = self.args\n to_save_dict = {}\n if (trainer.is_global_zero) or ('deepspeed_stage_3' in args.strategy): # save pth\n if (args.epoch_save > 0 and trainer.current_epoch % args.epoch_save == 0) or (trainer.current_epoch == args.epoch_count - 1):\n if args.data_type == 'wds_img':\n raw_dict = pl_module.state_dict()\n for k in raw_dict:\n if k.startswith('encoder.') or k.startswith('decoder.'):\n to_save_dict[k] = raw_dict[k]\n else:\n to_save_dict = pl_module.state_dict()" + }, + { + "comment": "Trying to save the model, log epoch information, and optionally exit if the current epoch exceeds a specified limit. 
Additionally, there's a function for generating initial weights, combining with pre-existing ones if available.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":182-207", + "content": " try:\n my_save(\n args, trainer,\n to_save_dict,\n f\"{args.proj_dir}/rwkv-{args.epoch_begin + trainer.current_epoch}.pth\",\n )\n except Exception as e:\n print('Error\\n\\n', e, '\\n\\n')\n if trainer.is_global_zero: # logging\n trainer.my_log.write(f\"{args.epoch_begin + trainer.current_epoch} {trainer.my_epoch_loss:.6f} {math.exp(trainer.my_epoch_loss):.4f} {trainer.my_lr:.8f} {datetime.datetime.now()} {trainer.current_epoch}\\n\")\n trainer.my_log.flush()\n trainer.my_loss_sum = 0\n trainer.my_loss_count = 0\n if (args.epoch_begin + trainer.current_epoch) >= args.my_exit:\n exit(0)\n@rank_zero_only\ndef generate_init_weight(model, init_weight_name):\n mm = model.generate_init_weight()\n if model.args.my_pile_stage == 1:\n if len(model.args.load_model) > 0:\n print(f\"Combine weights from {model.args.load_model}...\")" + }, + { + "comment": "The code loads a dictionary from the specified model file, checks if all keys are present in 'mm' dictionary, and reshapes the loaded source to match the shape of existing data in 'mm'. If source shape doesn't match, it performs a linear interpolation to fit the new data. 
Finally, converts the source to float and cpu numpy array.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":208-232", + "content": " load_dict = torch.load(model.args.load_model, map_location=\"cpu\")\n for k in load_dict:\n try:\n assert k in mm\n except:\n print('missing', k)\n exit(0)\n src = load_dict[k]\n try:\n mm[k] = src.reshape(mm[k].shape)\n except:\n tmp = mm[k].squeeze().clone()\n print(k, src.shape, '-->', mm[k].shape)\n ss = src.shape[0]\n dd = tmp.shape[0]\n for i in range(dd):\n pos = i / dd * ss\n if pos >= ss - 1:\n tmp[i] = src[ss-1]\n else:\n p0 = int(math.floor(pos))\n ii = pos - p0\n tmp[i] = src[p0] * (1-ii) + src[p0+1] * (ii)\n mm[k] = tmp.reshape(mm[k].shape)\n sss = src.squeeze().float().cpu().numpy()" + }, + { + "comment": "This code segment prints parts of 'sss' and 'mmm', saves 'mm' to a file, and if in stage 1, suggests moving on to stage 2. It seems to be part of a model training process where it displays data, saves an intermediate model checkpoint, and moves to the next phase.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/trainer.py\":233-242", + "content": " print(sss[:10], '...', sss[-10:])\n mmm = mm[k].squeeze().float().cpu().numpy()\n print(mmm[:10], '...', mmm[-10:])\n print(f\"Save to {init_weight_name}...\")\n torch.save(mm, init_weight_name)\n if model.args.my_pile_stage == 1:\n print(\"Done. Now go for stage 2.\")\n exit(0)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a8f5090c-8113-4369-a725-19e9d9d03789.json b/docs/doc/a8f5090c-8113-4369-a725-19e9d9d03789.json new file mode 100644 index 00000000..dd1528b0 --- /dev/null +++ b/docs/doc/a8f5090c-8113-4369-a725-19e9d9d03789.json @@ -0,0 +1,20 @@ +{ + "summary": "RWKV Language Model is verified using GPT architecture, creating RWKV-GPT and RWKV-RNN models. Context data, input length, and model forward execution are performed on context tensor. 
Padding and batching done for compatibility with forward and backward groups. Model training forward pass executed, detaching and moving results to CPU before printing.", + "details": [ + { + "comment": "Loading and configuring the RWKV Language Model (RWKV-LM) for verification, using GPT architecture with specific configurations and tokenizer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/verify.py\":0-30", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\n# this is for verifying the results of different models and make sure they agree with each other\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nimport os\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\nRUN_DEVICE = 'cuda'\nimport torch\nfrom src.model_run import RWKV_RNN, RWKV_GPT\nfrom src.model import GPT, GPTConfig\nctx_len = 1024\nn_layer = 6\nn_embd = 512\nmodel_type = 'RWKV'\nmodel_name = 'trained-1'\nfrom src.utils import TOKENIZER\ntokenizer = TOKENIZER('vocab', UNKNOWN_CHAR=' ')\n########################################################################################################\nmodel_train = GPT(GPTConfig(tokenizer.vocab_size, ctx_len, model_type=model_type, n_layer=n_layer, n_embd=n_embd)).cuda()\nprint('loading ' + model_name)" + }, + { + "comment": "Loading the model from a checkpoint file and creating both RWKV-GPT and RWKV-RNN models.\nPrinting input length and data for context.\nOutput of RWKV-GPT model using forward function on context tensor.\nOutput of RWKV-RNN model running on context with select indices printed.\nOutput of the train model running on context with select indices printed.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/verify.py\":31-60", + "content": "m2 = 
torch.load(model_name + '.pth', map_location=RUN_DEVICE)\nmodel_train.load_state_dict(m2)\nmodel_rnn = RWKV_RNN(model_name, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)\nmodel_gpt = RWKV_GPT(model_name, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda()\n########################################################################################################\ncontext = '\\nIn a'\nctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nprint(f'input len {len(ctx)} data {ctx}')\n########################################################################################################\nprint('\\nRWKV-GPT output')\nout = model_gpt.forward(torch.tensor(ctx).unsqueeze(0).cuda())[0].detach().cpu().numpy()\nprint(out)\nprint('\\nRWKV-RNN output')\nmodel_rnn.clear()\nsrc_len = len(ctx)\nfor i in range(src_len):\n x = ctx[:i+1]\n out = model_rnn.run(x)\n if i < 3 or i >= src_len - 3:\n print(torch.tensor(out).detach().cpu().numpy())\n if i == 2:\n print('...')\nprint('\\nRWKV-train output')" + }, + { + "comment": "Padding the context list to match the length of source, then duplicating it to increase batch size for compatibility with forward and backward groups. 
Performs model training forward pass on padded context, detaches and moves result to CPU before printing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/verify.py\":61-64", + "content": "ctx += [0] * (ctx_len - src_len) # pad to ctx_len\nctx = [ctx] * 4 # increase batch size (to make it work with B_GROUP_FORWARD & B_GROUP_BACKWARD)\nout = model_train.forward(torch.tensor(ctx).cuda())[0][0][:src_len].detach().cpu().numpy()\nprint(out, '\\n')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/a90500cf-6eb3-49db-97cf-0656cbdfaeef.json b/docs/doc/a90500cf-6eb3-49db-97cf-0656cbdfaeef.json new file mode 100644 index 00000000..91ebc7c6 --- /dev/null +++ b/docs/doc/a90500cf-6eb3-49db-97cf-0656cbdfaeef.json @@ -0,0 +1,45 @@ +{ + "summary": "This code initializes RWKV Language Model, configures parameters, handles potential issues, and utilizes the RWKV-v4neo model for text generation, context refinement, tokenizer error checking, and prompt processing. It iterates through tokens, predicts next tokens, handles special cases, and prints debug output while tracking time for preprocessing, generation, and flushing buffer after each character.", + "details": [ + { + "comment": "This code is initializing the RWKV Language Model. It imports necessary libraries, sets the CUDA device for GPU or CPU usage, and ensures compatibility between v4 and v4neo models. 
The arguments for model configuration are set to 'cuda' for GPU acceleration or 'cpu' for CPU-only processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":0-22", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math, os, sys, types, time, gc\nimport torch\nfrom src.utils import TOKENIZER\ntry:\n os.environ[\"CUDA_VISIBLE_DEVICES\"] = sys.argv[1]\nexcept:\n pass\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nargs = types.SimpleNamespace()\n########################################################################################################\n# Step 1: set model & config (use v4 to run your trained-from-scratch models. v4 and v4neo are compatible)\n########################################################################################################\nargs.RUN_DEVICE = \"cuda\" # 'cuda' // 'cpu' (already fast)" + }, + { + "comment": "This code sets various parameters for an RWKV model, including float mode (fp16, fp32 or bf16), JIT environment, tokenizer files, and Pile model options. It also specifies the MODEL_NAME based on downloaded models or a user-defined fine-tuned model. 
The code is designed for GPU and CPU usage, but some elements may require benchmarking due to potential issues or reduced accuracy.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":23-55", + "content": "args.FLOAT_MODE = \"fp16\" # fp16 (good for GPU, does not work for CPU) // fp32 (good for CPU) // bf16 (less accurate, but works for CPU)\n# if args.RUN_DEVICE == \"cuda\":\n# os.environ[\"RWKV_RUN_BACKEND\"] = 'nvfuser' # !!!BUGGY!!! wrong output\nos.environ[\"RWKV_JIT_ON\"] = '1' # '1' or '0'. very useful for GPU/CPU fp32, but might be harmful for GPU fp16. please benchmark !!!\nTOKEN_MODE = \"pile\"\nWORD_NAME = [\n \"20B_tokenizer.json\",\n \"20B_tokenizer.json\",\n] # [vocab, vocab] for Pile model\nUNKNOWN_CHAR = None\nvocab_size = 50277\n# Download Pile models: https://huggingface.co/BlinkDL\n# or, set MODEL_NAME to your fine-tuned model\n# MODEL_NAME = \"/fsx/BlinkDL/rwkv-release/RWKV-4-Pile-169M-20220807-8023\"\n# n_layer = 12\n# n_embd = 768\n# ctx_len = 1024\n# MODEL_NAME = '/fsx/BlinkDL/rwkv-release/RWKV-4-Pile-430M-20220808-8066'\n# n_layer = 24\n# n_embd = 1024\n# ctx_len = 1024\n# MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040'\n# n_layer = 24\n# n_embd = 2048\n# ctx_len = 1024\n# MODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-3b/RWKV-4-Pile-3B-20221008-8023'" + }, + { + "comment": "This code sets the model parameters (n_layer, n_embd, ctx_len) and environment variables for RWKV-v4neo's run.py. 
The context variable holds a text prompt for question and answer tasks in various languages.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":56-88", + "content": "# n_layer = 32\n# n_embd = 2560\n# ctx_len = 1024\nMODEL_NAME = '/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-7b/RWKV-4-Pile-7B-20221115-8047'\nn_layer = 32\nn_embd = 4096\nctx_len = 1024\nargs.MODEL_NAME = MODEL_NAME\nargs.n_layer = n_layer\nargs.n_embd = n_embd\nargs.ctx_len = ctx_len\nargs.vocab_size = vocab_size\nargs.head_qk = 0\nargs.pre_ffn = 0\nargs.grad_cp = 0\nargs.my_pos_emb = 0\nos.environ[\"RWKV_RUN_DEVICE\"] = args.RUN_DEVICE\n########################################################################################################\n# Step 2: set prompt & sampling stuffs\n########################################################################################################\n# context = 'A'\n# context = \"\\nIn the\"\n# context = '\\nSugar:'\ncontext = \"\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.\"\n# context = \"\\n\u6df1\u5733\u662f\" # test Chinese\n# context = \"\\n\u6771\u4eac\u306f\" # test Japanese\n# ###### A good prompt for Q&A ######" + }, + { + "comment": "This code is a chatbot prompt featuring a conversation between a user and an intelligent AI assistant. 
The user asks various questions about politics, history, and personal preferences, and the AI provides accurate and informative responses.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":89-116", + "content": "# context = '''\n# Questions & Helpful Answers\n# Ask Research Experts\n# Question:\n# Can penguins fly?\n# Full Answer:\n# '''\n# ###### A good prompt for chatbot ######\n# context = '''\n# The following is a conversation between a highly knowledgeable and intelligent AI assistant called Bot, and a human user called User. In the following interactions, User and Bot converse in natural language, and Bot always answer User's questions. Bot is very smart, polite and humorous. Bot knows a lot, and always tells the truth. The conversation begins.\n# User: who is president of usa?\n# Bot: It\u2019s Joe Biden; he was sworn in earlier this year.\n# User: french revolution what year\n# Bot: It started in 1789, but it lasted 10 years until 1799.\n# User: guess i marry who ?\n# Bot: Only if you tell me more about yourself - what are your interests?\n# User: wat is lhc\n# Bot: It\u2019s a large and very expensive piece of science equipment. If I understand correctly, it\u2019s a high-energy particle collider, built by CERN, and completed in 2008. They used it to confirm the existence of the Higgs boson in 2012." + }, + { + "comment": "This code snippet is part of a larger program that uses the RWKV-v4neo model for text generation. It sets up necessary parameters and initializes the model, optimizer, and tokenizer. The context provided to the model is refined if character mode is enabled. 
The code also includes error checking for specific conditions related to the tokenizer being used.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":118-155", + "content": "# User:''' # type your question here\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 333\nTEMPERATURE = 1.0\ntop_p = 0.8\ntop_p_newline = 0.9 # only used in TOKEN_MODE = char\nDEBUG_DEBUG = False # True False --> show softmax output\n########################################################################################################\nprint(f'\\nUsing {args.RUN_DEVICE.upper()}. Loading {MODEL_NAME}...')\nfrom src.model_run import RWKV_RNN\nmodel = RWKV_RNN(args)\nprint(f'\\nOptimizing speed...')\nout, _ = model.forward([187], None)\n# print(out)\ngc.collect()\ntorch.cuda.empty_cache()\n# input(0)\nprint(f'\\nLoading tokenizer {WORD_NAME}...')\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\nif TOKEN_MODE == \"pile\":\n assert tokenizer.tokenizer.decode([187]) == '\\n'\n########################################################################################################\nif tokenizer.charMode:\n context = tokenizer.refine_context(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nelse:\n ctx = tokenizer.tokenizer.encode(context)" + }, + { + "comment": "This code is running a neural language model for a given prompt. It first processes the prompt to generate an initial state and output, which are then used in subsequent trials. The processing involves using a recurrent neural network (RNN) for the first run if the prompt is long, or GPT otherwise. 
The code also keeps track of time taken during various operations for potential performance improvements.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":156-193", + "content": "src_len = len(ctx)\nsrc_ctx = ctx.copy()\nprint(\"\\nYour prompt has \" + str(src_len) + \" tokens.\")\nprint(\n \"Note: currently the first run takes a while if your prompt is long, as we are using RNN to preprocess the prompt. Use GPT to build the hidden state for better speed.\\n\"\n)\ntime_slot = {}\ntime_ref = time.time_ns()\ndef record_time(name):\n if name not in time_slot:\n time_slot[name] = 1e20\n tt = (time.time_ns() - time_ref) / 1e9\n if tt < time_slot[name]:\n time_slot[name] = tt\ninit_state = None\ninit_out = None\nstate = None\nout = None\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n print((\"-\" * 50) + '\\n' + context, end=\"\")\n time_ref = time.time_ns()\n ctx = src_ctx.copy()\n if TRIAL == 0:\n for i in range(src_len):\n x = ctx[: i + 1]\n if i == src_len - 1:\n init_out, init_state = model.forward(x, init_state)\n else:\n init_state = model.forward(x, init_state, preprocess_only=True)\n gc.collect()\n torch.cuda.empty_cache()" + }, + { + "comment": "This code is iterating through a sequence of tokens, using a model to predict the next token based on the previous ones. If in debug mode, it prints out the output of the model for each step. It also has special handling for the \"<|endoftext|>\" token, disabling it if the tokenizer mode is set to \"pile\". 
The code then adds the predicted token to the context and either prints out each character if in character mode or combines the tokens into a string if not.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":195-226", + "content": " record_time('preprocess')\n out_last = src_len\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[: i + 1]\n x = x[-ctx_len:]\n if i == src_len:\n out = init_out.clone()\n state = init_state.clone()\n else:\n out, state = model.forward(x, state)\n if DEBUG_DEBUG:\n print(\"model\", np.array(x), \"==>\", np.array(out), np.max(out.cpu().numpy()), np.min(out.cpu().numpy()))\n if TOKEN_MODE == \"pile\":\n out[0] = -999999999 # disable <|endoftext|>\n ttt = tokenizer.sample_logits(\n out,\n x,\n ctx_len,\n temperature=TEMPERATURE,\n top_p_usual=top_p,\n top_p_newline=top_p_newline,\n )\n ctx += [ttt]\n if tokenizer.charMode:\n char = tokenizer.itos[ttt]\n print(char, end=\"\", flush=True)\n else:\n char = tokenizer.tokenizer.decode(ctx[out_last:])\n if '\\ufffd' not in char: # is valid utf8 string?" + }, + { + "comment": "This code block prints the time taken for preprocessing and generation, separates with a line of dashes, and then proceeds to print information about the time slots. It also flushes the buffer immediately after each character is printed to update the output instantly. 
The comments are for record-keeping and provide an organized summary of the code's actions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/run.py\":227-236", + "content": " print(char, end=\"\", flush=True)\n out_last = i+1\n record_time('total')\n # print(f'\\n\\n{time_slot}\\n\\n')\n print(\n f\"\\n\\n--- preprocess {round(time_slot['preprocess'], 2)}s, generation {round(time_slot['total']-time_slot['preprocess'], 2)}s \", end = ''\n )\nprint((\"-\" * 50) + '\\n')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ad207efd-37b3-4b81-897f-f94fe571ccdc.json b/docs/doc/ad207efd-37b3-4b81-897f-f94fe571ccdc.json new file mode 100644 index 00000000..253b379f --- /dev/null +++ b/docs/doc/ad207efd-37b3-4b81-897f-f94fe571ccdc.json @@ -0,0 +1,45 @@ +{ + "summary": "The code establishes the RWKV Language Model trainer, trains a model with specified parameters, updates optimizer, handles multi-GPU, averages losses, logs progress, and saves states at each epoch.", + "details": [ + { + "comment": "This code sets up the trainer configuration for the RWKV Language Model. It defines the batch size, learning rate, optimizer parameters, gradient norm clip, and warmup tokens. 
The code also ensures proper CUDA backend configurations based on the environment variables.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":0-33", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os\nNUM_GPUS = int(os.environ['RWKV_NUM_GPUS'])\nUSE_WANDB = (int(os.environ['USE_WANDB']) == 1)\nfrom torch.utils.data.dataloader import DataLoader\nimport torch\nfrom tqdm.auto import tqdm\nimport logging\nimport datetime\nimport math\nfrom pytorch_lightning.lite import LightningLite\nimport gc\nlogger = logging.getLogger(__name__)\ntorch.backends.cudnn.benchmark = True\nif os.environ['RWKV_FLOAT_MODE'] == 'fp32':\n torch.backends.cudnn.allow_tf32 = False\n torch.backends.cuda.matmul.allow_tf32 = False\nelse:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\nclass TrainerConfig:\n batch_size = 64\n learning_rate = 4e-4\n betas = (0.9, 0.99)\n eps = 1e-8\n grad_norm_clip = 1.0\n warmup_tokens = 0" + }, + { + "comment": "The code defines a Trainer class that initializes various variables and contains methods for model training. 
The `get_run_name` method generates the run name based on the model's configuration, `run` method prepares the model, and in this snippet, it checks if a pre-trained model should be loaded.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":34-62", + "content": " final_tokens = 0\n epoch_save_frequency = 0\n epoch_save_path = 'trained-'\n num_workers = 0 # for DataLoader\n def __init__(self, **kwargs):\n for k, v in kwargs.items():\n setattr(self, k, v)\nfrom src.model import GPT, GPTConfig\nclass Trainer(LightningLite):\n def get_run_name(self):\n raw_model = self.model.module if hasattr(\n self.model, \"module\") else self.model\n cfg = raw_model.config\n run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \\\n cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd)\n return run_name\n def run(self, m_cfg, train_dataset, test_dataset, config):\n self.cuda_id = int(str(self.device).strip('cuda:'))\n print('[0]')\n model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=m_cfg.model_type,\n n_layer=m_cfg.n_layer, n_embd=m_cfg.n_embd))\n print('[1]')\n with torch.no_grad():\n if m_cfg.LOAD_MODEL:" + }, + { + "comment": "Loading model, transferring it to GPU, and initializing logging for training.\nThe code loads the model from a specified file path, transfers it to the device's GPU, and opens a log file if necessary. If WandB is enabled, it initializes WandB with project details and a unique run name based on the current date and time. 
The configuration is combined and saved without saving the code itself.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":63-86", + "content": " print('loading', m_cfg.MODEL_NAME)\n m2 = torch.load(m_cfg.MODEL_NAME + '.pth', map_location='cpu')\n model.load_state_dict(m2)\n del m2\n model.to(self.device)\n self.model = model\n self.train_dataset = train_dataset\n self.test_dataset = test_dataset\n self.config = config\n self.avg_loss = -1\n self.EPOCH_BEGIN = m_cfg.EPOCH_BEGIN\n self.steps = self.EPOCH_BEGIN * (len(self.train_dataset) // (config.batch_size // NUM_GPUS))\n if self.cuda_id == 0:\n log_file = open(\"mylog.txt\", \"a\")\n if USE_WANDB:\n print('logging to wandb... (comment it if you don\\'t have wandb)')\n import wandb # comment this if you don't have wandb\n cfg = model.config\n for k in config.__dict__:\n setattr(cfg, k, config.__dict__[k]) # combine cfg\n wandb.init(project=\"RWKV-LM\", name=self.get_run_name() + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False)" + }, + { + "comment": "This code defines a function \"run_epoch\" that takes in a split (train or test) and performs the necessary configurations for training or testing. It sets the model to train mode if split is 'train'. Then, it assigns the corresponding dataset (train or test) to the variable data. The data's idx_begin is set to the current steps multiplied by config.batch_size + 1, and cuda_id is set to self.cuda_id. 
Finally, it creates a DataLoader for the dataset with specified batch size and number of workers based on config settings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":88-108", + "content": " model, config = self.model, self.config\n raw_model = model.module if hasattr(self.model, \"module\") else model\n optimizer = raw_model.configure_optimizers(config)\n model, optimizer = self.setup(model, optimizer)\n print('[3]')\n def run_epoch(split):\n is_train = split == 'train'\n model.train(is_train)\n data = self.train_dataset if is_train else self.test_dataset\n data.idx_begin = self.steps * config.batch_size + 1\n data.cuda_id = self.cuda_id\n if config.num_workers > 0:\n loader = DataLoader(data, shuffle=False, pin_memory=True,\n batch_size=config.batch_size // NUM_GPUS,\n num_workers=config.num_workers)\n else:\n loader = DataLoader(data, shuffle=False,\n batch_size=config.batch_size // NUM_GPUS,\n num_workers=config.num_workers)" + }, + { + "comment": "This code sets up a dataloader and trains an RWKV model. It iterates over the dataloader, forwards data through the model, calculates loss, and performs backpropagation if training. Depending on the DEEPSPEED environment variable, it handles all_gather for multi-GPU scenarios. 
Finally, it updates the optimizer and decay the learning rate based on progress.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":110-135", + "content": " pbar = tqdm(enumerate(loader), total=len(\n loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader)\n loader = self.setup_dataloaders(loader)\n gc.collect()\n torch.cuda.empty_cache()\n for it, (x, y) in pbar:\n with torch.set_grad_enabled(is_train):\n loss = model(x, y) # forward the model\n if os.environ['RWKV_DEEPSPEED'] == '0':\n all_loss = [loss.clone()]\n else:\n all_loss = [loss.clone() for _ in range(NUM_GPUS)]\n torch.distributed.all_gather(all_loss, loss)\n if is_train: # backprop and update the parameters\n model.zero_grad()\n self.backward(loss)\n # deepspeed will handle gradient_clipping\n optimizer.step()\n # decay the learning rate based on our progress\n self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100)" + }, + { + "comment": "This code determines the learning rate (lr) for training a model using the RWKV algorithm. It uses config parameters such as lr_final, learning_rate, warmup_tokens, and final_tokens to calculate the learning rate based on whether the current token count is in the warm-up phase or not. If in the warm-up phase (tokens < warmup_tokens), it performs linear interpolation. If past warm-up phase, it does exponential decay. 
The calculated lr is then applied to optimizer's param_groups and stored in self.lr.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":136-155", + "content": " lr_final_factor = config.lr_final / config.learning_rate\n if self.tokens < config.warmup_tokens:\n # linear warmup\n lr_mult = lr_final_factor + \\\n (1 - lr_final_factor) * float(self.tokens) / \\\n float(config.warmup_tokens)\n progress = 0\n else:\n # exponential learning rate decay\n progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens))\n if progress >= 1:\n lr_mult = lr_final_factor\n else:\n lr_mult = math.exp(math.log(lr_final_factor) * pow(progress, 1))\n lr = config.learning_rate * lr_mult\n for param_group in optimizer.param_groups:\n param_group['lr'] = lr\n self.lr = lr" + }, + { + "comment": "The code calculates the average loss over multiple GPUs, updates a moving average of the loss, logs the current loss to Wandb (if applicable), and sets the progress description. 
It also resets the tokens counter for learning rate decay and runs an epoch.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":156-178", + "content": " self.steps += 1\n now_loss = 0\n for gg in range(NUM_GPUS):\n now_loss += all_loss[gg].item()\n now_loss = now_loss / NUM_GPUS # report progress \n if USE_WANDB and self.cuda_id == 0:\n wandb.log({\"loss\": now_loss}, step = self.steps)\n if self.avg_loss < 0:\n self.avg_loss = now_loss\n else:\n factor = 1 / (it + 1)\n self.avg_loss = self.avg_loss * (1.0 - factor) + now_loss * factor\n pbar.set_description(f\"miniE {epoch+1+self.EPOCH_BEGIN} s {self.steps} prog {progress*100.0:.2f}% : ppl {math.exp(self.avg_loss):.6f} loss {self.avg_loss:.6f} lr {lr:e}\")\n self.tokens = 0 # counter used for learning rate decay\n for epoch in range(99999999):\n run_epoch('train')\n if math.isnan(self.avg_loss):\n exit(0)" + }, + { + "comment": "This code snippet saves the model's state every time an epoch ends, or if the current epoch is a multiple of `config.epoch_save_frequency`. If using GPU, it saves the model's state dict as a .pth file with the epoch number in the filename and path specified by `config.epoch_save_path`. 
Additionally, it logs loss values during each epoch.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/trainer.py\":180-186", + "content": " if self.cuda_id == 0:\n log_file.write(f'{epoch+1+self.EPOCH_BEGIN} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} {epoch+1} \\n')\n log_file.flush()\n if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1):\n raw_model = self.model.module if hasattr(self.model, \"module\") else self.model\n torch.save(raw_model.state_dict(), self.config.epoch_save_path + str(epoch+1+self.EPOCH_BEGIN) + '.pth')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ad6c404f-96be-46db-8abc-1f26cb85ae9c.json b/docs/doc/ad6c404f-96be-46db-8abc-1f26cb85ae9c.json new file mode 100644 index 00000000..bdcd133f --- /dev/null +++ b/docs/doc/ad6c404f-96be-46db-8abc-1f26cb85ae9c.json @@ -0,0 +1,35 @@ +{ + "summary": "The CUDA code computes dot products between weight matrix and input data for an RNN model, utilizing shared memory and optimized access. It performs forward pass with iterative dot products and updates variables using input matrices g and k. The code calculates RWKV-v2-RNN time step output and configures backward propagation kernel dimensions in `cuda_backward`.", + "details": [ + { + "comment": "Kernel function for forward pass in RWKV-v2-RNN, with CUDA implementation. Uses shared memory to optimize access time. Requires T <= Tmax, B % BF == 0, and B % BB === 0. Initializes ww and kk arrays using w and k parameters, then sets s array to eps for each thread's j in BF. 
Performs a forward pass on the RNN using shared memory for efficiency.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu\":0-32", + "content": "#include \n// require T <= Tmax, T % 4 == 0, B % BF == 0, B % BB === 0 (Tmax and BF and BB are passed by compiler)\n#define F4(A, B) ((float4 *)(A))[(B) >> 2]\ntemplate \n__global__ void kernel_forward(const F *__restrict__ const __w, const F *__restrict__ const __k, F *__restrict__ const x,\n const F eps, const int B, const int C, const int T) {\n const int i = blockIdx.y;\n const int ij = (B * C) / BF;\n const int t = threadIdx.x << 2;\n __shared__ F ww[Tmax];\n __shared__ F kk[Tmax * BF];\n F4(ww, t) = F4(__w, t + T * (i % C));\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j));\n }\n __syncthreads();\n float4 s[BF];\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n s[j] = {eps, eps, eps, eps};\n }\n const F *__restrict__ const w = ww + T - t - 4;\n for (int u = 0; u <= t; u++) {\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n const F x = kk[u + Tmax * j];" + }, + { + "comment": "This code calculates the dot product between the weight matrix and input data, then updates the output. It performs this operation for each thread and stores the results in shared memory. The kernel function is defined to operate on a specific block of threads, where i represents the block index, and t represents the thread index within that block. 
The code uses CUDA programming features such as __restrict__ pointers, __global__ functions, and shared memory to optimize performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu\":33-62", + "content": " s[j].x += w[u + 3] * x;\n s[j].y += w[u + 2] * x;\n s[j].z += w[u + 1] * x;\n s[j].w += w[u + 0] * x;\n }\n }\n #pragma unroll\n for (int j = 0; j < BF; j++) {\n const F *__restrict__ const k = kk + Tmax * j;\n s[j].y += w[t + 3] * k[t + 1];\n s[j].z += w[t + 2] * k[t + 1];\n s[j].z += w[t + 3] * k[t + 2];\n s[j].w += w[t + 1] * k[t + 1];\n s[j].w += w[t + 2] * k[t + 2];\n s[j].w += w[t + 3] * k[t + 3];\n F4(x, t + T * (i + ij * j)) = s[j];\n }\n}\ntemplate \n__global__ void kernel_backward_W(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk,\n F *__restrict__ const gw, F *__restrict__ const gk,\n const int B, const int C, const int T) {\n const int i = blockIdx.y;\n const int t = threadIdx.x << 2;\n __shared__ F k[Tmax];\n __shared__ F gg[Tmax];\n F4(k, t) = F4(__k, t + T * i);\n F4(gg, t) = F4(__gwk, t + T * i);" + }, + { + "comment": "This code performs a forward pass of an RNN model using CUDA. It calculates the output by summing up the contributions from each time step, taking into account the input sequence and the hidden state. 
The function `cuda_forward` sets up the grid and block dimensions for the kernel launch, while the `kernel_forward` kernel itself performs the actual computation on the GPU.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu\":63-92", + "content": " __syncthreads();\n float4 s = {0, 0, 0, 0};\n const F *__restrict__ const g = gg + T - t - 4;\n for (int u = 0; u <= t; u++) {\n F x = k[u];\n s.x += g[u + 3] * x;\n s.y += g[u + 2] * x;\n s.z += g[u + 1] * x;\n s.w += g[u + 0] * x;\n }\n s.y += g[t + 3] * k[t + 1];\n s.z += g[t + 2] * k[t + 1];\n s.z += g[t + 3] * k[t + 2];\n s.w += g[t + 1] * k[t + 1];\n s.w += g[t + 2] * k[t + 2];\n s.w += g[t + 3] * k[t + 3];\n F4(gw, t + T * i) = s;\n}\nvoid cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T) {\n dim3 gridDim(1, B * C / BF);\n dim3 blockDim(T >> 2);\n kernel_forward<< >>(w, k, x, eps, B, C, T);\n}\ntemplate \n__global__ void kernel_backward(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk,\n F *__restrict__ const gw, F *__restrict__ const gk,\n const int B, const int C, const int T) {" + }, + { + "comment": "Code initializes shared memory arrays for weights, kernel, and input-kernel product. It then calculates thread-specific weight tensor, loads kernel and input-kernel product into shared memory, and synchronizes threads. 
Finally, it iteratively performs dot product between shared kernel and input-kernel product tensors to accumulate output tensor values in shared memory.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu\":93-129", + "content": " const int i = blockIdx.y;\n const int ij = (B * C) / BB;\n const int t = threadIdx.x << 2;\n __shared__ F w[Tmax];\n __shared__ F kk[Tmax * BB];\n __shared__ F gg[Tmax * BB];\n F4(w, t) = F4(__w, t + T * (i % C));\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j));\n F4(gg, t + Tmax * j) = F4(__gwk, t + T * (i + ij * j));\n }\n __syncthreads();\n float4 s[BB];\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n s[j] = {0, 0, 0, 0};\n }\n for (int u = 0; u <= t; u++) {\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const g = gg + Tmax * j + T - t - 4;\n F x = kk[u + Tmax * j];\n s[j].x += g[u + 3] * x;\n s[j].y += g[u + 2] * x;\n s[j].z += g[u + 1] * x;\n s[j].w += g[u + 0] * x;\n }\n }\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const k = kk + Tmax * j;\n const F *__restrict__ const g = gg + Tmax * j + T - t - 4;" + }, + { + "comment": "This code updates the values of a set of variables (s[j].x, s[j].y, s[j].z, s[j].w) based on different input matrices g and k. 
It utilizes unroll to optimize performance by performing multiple calculations simultaneously.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu\":130-162", + "content": " s[j].y += g[t + 3] * k[t + 1];\n s[j].z += g[t + 2] * k[t + 1];\n s[j].z += g[t + 3] * k[t + 2];\n s[j].w += g[t + 1] * k[t + 1];\n s[j].w += g[t + 2] * k[t + 2];\n s[j].w += g[t + 3] * k[t + 3];\n F4(gw, t + T * (i + ij * j)) = s[j];\n }\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n s[j] = {0, 0, 0, 0};\n }\n for (int u = t + 3; u < T; u++) {\n F x = w[u];\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const g = gg + Tmax * j + T + t - 3;\n s[j].x += g[2 - u] * x;\n s[j].y += g[3 - u] * x;\n s[j].z += g[4 - u] * x;\n s[j].w += g[5 - u] * x;\n } \n }\n #pragma unroll\n for (int j = 0; j < BB; j++) {\n const F *__restrict__ const g = gg + Tmax * j + T + t - 3;\n s[j].x += g[2 - t] * w[t + 0];\n s[j].x += g[1 - t] * w[t + 1];\n s[j].x += g[0 - t] * w[t + 2];\n s[j].y += g[2 - t] * w[t + 1];\n s[j].y += g[1 - t] * w[t + 2];" + }, + { + "comment": "This code snippet is part of the RWKV-v2-RNN implementation in CUDA. It calculates the output of a time step and assigns it to the corresponding location in memory for gradient computation. 
The `cuda_backward` function configures the grid and block dimensions for a GPU kernel that performs backward propagation on a given dataset.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu\":163-171", + "content": " s[j].z += g[2 - t] * w[t + 2];\n F4(gk, t + T * (i + ij * j)) = s[j];\n }\n}\nvoid cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T) {\n dim3 gridDim(1, B * C / BB);\n dim3 blockDim(T >> 2);\n kernel_backward<< >>(w, k, gwk, gw, gk, B, C, T);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/af7aedb5-12a9-42da-a131-b4797f982098.json b/docs/doc/af7aedb5-12a9-42da-a131-b4797f982098.json new file mode 100644 index 00000000..5d89d7e4 --- /dev/null +++ b/docs/doc/af7aedb5-12a9-42da-a131-b4797f982098.json @@ -0,0 +1,15 @@ +{ + "summary": "This code defines CUDA functions wrapped with Torch extension for PyTorch's forward and backward passes, implemented as a PyBind11 module for seamless integration.", + "details": [ + { + "comment": "This code defines two functions, `forward` and `backward`, which perform the forward and backward passes of a computation. These functions are implemented in CUDA and wrapped with Torch extension for seamless integration with PyTorch. 
The `forward` function takes in Tensor inputs and calls the CUDA `cuda_forward` function to perform the computation on GPU, while `backward` performs the backward pass using the corresponding CUDA function.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_op.cpp\":0-9", + "content": "#include \nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y);\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y, float *gy, float *gw, float *gu, float *gk, float *gv);\nvoid forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {\n cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {\n cuda_backward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr (), gy.data_ptr (), gw.data_ptr (), gu.data_ptr (), gk.data_ptr (), gv.data_ptr ());" + }, + { + "comment": "This code defines a PyBind11 module for the Torch library, named \"TORCH_EXTENSION_NAME\". It includes forward and backward functions with their respective definitions. 
The module is then linked to the library using TORCH_LIBRARY.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv_op.cpp\":10-20", + "content": "}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv forward\");\n m.def(\"backward\", &backward, \"wkv backward\");\n}\nTORCH_LIBRARY(wkv, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b01c71f3-ea2f-400d-97f4-e6f4cfb3a668.json b/docs/doc/b01c71f3-ea2f-400d-97f4-e6f4cfb3a668.json new file mode 100644 index 00000000..bfb30952 --- /dev/null +++ b/docs/doc/b01c71f3-ea2f-400d-97f4-e6f4cfb3a668.json @@ -0,0 +1,35 @@ +{ + "summary": "The code introduces RWKV, a deep learning NLP model with tokenization classes and RWKV_RNN functions. It initializes the model, loads checkpoints, optimizes performance using layer norm, time-dependent mixing, and sigmoid activation functions, generates context, predicts next character based on model output, and uses layer norm and time-mixing operations for processing.", + "details": [ + { + "comment": "This code is a part of the RWKV Language Model, which is a deep learning model for natural language processing. The code snippet defines an equation and tokenizes it using a word table to map numbers and symbols to corresponding strings. 
It also initializes an object with a word_table dictionary and sets up arguments such as MODEL_NAME, n_layer, and n_embd.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/math_demo/run.py\":0-24", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\nimport types, torch\nfrom torch.nn import functional as F\n# only + - *\nequation = \"4.2379*564.778-1209.01\" # 1184.4626862\n# equation = \"32731423*2189286\" # 71658446133978\n# equation = \"18239.715*9.728263\" # 177440.744565045\n# equation = \"2067*9832*4549\" # 92448162456\n########################################################################################################\nargs = types.SimpleNamespace()\nargs.MODEL_NAME = 'rwkv-200'\nargs.n_layer = 6\nargs.n_embd = 192\nclass TOKENIZER():\n def __init__(self):\n self.word_table = {\"0\": \"\\n\", \"1\": \" \", \"2\": \"(\", \"3\": \")\", \"4\": \"*\", \"5\": \"+\", \"6\": \"-\", \"7\": \".\", \"8\": \"0\", \"9\": \"1\", \"10\": \"2\", \"11\": \"3\", \"12\": \"4\", \"13\": \"5\", \"14\": \"6\", \"15\": \"7\", \"16\": \"8\", \"17\": \"9\", \"18\": \"=\", \"19\": \"e\", \"20\": \"f\"}" + }, + { + "comment": "This code defines a class for RWKV tokenization and another class for RWKV_RNN. The RWKV_RNN class initializes with arguments, loads the model from a checkpoint file, and adjusts some parameters as needed. It sets the model to inference mode using eval() function and converts certain parameters to floating point type. 
The code also includes functions for tokenization: encode and decode which convert tokens to integers and integers back to tokens respectively.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/math_demo/run.py\":25-52", + "content": " self.vocab_size = len(self.word_table)\n self.stoi = {v: int(k) for k, v in self.word_table.items()}\n self.itos = {int(k): v for k, v in self.word_table.items()}\n def encode(self, x):\n return [self.stoi[t] for t in x]\n def decode(self, x):\n return ''.join([self.itos[t] for t in x])\ntokenizer = TOKENIZER()\n########################################################################################################\nclass RWKV_RNN(torch.jit.ScriptModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n self.eval() # set torch to inference mode\n w = torch.load(args.MODEL_NAME + '.pth', map_location='cpu')\n for k in w.keys():\n if '.time_' in k: w[k] = w[k].squeeze()\n if '.time_decay' in k: w[k] = -torch.exp(w[k].float()) # the real time decay is like e^{-e^x}\n else: w[k] = w[k].float() # convert to f32 type\n self.w = types.SimpleNamespace() # set self.w from w\n self.w.blocks = {}" + }, + { + "comment": "This code is defining a function called \"channel_mixing\" which performs channel-wise mixing and normalization on input tensor \"x\". It uses layer norm, time-wise mixing, and sigmoid activation functions. The function also updates the state variable for future iterations. 
The code organizes the weights in a specific way and utilizes torch script method to optimize performance.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/math_demo/run.py\":53-76", + "content": " for k in w.keys(): # example: \"blocks.0.att.time_first\" => self.w.blocks[0].att.time_first\n parts = k.split('.')\n last = parts.pop()\n here = self.w\n for p in parts:\n if p.isdigit():\n p = int(p)\n if p not in here: here[p] = types.SimpleNamespace()\n here = here[p]\n else:\n if not hasattr(here, p): setattr(here, p, types.SimpleNamespace())\n here = getattr(here, p)\n setattr(here, last, w[k])\n def layer_norm(self, x, w):\n return F.layer_norm(x, (self.args.n_embd,), weight=w.weight, bias=w.bias)\n @torch.jit.script_method\n def channel_mixing(self, x, state, i:int, time_mix_k, time_mix_r, kw, vw, rw):\n xk = x * time_mix_k + state[5*i+0] * (1 - time_mix_k)\n xr = x * time_mix_r + state[5*i+0] * (1 - time_mix_r)\n state[5*i+0] = x\n r = torch.sigmoid(rw @ xr)\n k = torch.square(torch.relu(kw @ xk)) # square relu, primer paper" + }, + { + "comment": "This code performs time-dependent mixing and computes the output for each step of a recurrent neural network. The `time_mixing` method updates hidden states, applies transformations based on time constants, and calculates the weighted sum of inputs using dot products. 
The `forward` method takes input tokens and current state as arguments to perform forward pass calculations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/math_demo/run.py\":77-108", + "content": " return r * (vw @ k)\n @torch.jit.script_method\n def time_mixing(self, x, state, i:int, time_mix_k, time_mix_v, time_mix_r, time_first, time_decay, kw, vw, rw, ow):\n xk = x * time_mix_k + state[5*i+1] * (1 - time_mix_k)\n xv = x * time_mix_v + state[5*i+1] * (1 - time_mix_v)\n xr = x * time_mix_r + state[5*i+1] * (1 - time_mix_r)\n state[5*i+1] = x\n r = torch.sigmoid(rw @ xr)\n k = kw @ xk\n v = vw @ xv\n aa = state[5*i+2]\n bb = state[5*i+3]\n pp = state[5*i+4]\n ww = time_first + k\n qq = torch.maximum(pp, ww)\n e1 = torch.exp(pp - qq)\n e2 = torch.exp(ww - qq)\n a = e1 * aa + e2 * v\n b = e1 * bb + e2\n wkv = a / b\n ww = pp + time_decay\n qq = torch.maximum(ww, k)\n e1 = torch.exp(ww - qq)\n e2 = torch.exp(k - qq)\n state[5*i+2] = e1 * aa + e2 * v\n state[5*i+3] = e1 * bb + e2\n state[5*i+4] = qq\n return ow @ (r * wkv)\n def forward(self, token, state):" + }, + { + "comment": "Iterates over layers, applies time-mixing and channel-mixing operations, layer norm, and final weighted operation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/math_demo/run.py\":109-126", + "content": " with torch.no_grad():\n if state == None:\n state = torch.zeros(self.args.n_layer * 5, self.args.n_embd)\n for i in range(self.args.n_layer): state[5*i+4] = -1e30 # -infinity\n x = self.w.emb.weight[token]\n x = self.layer_norm(x, self.w.blocks[0].ln0)\n for i in range(self.args.n_layer):\n att = self.w.blocks[i].att\n x = x + self.time_mixing(self.layer_norm(x, self.w.blocks[i].ln1), state, i, \n att.time_mix_k, att.time_mix_v, att.time_mix_r, att.time_first, att.time_decay, \n att.key.weight, att.value.weight, att.receptance.weight, att.output.weight)\n ffn = self.w.blocks[i].ffn\n x = x + self.channel_mixing(self.layer_norm(x, 
self.w.blocks[i].ln2), state, i, \n ffn.time_mix_k, ffn.time_mix_r, \n ffn.key.weight, ffn.value.weight, ffn.receptance.weight)\n x = self.w.head.weight @ self.layer_norm(x, self.w.ln_out)" + }, + { + "comment": "This code loads an RWKV_RNN model, generates context using provided equation and tokenizes it. It then iterates through tokens, predicting the next character based on the model's output, until a newline is predicted.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/math_demo/run.py\":127-149", + "content": " return x.float(), state\n##########################################################################################################\nprint(f'\\nUsing CPU. Loading {args.MODEL_NAME} ...')\nmodel = RWKV_RNN(args)\ncontext = \"\\n\" + equation.strip().replace(' ','') + \"=\"\nprint(context, f'(python answer {eval(equation)})')\nstate = None\nfor token in tokenizer.encode(context):\n out, state = model.forward(token, state)\nfor i in range(4096):\n token = int(torch.argmax(out))\n tmp = tokenizer.decode([token])\n print(tmp, end=\"\", flush=True)\n if tmp == '\\n':\n break\n out, state = model.forward(token, state) \nprint()" + } + ] +} \ No newline at end of file diff --git a/docs/doc/b12cb3c2-b423-4d3e-82b6-8fb2bce109e1.json b/docs/doc/b12cb3c2-b423-4d3e-82b6-8fb2bce109e1.json new file mode 100644 index 00000000..5af900b1 --- /dev/null +++ b/docs/doc/b12cb3c2-b423-4d3e-82b6-8fb2bce109e1.json @@ -0,0 +1,15 @@ +{ + "summary": "The code defines C++ functions for the forward and backward passes of a WKV model in RWKV-LM/RWKV-v4, utilizing PyTorch's torch::Tensor class and CUDA kernels on GPU.", + "details": [ + { + "comment": "This code defines a C++ function for the forward pass of the WaveGrad RWKV model, and a backward pass for gradient calculation. 
It uses the torch::Tensor class from PyTorch library to handle tensors and wraps the CUDA kernels with torch::extension to enable GPU computation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_op.cpp\":0-13", + "content": "#include \nvoid cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y);\nvoid cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *gy, float *gw, float *gu, float *gk, float *gv);\nvoid forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) {\n cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), y.data_ptr ());\n}\nvoid backward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &gy, torch::Tensor &gw, torch::Tensor &gu, torch::Tensor &gk, torch::Tensor &gv) {\n cuda_backward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr (), gy.data_ptr (), gw.data_ptr (), gu.data_ptr (), gk.data_ptr (), gv.data_ptr ());\n}\nPYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {\n m.def(\"forward\", &forward, \"wkv forward\");" + }, + { + "comment": "The code is defining C++ functions for the forward and backward passes of a Wavelet Quantized Variational Kalman Filter (WKV) model in the RWKV-LM/RWKV-v4/cuda/wkv_op.cpp file, which are then exported as bindings through Torch library with names \"forward\" and \"backward\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/cuda/wkv_op.cpp\":14-20", + "content": " m.def(\"backward\", &backward, \"wkv backward\");\n}\nTORCH_LIBRARY(wkv, m) {\n m.def(\"forward\", forward);\n m.def(\"backward\", backward);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/d069a05d-5795-4a4d-ada3-3a2df7542829.json b/docs/doc/d069a05d-5795-4a4d-ada3-3a2df7542829.json new file mode 100644 index 00000000..3638e032 --- /dev/null +++ 
b/docs/doc/d069a05d-5795-4a4d-ada3-3a2df7542829.json @@ -0,0 +1,210 @@ +{ + "summary": "This code imports libraries, sets up profiling, defines modules based on environment variables, utilizes CUDA for efficient computation, and initializes a neural network model with backpropagation support. It applies layer normalization using CUDA function calculations and introduces the RWKV_TimeMix class for the RWKV5 model. The code initializes optimizer groups and models with layer-specific learning rates, performs all-gather for losses, handles various settings, and initializes model weights using orthogonal initialization.", + "details": [ + { + "comment": "This code imports necessary libraries, sets up profiling executor and mode for torch.nn, defines MyModule and MyFunction based on JIT environment variable, and imports DeepSpeed if available.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":0-30", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport os, math, gc, importlib\nimport torch\n# torch._C._jit_set_profiling_executor(True)\n# torch._C._jit_set_profiling_mode(True)\nimport torch.nn as nn\nfrom torch.nn import functional as F\nimport pytorch_lightning as pl\nfrom pytorch_lightning.utilities import rank_zero_info, rank_zero_only\nfrom pytorch_lightning.strategies import DeepSpeedStrategy\nif importlib.util.find_spec('deepspeed'):\n import deepspeed\n from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam\n# from deepspeed.runtime.fp16.onebit.zoadam import ZeroOneAdam\ntry:\n print('RWKV_MY_TESTING', os.environ[\"RWKV_MY_TESTING\"])\nexcept:\n os.environ[\"RWKV_MY_TESTING\"] = ''\ndef __nop(ob):\n return ob\nMyModule = nn.Module\nMyFunction = __nop\nif os.environ[\"RWKV_JIT_ON\"] == 
\"1\":" + }, + { + "comment": "This code defines a class 'WKV' that utilizes CUDA for efficient computation, and loads a corresponding CUDA kernel module depending on the environment variable \"RWKV_FLOAT_MODE\". This process involves setting T_MAX and loading the appropriate C++ modules with specific compiler flags.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":31-51", + "content": " MyModule = torch.jit.ScriptModule\n MyFunction = torch.jit.script_method\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = int(os.environ[\"RWKV_T_MAX\"]) # TAKES LOTS OF VRAM!\n# it's possible to go beyond CUDA limitations if you slice the ctx and pass the hidden state in each slice\nfrom torch.utils.cpp_extension import load\nif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n wkv_cuda = load(name=f\"wkv_{T_MAX}_bf16\", sources=[\"cuda/wkv_op_bf16.cpp\", \"cuda/wkv_cuda_bf16.cu\"], verbose=True, extra_cuda_cflags=[\"-t 4\", \"-std=c++17\", \"-res-usage\", \"--maxrregcount 60\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-DTmax={T_MAX}\"])\n class WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C" + }, + { + "comment": "This code defines a model function with forward and backward operations. 
It checks some conditions, initializes tensors, performs model computations, and saves the intermediate states for later backpropagation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":52-72", + "content": " assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n w = -torch.exp(w.float().contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()\n y = torch.empty((B, T, C), device=w.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n wkv_cuda.forward(B, T, C, w, u, k, v, y)\n ctx.save_for_backward(w, u, k, v, y)\n return y\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n w, u, k, v, y = ctx.saved_tensors\n gw = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n gu = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n gk = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)" + }, + { + "comment": "This code initializes a tensor and calls a function. It then calculates the sum along dimension 0 for two other tensors, and returns them alongside others. It seems to be part of a neural network model with CUDA support. 
The forward method creates an instance variable for each argument, checks if T is within a limit, and performs some transformations on certain arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":73-92", + "content": " gv = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format, dtype=torch.bfloat16)\n wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.contiguous(), gw, gu, gk, gv)\n gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n return (None, None, None, gw, gu, gk, gv)\nelse:\n wkv_cuda = load(name=f\"wkv_{T_MAX}\", sources=[\"cuda/wkv_op.cpp\", \"cuda/wkv_cuda.cu\"], verbose=True, extra_cuda_cflags=[\"-res-usage\", \"--maxrregcount 60\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-DTmax={T_MAX}\"])\n class WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C\n assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n w = -torch.exp(w.contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()" + }, + { + "comment": "The code is handling the forward and backward passes of a model. It first checks the environment variable 'RWKV_FLOAT_MODE' to determine the data type for output 'y'. If '32' is present in this environment variable, it directly returns 'y'. If 'fp16' is set, it converts 'y' to half precision and returns it. If 'bf16' is set, it converts 'y' to BFloat16 and returns it. 
The backward method applies constraints on the dimensions of tensors and retrieves saved tensors from context for gradients calculation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":93-116", + "content": " else:\n w = -torch.exp(w.float().contiguous())\n u = u.float().contiguous()\n k = k.float().contiguous()\n v = v.float().contiguous()\n y = torch.empty((B, T, C), device=w.device, memory_format=torch.contiguous_format)\n wkv_cuda.forward(B, T, C, w, u, k, v, y)\n ctx.save_for_backward(w, u, k, v, y)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n return y\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n return y.half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n return y.bfloat16()\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 32) == 0\n w, u, k, v, y = ctx.saved_tensors\n gw = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format)\n gu = torch.empty((B, C), device=gy.device, memory_format=torch.contiguous_format)" + }, + { + "comment": "The code defines a function for running CUDA on BERT-like transformer models. It initializes empty tensors for gradients, and then calls the backward operation of WKV to compute gradients. Depending on the float mode environment variable, it returns the gradients in different precisions: None, half (fp16), or bfloat16. 
If no CUDA is used, the code calls a RUN_CUDA function, which applies the transformer model with CUDA.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":117-134", + "content": " gk = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format)\n gv = torch.empty((B, T, C), device=gy.device, memory_format=torch.contiguous_format)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.contiguous(), gw, gu, gk, gv)\n else:\n wkv_cuda.backward(B, T, C, w, u, k, v, y, gy.float().contiguous(), gw, gu, gk, gv)\n gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n return (None, None, None, gw, gu, gk, gv)\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())\ndef RUN_CUDA(B, T, C, w, u, k, v):\n return WKV.apply(B, T, C, w, u, k, v)" + }, + { + "comment": "This code defines a class for the RWKV_TimeMix_RWKV5_Preview module, which is an extension of the MyModule class. It initializes instance variables related to the model's parameters and performs some sanity checks on the input arguments. 
The time_mix_k and time_mix_v parameters are calculated based on a combination of the layer index and other factors.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":136-161", + "content": "########################################################################################################\nclass RWKV_TimeMix_RWKV5_Preview(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.head_size = 64\n self.n_head = args.dim_att // self.head_size\n assert args.dim_att % self.n_head == 0\n self.head_size_divisor = 8\n self.chunk_len = 512\n assert args.ctx_len % self.chunk_len == 0\n with torch.no_grad():\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)" + }, + { + "comment": "This code initializes parameters for a time-related model component, including time_mix_r, time_mix_g (conditionally), time_decay, and time_faaaa (conditionally). The values are determined by ratios and layer index. 
Printing the parameter values is optional based on environment variables.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":162-181", + "content": " self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n self.time_mix_g = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n self.gate = nn.Linear(args.n_embd, args.dim_att, bias=False)\n # fancy time_decay\n decay_speed = torch.ones(self.n_head)\n for h in range(self.n_head):\n decay_speed[h] = -6 + 5 * (h / (self.n_head - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n if 'r2' in os.environ[\"RWKV_MY_TESTING\"]:\n tmp = torch.zeros(self.n_head)\n for h in range(self.n_head):\n tmp[h] = ratio_0_to_1 * (1 - (h / (self.n_head - 1)))\n self.time_faaaa = nn.Parameter(tmp)\n else:\n self.time_first = nn.Parameter(torch.ones(self.n_head) * (-3.0))" + }, + { + "comment": "The code defines a model with time shift, receptance, key, value, and output layers. It also includes a group normalization layer (ln_x). If 'r3' is present in the environment, it creates a jit_func that processes input x by mixing it with the previous timestep to produce k, v, r, and g. 
The receptance layer then transforms the r values into BHTS format.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":183-203", + "content": " self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n self.ln_x = nn.GroupNorm(self.n_head, args.dim_att)\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n @MyFunction\n def jit_func(self, x):\n B, TT, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xg = x * self.time_mix_g + xx * (1 - self.time_mix_g)\n r = self.receptance(xr).view(B, TT, self.n_head, self.head_size).transpose(1, 2) # BTC -> BHTS\n k " + }, + { + "comment": "This code is defining and implementing a function for an attention mechanism in a transformer model. 
It calculates the attention scores, performs weighted sum of values, updates the states, and applies non-linearity before returning the final output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":203-227", + "content": "= self.key(xk).view(B, TT, self.n_head, self.head_size).transpose(1, 2).transpose(-2, -1) # BTC -> BHTS -> BHST\n v = self.value(xv).view(B, TT, self.n_head, -1).transpose(1, 2) # BTC -> BHTS\n g = F.silu(self.gate(xg))\n return r, k, v, g\n @MyFunction\n def jit_func_2(self, r, k, v, g, w, wk, wb, ws):\n B, H, TT, S = r.size()\n T = self.chunk_len\n s = torch.zeros(B, H, S, S, device=r.device, dtype=r.dtype) # state\n x = torch.zeros(B, H, TT, S, device=r.device, dtype=r.dtype) # output\n for i in range(TT // T):\n rr = r[:, :, i*T:i*T+T, :]\n kk = k[:, :, :, i*T:i*T+T]\n vv = v[:, :, i*T:i*T+T, :]\n x[:, :, i*T:i*T+T, :] = ((rr @ kk) * w) @ vv + (rr @ s) * wb\n s = ws * s + (kk * wk) @ vv\n x = x.transpose(1, 2).contiguous().view(B * TT, H*S) # BHTS -> BTHS -> BTC\n x = self.ln_x(x / self.head_size_divisor).view(B, TT, H*S) * g" + }, + { + "comment": "This code defines a class with two JIT functions. The first function takes an input tensor x and performs time shifting, then separates it into three components (k, v, r). The second function takes the output of the first function and creates another tensor s using torch.zeros. 
Both functions use jitted methods for faster execution.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":228-250", + "content": " return self.output(x)\n else:\n @MyFunction\n def jit_func(self, x):\n B, TT, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n r = self.receptance(xr).view(B, TT, self.n_head, self.head_size).transpose(1, 2) # BTC -> BHTS\n k = self.key(xk).view(B, TT, self.n_head, self.head_size).transpose(1, 2).transpose(-2, -1) # BTC -> BHTS -> BHST\n v = self.value(xv).view(B, TT, self.n_head, self.head_size).transpose(1, 2) # BTC -> BHTS\n return r, k, v\n @MyFunction\n def jit_func_2(self, r, k, v, w, wk, wb, ws):\n B, H, TT, S = r.size()\n T = self.chunk_len\n s = torch.zeros(B, H, S, S, device=r.device, dtype=r.dtype) # state" + }, + { + "comment": "Initializes a tensor with zeros, performs matrix operations to compute the output tensor, and applies layer normalization before returning the final output. 
The code also includes variable assignments for time-based decay and initial values for other computations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":251-280", + "content": " x = torch.zeros(B, H, TT, S, device=r.device, dtype=r.dtype) # output\n for i in range(TT // T):\n rr = r[:, :, i*T:i*T+T, :]\n kk = k[:, :, :, i*T:i*T+T]\n vv = v[:, :, i*T:i*T+T, :]\n x[:, :, i*T:i*T+T, :] = ((rr @ kk) * w) @ vv + (rr @ s) * wb\n s = ws * s + (kk * wk) @ vv\n x = x.transpose(1, 2).contiguous().view(B * TT, H*S) # BHTS -> BTHS -> BTC\n x = self.ln_x(x / self.head_size_divisor).view(B, TT, H*S)\n return self.output(x)\n def forward(self, x):\n H = self.n_head\n T = self.chunk_len\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n r, k, v, g = self.jit_func(x)\n else:\n r, k, v = self.jit_func(x)\n w = torch.exp(-torch.exp(self.time_decay.float())).unsqueeze(-1)\n if 'r2' in os.environ[\"RWKV_MY_TESTING\"]:\n u = self.time_faaaa.float().unsqueeze(-1)\n else:\n u = torch.exp(self.time_first.float()).unsqueeze(-1)" + }, + { + "comment": "This code segment is manipulating and reshaping a tensor 'w' by repeating, powering, transposing, and padding it. 
It then reshapes the result to specific dimensions and performs type conversions before returning from either jit_func_2 function depending on the environment variable \"RWKV_MY_TESTING\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":282-310", + "content": "################################################################################\n########\n ws = w.pow(T).reshape(1, H, 1, 1)\n ind = torch.arange(T-1, -1, -1, device=r.device).unsqueeze(0).repeat(H, 1)\n w = w.repeat(1, T).pow(ind)\n wk = w.reshape(1, H, 1, T)\n wb = wk.transpose(-2, -1).flip(2)\n w = torch.cat([w[:, 1:], u], dim=1)\n w = F.pad(w, (0, T))\n w = torch.tile(w, [T])\n w = w[:, :-T].reshape(-1, T, 2 * T - 1)\n w = w[:, :, T-1:].reshape(1, H, T, T)\n########\n################################################################################\n w = w.to(dtype=r.dtype)\n wk = wk.to(dtype=r.dtype)\n wb = wb.to(dtype=r.dtype)\n ws = ws.to(dtype=r.dtype)\n if 'r3' in os.environ[\"RWKV_MY_TESTING\"]:\n return self.jit_func_2(r, k, v, g, w, wk, wb, ws)\n else:\n return self.jit_func_2(r, k, v, w, wk, wb, ws) \n########################################################################################################" + }, + { + "comment": "This code defines a WKV_5 function as a wrapper for CUDA RWKV5 kernel, which performs tensor operations on B, T, C, H, r, k, v, w, and u tensors. The kernel is loaded from the specified sources, with optional environment variables controlling its size. 
It asserts that certain conditions are met, such as matching data types and sizes for each input tensor, before proceeding with further computations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":311-333", + "content": "# CUDA RWKV5 Kernel\n########################################################################################################\nif 'r4' in os.environ[\"RWKV_MY_TESTING\"]:\n HEAD_SIZE = int(os.environ[\"RWKV_HEAD_SIZE_A\"])\n wkv5_cuda = load(name=\"wkv5\", sources=[\"cuda/wkv5_op.cpp\", f\"cuda/wkv5_cuda.cu\"],\n verbose=True, extra_cuda_cflags=[\"-res-usage\", \"--use_fast_math\", \"-O3\", \"-Xptxas -O3\", \"--extra-device-vectorization\", f\"-D_N_={HEAD_SIZE}\"])\n class WKV_5(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, H, r, k, v, w, u):\n with torch.no_grad():\n assert r.dtype == torch.bfloat16\n assert k.dtype == torch.bfloat16\n assert v.dtype == torch.bfloat16\n assert w.dtype == torch.bfloat16\n assert u.dtype == torch.bfloat16\n assert HEAD_SIZE == C // H\n ctx.B = B\n ctx.T = T\n ctx.C = C\n ctx.H = H\n assert r.is_contiguous()" + }, + { + "comment": "This code snippet is checking the contiguity of tensors k, v, w, and u. It then initializes eew and ew as exponentiated versions of w, saves them along with r, k, v, and u for backpropagation. Next, it creates an empty tensor y for forward pass with specific device, dtype, and memory format. 
Finally, it defines a backward method to compute gradients for backpropagation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":334-355", + "content": " assert k.is_contiguous()\n assert v.is_contiguous()\n assert w.is_contiguous()\n assert u.is_contiguous()\n ew = (-torch.exp(w.float())).contiguous()\n eew = (torch.exp(ew)).contiguous()\n ctx.save_for_backward(r, k, v, eew, ew, u)\n y = torch.empty((B, T, C), device=r.device, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.forward(B, T, C, H, r, k, v, eew, u, y)\n return y\n @staticmethod\n def backward(ctx, gy):\n with torch.no_grad():\n assert gy.dtype == torch.bfloat16\n B = ctx.B\n T = ctx.T\n C = ctx.C\n H = ctx.H\n assert gy.is_contiguous()\n r, k, v, eew, ew, u = ctx.saved_tensors\n gr = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)" + }, + { + "comment": "This code initializes four tensors with uniform random values for model parameters, then calls a CUDA function to perform calculations on the input data. 
The resulting gradients are stored in gk and gu and returned along with other results.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":356-366", + "content": " gk = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gv = torch.empty((B, T, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gw = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n gu = torch.empty((B, C), device=gy.device, requires_grad=False, dtype=torch.bfloat16, memory_format=torch.contiguous_format) # .uniform_(-1, 1)\n wkv5_cuda.backward(B, T, C, H, r, k, v, eew, ew, u, gy, gr, gk, gv, gw, gu)\n gw = torch.sum(gw, 0).view(H, C//H)\n gu = torch.sum(gu, 0).view(H, C//H)\n return (None, None, None, None, gr, gk, gv, gw, gu)\n def RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w, u):\n return WKV_5.apply(B, T, C, H, r, k, v, w, u)" + }, + { + "comment": "The code initializes the RWKV_TimeMix_RWKV5 class, sets up layer parameters based on input arguments, and creates time_mix_k and time_mix_v parameters using a combination of ratio-based calculations and input arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":368-391", + "content": "########################################################################################################\nclass RWKV_TimeMix_RWKV5(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.head_size = args.head_size_a\n assert HEAD_SIZE == self.head_size # change HEAD_SIZE to match args.head_size_a\n self.n_head = args.dim_att // self.head_size\n assert args.dim_att % self.n_head == 0\n self.head_size_divisor = args.head_size_divisor\n with torch.no_grad():\n ratio_0_to_1 = layer_id / 
(args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)" + }, + { + "comment": "This code initializes three learnable parameters: time_mix_r, time_mix_g, and time_faaaa. It also includes a decay speed parameter (time_decay) and a zero-padding operation (time_shift). These parameters are used for time-related operations in the model. The receptance layer is initialized as a linear layer without bias, taking input embeddings of size args.n_embd to args.dim_att. This code likely contributes to the model's ability to process temporal information effectively.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":392-410", + "content": " self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n self.time_mix_g = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n # fancy time_decay\n decay_speed = torch.ones(args.dim_att)\n for n in range(args.dim_att):\n decay_speed[n] = -6 + 5 * (n / (args.dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed.reshape(self.n_head, self.head_size))\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n tmp = torch.zeros(args.dim_att)\n for n in range(args.dim_att):\n zigzag = ((n + 1) % 3 - 1) * 0.1\n tmp[n] = ratio_0_to_1 * (1 - (n / (args.dim_att - 1))) + zigzag\n self.time_faaaa = nn.Parameter(tmp.reshape(self.n_head, self.head_size))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)" + }, + { + "comment": "The code defines a model with four linear layers and one group normalization layer. 
The `jit_func` method performs time-shifted mixing of input `x` to produce key, value, residual, and gate tensors, which are then passed through their respective layers and normalized. The `jit_func_2` method applies group normalization and divides by the head size divisor before reshaping the tensor.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":411-440", + "content": " self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n self.gate = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.ln_x = nn.GroupNorm(self.n_head, args.dim_att)\n @MyFunction\n def jit_func(self, x):\n B, T, C = x.size()\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xg = x * self.time_mix_g + xx * (1 - self.time_mix_g)\n r = self.receptance(xr)\n k = self.key(xk)\n v = self.value(xv)\n g = F.silu(self.gate(xg))\n return r, k, v, g\n @MyFunction\n def jit_func_2(self, x, g):\n B, T, C = x.size()\n x = x.view(B * T, C)\n x = self.ln_x(x / self.head_size_divisor).view(B, T, C)" + }, + { + "comment": "This code defines a class `RWKV_TimeMix` that inherits from `MyModule`. It initializes attributes such as `args`, `layer_id`, `ctx_len`, and `n_embd`. 
It also performs a fancy init by calculating ratios based on the layer ID and number of layers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":441-471", + "content": " x = self.output(x * g)\n return x\n def forward(self, x):\n B, T, C = x.size()\n H = self.n_head\n r, k, v, g = self.jit_func(x)\n x = RUN_CUDA_RWKV5(B, T, C, H, r, k, v, w=self.time_decay, u=self.time_faaaa)\n return self.jit_func_2(x, g)\n########################################################################################################\n# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\nclass RWKV_TimeMix(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.ctx_len = args.ctx_len\n self.n_embd = args.n_embd\n with torch.no_grad(): # fancy init\n ratio_0_to_1 = layer_id / (args.n_layer - 1) # 0 to 1\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):" + }, + { + "comment": "This code is initializing parameters for a time-based transformer model layer. It sets the decay rate, first value in time series, and mixing coefficients using fancy techniques to achieve more complexity in the attention mechanism. 
The time shift operation pads the input, and a linear transformation is applied to create the key values.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":472-491", + "content": " ddd[0, 0, i] = i / args.n_embd\n # fancy time_decay\n decay_speed = torch.ones(args.dim_att)\n for h in range(args.dim_att):\n decay_speed[h] = -5 + 8 * (h / (args.dim_att - 1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n # fancy time_first\n zigzag = torch.tensor([(i + 1) % 3 - 1 for i in range(args.dim_att)]) * 0.5\n self.time_first = nn.Parameter(torch.ones(args.dim_att) * math.log(0.3) + zigzag)\n # fancy time_mix\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(ddd, 0.5 * ratio_1_to_almost0))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.key = nn.Linear(args.n_embd, args.dim_att, bias=False)" + }, + { + "comment": "This code initializes layers for a transformer model and includes optional testing-specific buffers and parameters. 
The 'MyFunction' decorator suggests additional functionality is defined elsewhere.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":492-509", + "content": " self.value = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.receptance = nn.Linear(args.n_embd, args.dim_att, bias=False)\n self.output = nn.Linear(args.dim_att, args.n_embd, bias=False)\n if 'a' in os.environ[\"RWKV_MY_TESTING\"]:\n self.register_buffer(\"att_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n d_qkv = args.n_embd // 16\n self.qq = nn.Linear(args.n_embd, d_qkv, bias=False)\n self.kk = nn.Linear(args.n_embd, d_qkv, bias=False)\n self.vv = nn.Linear(args.n_embd, d_qkv, bias=False)\n self.oo = nn.Linear(d_qkv, args.n_embd, bias=False)\n with torch.no_grad():\n self.time_mix_qq = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_kk = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_vv = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n if 'a' not in os.environ[\"RWKV_MY_TESTING\"]:\n @MyFunction" + }, + { + "comment": "This function defines jit_func, which mixes the current timestep with the previous one for x, k, and v. Then forward function calculates RWKV using these variables and applies softmax to attentions. 
QKV function computes attention scores between query (q) and key (k), masking zero-valued elements and applying softmax along dim=-1.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":510-532", + "content": " def jit_func(self, x):\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n sr = torch.sigmoid(r)\n return sr, k, v\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n sr, k, v = self.jit_func(x)\n rwkv = sr * RUN_CUDA(B, T, self.args.dim_att, self.time_decay, self.time_first, k, v)\n return self.output(rwkv)\n if 'a' in os.environ[\"RWKV_MY_TESTING\"]:\n @MyFunction\n def QKV(self, q, k, v):\n att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))\n att = att.masked_fill(self.att_mask == 0, float('-inf'))\n att = F.softmax(att, dim = -1)" + }, + { + "comment": "The code defines a function `jit_funcQKV` that takes an input tensor `x`, applies time shifting and mixing, then extracts key, value, receptance, qq, kk, vv from the result. The forward function reshapes input tensor `x` to (Batch, Time, Channel) and calls `jit_funcQKV` to compute sr, k, v, qq, kk, vv. 
It then uses these outputs with another function `RUN_CUDA` to calculate the final output rwkv.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":533-557", + "content": " x = att @ v\n return x\n @MyFunction\n def jit_funcQKV(self, x):\n xx = self.time_shift(x) # Mix x with the previous timestep to produce xk, xv, xr\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n xqq = x * self.time_mix_qq + xx * (1 - self.time_mix_qq)\n xkk = x * self.time_mix_kk + xx * (1 - self.time_mix_kk)\n xvv = x * self.time_mix_vv + xx * (1 - self.time_mix_vv)\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n sr = torch.sigmoid(r)\n qq = self.qq(xqq)\n kk = self.kk(xkk)\n vv = self.vv(xvv)\n return sr, k, v, qq, kk, vv\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n sr, k, v, qq, kk, vv = self.jit_funcQKV(x)\n rwkv = sr * RUN_CUDA(B, T, self.args.dim_att, self.time_decay, self.time_first, k, v)" + }, + { + "comment": "The code defines a class `RWKV_ChannelMix` that extends the `MyModule` class. 
It initializes the layer with parameters based on the input arguments and layer ID, and includes time mixing and linear layers for key, receptance, and value operations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":558-580", + "content": " rwkv = self.output(rwkv) + self.oo(self.QKV(qq, kk, vv))\n return rwkv\n########################################################################################################\nclass RWKV_ChannelMix(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer) # 1 to ~0\n ddd = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n ddd[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(ddd, ratio_1_to_almost0))\n self.key = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.receptance = nn.Linear(args.n_embd, args.n_embd, bias=False)\n self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)" + }, + { + "comment": "The code defines a class \"MishGLU\" that inherits from \"MyModule\". It initializes the object with parameters and layers. The class contains a \"forward\" method which performs time shifting, mixing, and calculations using various linear layers and functions. It uses parameters that depend on the layer ID and number of layers for time-based mixing. 
The output is obtained through a combination of sigmoid and value calculation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":582-609", + "content": " @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.relu(k) ** 2\n kv = self.value(k)\n return torch.sigmoid(self.receptance(xr)) * kv\nclass MishGLU(MyModule):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad():\n ratio_1_to_almost0 = 1.0 - (layer_id / args.n_layer)\n x = torch.ones(1, 1, args.n_embd)\n for i in range(args.n_embd):\n x[0, 0, i] = i / args.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.aa = nn.Linear(args.n_embd, args.dim_ffn, bias=False)\n self.bb = nn.Linear(args.n_embd, args.dim_ffn, bias=False)" + }, + { + "comment": "The code defines a model with a block class. It contains linear layers, layer normalization, and Mish activation function. The block has different layer norms for each stage, with the first layer having an additional normalization layer. 
There is also a parameter for positional embedding if enabled in the arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":610-638", + "content": " self.value = nn.Linear(args.dim_ffn, args.n_embd, bias=False)\n @MyFunction\n def forward(self, x):\n xx = self.time_shift(x)\n xa = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xb = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n a = self.aa(xa)\n b = self.bb(xb)\n return self.value(a * F.mish(b))\n########################################################################################################\n# The RWKV Model with our blocks\n########################################################################################################\nclass Block(nn.Module):\n def __init__(self, args, layer_id):\n super().__init__()\n self.args = args\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(args.n_embd)\n self.ln2 = nn.LayerNorm(args.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(args.n_embd)\n if args.my_pos_emb > 0:\n self.pos_emb_x = nn.Parameter(torch.zeros((1,args.my_pos_emb,args.n_embd)))" + }, + { + "comment": "The code initializes the model's parameters, creates layers for position embedding and various attention mechanisms based on environment variables, and includes a MishGLU or RWKV_ChannelMix layer depending on the environment variable. If 'g' is in the environment variable, it uses a different ffn (fully connected feedforward) layer. 
Additionally, if args.tiny\\_att\\_dim is greater than 0 and the current layer matches args.tiny\\_att\\_layer, it adds specific layers for tiny attention with LayerNorm and linear layers for query, key, and value.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":639-660", + "content": " self.pos_emb_y = nn.Parameter(torch.zeros((args.my_pos_emb,1,args.n_embd)))\n if self.layer_id == 0 and self.args.pre_ffn > 0:\n self.ffnPre = RWKV_ChannelMix(args, 0)\n else:\n if 'r4' in os.environ[\"RWKV_MY_TESTING\"]:\n self.att = RWKV_TimeMix_RWKV5(args, layer_id)\n elif 'r' in os.environ[\"RWKV_MY_TESTING\"]:\n self.att = RWKV_TimeMix_RWKV5_Preview(args, layer_id)\n else:\n self.att = RWKV_TimeMix(args, layer_id)\n if 'g' in os.environ[\"RWKV_MY_TESTING\"]:\n self.ffn = MishGLU(args, layer_id)\n else:\n self.ffn = RWKV_ChannelMix(args, layer_id)\n if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n self.tiny_ln = nn.LayerNorm(args.n_embd)\n self.tiny_q = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_k = nn.Linear(args.n_embd, args.tiny_att_dim, bias=False)\n self.tiny_v = nn.Linear(args.n_embd, args.n_embd, bias=False)" + }, + { + "comment": "The code defines a forward function for a model and includes dropout layers, LayerNorm (lnX), attention layer (att), feed-forward network (ffnPre and ffn), position embedding (pos_emb_x and pos_emb_y), and a triangular mask (tiny_mask). 
It applies the layers based on input parameters like dropout rate, pre-ffn flag, and layer index.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":661-687", + "content": " self.register_buffer(\"tiny_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)\n self.drop1 = nn.Dropout(p = args.dropout)\n def forward(self, x, x_emb=None):\n args = self.args\n B, T, C = x.size()\n if self.layer_id == 0:\n x = self.ln0(x)\n if args.my_pos_emb > 0:\n pos_emb = (self.pos_emb_x + self.pos_emb_y).reshape(T+1, -1)[:-1,:]\n x = x + pos_emb\n if self.args.dropout == 0:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = x + self.ffnPre(self.ln1(x))\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n else:\n if self.layer_id == 0 and args.pre_ffn > 0:\n x = self.drop0(x + self.ffnPre(self.ln1(x)))\n else:\n x = self.drop0(x + self.att(self.ln1(x)))\n x = self.drop1(x + self.ffn(self.ln2(x)))" + }, + { + "comment": "The code is implementing an attention mechanism in the RWKV model. It checks if the layer id matches a specific layer and if a smaller attention dimension is desired. If both conditions are met, it applies the scaled dot product attention, masks out padding tokens, and adds the result to the input. 
The L2Wrap function is used to implement an L2 regularization term in the loss computation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":689-720", + "content": " if args.tiny_att_dim > 0 and self.layer_id == args.tiny_att_layer:\n xx = self.tiny_ln(x)\n q = self.tiny_q(xx)[:, :T, :]\n k = self.tiny_k(xx)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (args.tiny_att_dim ** (-0.5))\n c = c.masked_fill(self.tiny_mask[:T, :T] == 0, 0)\n x = x + c @ self.tiny_v(x_emb)\n return x\nclass L2Wrap(torch.autograd.Function):\n @staticmethod\n def forward(ctx, loss, y):\n ctx.save_for_backward(y)\n return loss\n @staticmethod\n def backward(ctx, grad_output):\n y = ctx.saved_tensors[0]\n # to encourage the logits to be close to 0\n factor = 1e-4 / (y.shape[0] * y.shape[1])\n maxx, ids = torch.max(y, -1, keepdim=True)\n gy = torch.zeros_like(y)\n gy.scatter_(-1, ids, maxx * factor)\n return (grad_output, gy)\nclass RWKV(pl.LightningModule):\n def __init__(self, args):\n super().__init__()\n self.args = args\n if not hasattr(args, 'dim_att'):" + }, + { + "comment": "This code initializes a transformer model by setting attributes based on argument values, creating embedding and linear layers, and organizing blocks within the model. Assertions ensure that certain dimensions are multiples of 32 for optimization purposes. If certain attributes aren't set, the code assigns default values. 
Additionally, if dropout or key/query projection is enabled, corresponding layers are created.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":721-744", + "content": " args.dim_att = args.n_embd\n if not hasattr(args, 'dim_ffn'):\n args.dim_ffn = args.n_embd * 4\n if not hasattr(args, 'tiny_att_layer'):\n args.tiny_att_layer = -1\n if not hasattr(args, 'tiny_att_dim'):\n args.tiny_att_dim = -1\n assert args.n_embd % 32 == 0\n assert args.dim_att % 32 == 0\n assert args.dim_ffn % 32 == 0\n self.emb = nn.Embedding(args.vocab_size, args.n_embd)\n self.blocks = nn.ModuleList([Block(args, i) for i in range(args.n_layer)])\n self.ln_out = nn.LayerNorm(args.n_embd)\n self.head = nn.Linear(args.n_embd, args.vocab_size, bias=False)\n if args.head_qk > 0:\n self.head_q = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.head_k = nn.Linear(args.n_embd, args.head_qk, bias=False)\n self.register_buffer(\"copy_mask\", torch.tril(torch.ones(args.ctx_len, args.ctx_len)))\n if args.dropout > 0:\n self.drop0 = nn.Dropout(p = args.dropout)" + }, + { + "comment": "This function sets up optimizers based on parameter names and provided arguments. 
It categorizes parameters into different learning rate groups: 1x, 2x, or 3x multipliers for time-related layers; decay rate for weight decay; and all others with a single learning rate.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":746-774", + "content": " def configure_optimizers(self):\n args = self.args\n lr_decay = set()\n lr_1x = set()\n lr_2x = set()\n lr_3x = set()\n for n, p in self.named_parameters():\n if (\"time_mix\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_decay\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_3x.add(n)\n else:\n lr_2x.add(n)\n elif (\"time_faaaa\" in n) and (args.layerwise_lr > 0):\n if args.my_pile_stage == 2:\n lr_2x.add(n)\n else:\n lr_1x.add(n)\n elif (\"time_first\" in n) and (args.layerwise_lr > 0):\n lr_3x.add(n)\n elif (len(p.squeeze().shape) >= 2) and (args.weight_decay > 0):\n lr_decay.add(n)\n else:\n lr_1x.add(n)" + }, + { + "comment": "This code is initializing optimizer groups for layerwise learning rate (LR) in a neural network model. It sorts the LR values and creates separate optimizer groups for each level of scaling, assigning different layers to each group based on their corresponding scaling factor. The layer-specific scaling is applied when args.my_pile_stage == 2, otherwise, it uses a different set of scales. 
This allows for more fine-grained control over the learning rates for different layers during training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":776-796", + "content": " lr_decay = sorted(list(lr_decay))\n lr_1x = sorted(list(lr_1x))\n lr_2x = sorted(list(lr_2x))\n lr_3x = sorted(list(lr_3x))\n # print('decay', lr_decay)\n # print('1x', lr_1x)\n # print('2x', lr_2x)\n # print('3x', lr_3x)\n param_dict = {n: p for n, p in self.named_parameters()}\n if args.layerwise_lr > 0:\n if args.my_pile_stage == 2:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 2e-3 / args.lr_init},\n {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 5.0},# test: 3e-3 / args.lr_init},\n ]\n else:\n optim_groups = [\n {\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0},\n {\"params\": [param_dict[n] for n in lr_2x], \"weight_decay\": 0.0, \"my_lr_scale\": 2.0}," + }, + { + "comment": "This code initializes optimizer groups for model parameters based on learning rates and weight decay. If weight decay is provided, it adds separate optimization group with the specified weight decay. 
Depending on Deepspeed offload flag, it returns either DeepSpeedCPUAdam or FusedAdam optimizer instances.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":797-809", + "content": " {\"params\": [param_dict[n] for n in lr_3x], \"weight_decay\": 0.0, \"my_lr_scale\": 3.0},\n ]\n else:\n optim_groups = [{\"params\": [param_dict[n] for n in lr_1x], \"weight_decay\": 0.0, \"my_lr_scale\": 1.0}]\n if args.weight_decay > 0:\n optim_groups += [{\"params\": [param_dict[n] for n in lr_decay], \"weight_decay\": args.weight_decay, \"my_lr_scale\": 1.0}]\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=True, amsgrad=False)\n return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=True, amsgrad=False)\n else:\n if self.deepspeed_offload:\n return DeepSpeedCPUAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adamw_mode=False, weight_decay=0, amsgrad=False)" + }, + { + "comment": "This code snippet defines a model that uses either FusedAdam or ZeroOneAdam optimizer based on the deepspeed_offload property. The function forward takes an index and processes it through embedding, potentially dropout, and then passes it to multiple blocks if grad_cp is 1. 
The deepspeed_offload property checks a DeepSpeedStrategy configuration for offloading settings.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":810-833", + "content": " return FusedAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)\n # return ZeroOneAdam(optim_groups, lr=self.args.lr_init, betas=self.args.betas, eps=self.args.adam_eps, bias_correction=True, weight_decay=0, amsgrad=False, cuda_aware=False)\n @property\n def deepspeed_offload(self) -> bool:\n strategy = self.trainer.strategy\n if isinstance(strategy, DeepSpeedStrategy):\n cfg = strategy.config[\"zero_optimization\"]\n return cfg.get(\"offload_optimizer\") or cfg.get(\"offload_param\")\n return False\n def forward(self, idx):\n args = self.args\n B, T = idx.size()\n assert T <= args.ctx_len, \"Cannot forward, model ctx_len is exhausted.\"\n x = self.emb(idx)\n x_emb = x\n if args.dropout > 0:\n x = self.drop0(x)\n if args.tiny_att_dim > 0:\n for block in self.blocks:\n if args.grad_cp == 1:" + }, + { + "comment": "This code is responsible for handling the checkpointing and layer execution within a specific block in the RWKV model. 
It also handles head attention calculations and applies one-hot encoding based on the floating-point mode environment variable.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":834-859", + "content": " x = deepspeed.checkpointing.checkpoint(block, x, x_emb)\n else:\n x = block(x, x_emb)\n else:\n for block in self.blocks:\n if args.grad_cp == 1:\n x = deepspeed.checkpointing.checkpoint(block, x)\n else:\n x = block(x)\n x = self.ln_out(x)\n if args.head_qk > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / args.head_qk)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if \"32\" in os.environ[\"RWKV_FLOAT_MODE\"]:\n c = c @ F.one_hot(idx, num_classes=args.vocab_size)\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n c = c @ F.one_hot(idx, num_classes=args.vocab_size).bfloat16()\n x = self.head(x) + c" + }, + { + "comment": "This code contains two methods: '__call__' and 'training_step'. The '__call__' method performs a forward pass through the model by calling the 'head' method if not in training mode, otherwise it returns the input 'x' as-is. In the 'training_step' method, it checks the argument 'args.my_qa_mask' to determine whether to perform training or not. If 'args.my_qa_mask != 1', it calculates and returns a loss using cross entropy loss function. 
If 'args.my_qa_mask == 1', it applies a mask to the targets during training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":860-888", + "content": " else:\n x = self.head(x)\n return x\n def training_step(self, batch, batch_idx):\n args = self.args\n if args.my_qa_mask != 1:\n idx, targets = batch\n logits = self(idx)\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # if '0' in os.environ[\"RWKV_MY_TESTING\"]:\n # print('logits', logits)\n # torch.set_printoptions(threshold=10000)\n # print('idx', idx)\n # exit(0)\n else:\n idx, targets, mask = batch\n mask = mask.view(-1)\n sum_mask = torch.sum(mask).item()\n # if sum_mask == 0:\n # return torch.tensor([0.0], requires_grad=True)\n logits = self(idx)\n if sum_mask == mask.shape[0]:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))\n # print('rank', self.global_rank, 'loss', loss.item())\n else:\n loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), reduction='none')" + }, + { + "comment": "This code is defining the model, loss calculation, and training step functions for a neural network. It calculates the loss based on a masked sum of the loss_raw values, and then performs an all-gather operation to collect the losses from all ranks before proceeding to the next steps. The output is wrapped with L2 norm using the apply function. 
The generate_init_weight function prints some information at the start of training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":889-915", + "content": " # loss_raw = loss\n loss = torch.sum(loss * mask) / sum_mask\n # torch.set_printoptions(threshold=10000)\n # if True: #self.global_rank == 1:\n # tmp = ''\n # sss = 0\n # ccc = 0\n # for i in range(mask.shape[0]):\n # if mask[i] > 0:\n # tmp += str(idx.view(-1)[i].item()) + ','\n # sss += loss_raw.view(-1)[i].float().item()\n # ccc += 1\n # print('rank', self.global_rank, 'loss', loss.item(), 'lavg', sss / ccc)#, 'tmp', tmp, 'input', idx)\n return L2Wrap.apply(loss, logits)\n def training_step_end(self, batch_parts):\n if pl.__version__[0]!='2':\n all = self.all_gather(batch_parts)\n if self.trainer.is_global_zero:\n self.trainer.my_loss_all = all\n def generate_init_weight(self):\n print(\n f\"\"\"\n############################################################################" + }, + { + "comment": "Initializing model weights for large models is slow, so the code handles this process by iterating over each item in self.state_dict(). For certain specific named parameters (like 'ln_x.weight', '.att.output.', etc.), it sets values accordingly. Otherwise, it simply copies the original values. The lr_init is used to set scale and gain based on the shape of the parameter. If the environment variable RWKV_MY_TESTING contains 'r', certain parameters are excluded from initialization.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":916-942", + "content": "#\n# Init model weight (slow for large models)...\n#\n############################################################################\n\"\"\"\n )\n m = {}\n for n in self.state_dict():\n p = self.state_dict()[n]\n shape = p.shape\n gain = 1.0\n scale = 1.0\n if \"ln_\" in n or \".ln\" in n or \"time_\" in n or \"_mask\" in n or \"pos_emb\" in n or '.mask.' 
in n:\n if 'ln_x.weight' in n:\n layer_scale = (1+int(n.split('.')[1])) / self.args.n_layer\n m[n] = (p * 0.0) + (layer_scale ** 0.7)\n else:\n m[n] = p\n else:\n if n == \"emb.weight\":\n scale = -1 * self.args.lr_init\n else:\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if 'r' in os.environ[\"RWKV_MY_TESTING\"]:\n zero = [\".att.output.\", \".ffn.value.\", \".ffn.receptance.\", \".ffnPre.value.\", \".ffnPre.receptance.\", \"head_q.\", '.oo.', '.rr.']" + }, + { + "comment": "This code checks the variable 'n' to determine its scale factor. If 'n' matches a specific set of keys, it sets the scale to 0 (zero initialization), 0.5, or 0.1. It then creates an empty tensor based on GPU accelerator settings and initializes it with zero or uniform values depending on the scale.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":943-965", + "content": " else:\n zero = [\".att.key.\", \".att.receptance.\", \".att.output.\", \".ffn.value.\", \".ffn.receptance.\", \".ffnPre.value.\", \".ffnPre.receptance.\", \"head_q.\", '.oo.', '.rr.']\n for kk in zero:\n if kk in n:\n scale = 0\n if n == \"head.weight\":\n scale = 0.5\n if \"head_k.\" in n:\n scale = 0.1\n if \"head_q.\" in n:\n scale = 0\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {str(scale).ljust(4)} {n}\")\n if self.args.accelerator.upper() == \"GPU\":\n m[n] = torch.empty((shape[0], shape[1]), device=\"cuda\")\n else:\n m[n] = torch.empty((shape[0], shape[1]))\n if scale == 0:\n nn.init.zeros_(m[n])\n elif scale < 0:\n nn.init.uniform_(m[n], a=scale, b=-scale)" + }, + { + "comment": "This code initializes the model's weights using orthogonal initialization with a specified gain and scale. It then moves the weights to CPU and converts them based on the RWKV_FLOAT_MODE environment variable. 
Finally, it collects garbage and empties the CUDA cache before returning the updated model parameters.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/model.py\":966-980", + "content": " else:\n nn.init.orthogonal_(m[n], gain=gain * scale)\n m[n] = m[n].cpu()\n if os.environ[\"RWKV_FLOAT_MODE\"] == \"fp16\":\n m[n] = m[n].half()\n elif os.environ[\"RWKV_FLOAT_MODE\"] == \"bf16\":\n m[n] = m[n].bfloat16()\n # if n == \"emb.weight\":\n # print(m[n])\n gc.collect()\n torch.cuda.empty_cache()\n return m" + } + ] +} \ No newline at end of file diff --git a/docs/doc/dc2d49b5-fb90-4754-bc11-f2e2eb0d86ef.json b/docs/doc/dc2d49b5-fb90-4754-bc11-f2e2eb0d86ef.json new file mode 100644 index 00000000..30147375 --- /dev/null +++ b/docs/doc/dc2d49b5-fb90-4754-bc11-f2e2eb0d86ef.json @@ -0,0 +1,25 @@ +{ + "summary": "The code sets up an RWKV model, tokenizes prompts, and handles unknown characters using rarest tokens from vocab.json. It saves/loads the model, generates text with logit sampling, updates context, measures time per iteration, and provides debug info.", + "details": [ + { + "comment": "Code imports necessary libraries, sets up configurations for the RWKV language model, and defines variables to control its behavior. It also ensures that tensor computations will utilize TF32 for faster performance if possible. The code specifies a context length (ctx_len), number of layers (n_layer), and embeddings size (n_embd). The type of model is determined by the value of model_type, either 'RWKV' or 'RWKV-ffnPre'. The trained model name and vocabulary file are also defined for later use. 
Unknown characters in input are set to be the rarest token from the vocab.json file.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/run.py\":0-29", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math\nimport time\nimport types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nfrom src.utils import TOKENIZER, Dataset\nfrom src.model_run import RWKV_RNN\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\n### Step 1: set model ##################################################################################\nctx_len = 1024\nn_layer = 6\nn_embd = 512\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'\n# your trained model\nMODEL_NAME = 'trained-1'\nWORD_NAME = 'vocab' # the .json vocab (generated by train.py\n# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <--" + }, + { + "comment": "Loading the RWKV model with specified configuration and tokenizing the prompt for further processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/run.py\":30-57", + "content": "# --> all unknown tokens in your context will be denoted by it <--\nUNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity\nRUN_DEVICE = 'cpu' # 'cpu' (already very fast) or 'cuda'\nDEBUG_DEBUG = False # True False - show softmax output\n### Step 2: set context ################################################################################\ncontext = \"\\nIn the\" # ==> this is your prompt\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 500\nTEMPERATURE = 1.0\ntop_p = 0.7\ntop_p_newline = 
0.9\n########################################################################################################\nprint(f'Loading {MODEL_NAME}...')\nmodel = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len)\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\n########################################################################################################\ncontext = tokenizer.refine_context(context)\nprint('\\nYour prompt has ' + str(len(context)) + ' tokens.')\nprint('\\n--> Currently the firs" + }, + { + "comment": "This code is preparing a model for processing a given context. It initializes the model, clears its state, and then runs it on the input context to generate an output. The model is saved after initialization and loaded when needed. The process repeats for each trial length specified, building upon the previous hidden state. The debug mode prints additional information for troubleshooting.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/run.py\":57-88", + "content": "t run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. 
<--\\n')\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n t_begin = time.time_ns()\n src_len = len(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\n print(('-' * 30) + context, end='')\n model.clear()\n if TRIAL == 0:\n init_state = types.SimpleNamespace()\n for i in range(src_len):\n x = ctx[:i+1]\n if i == src_len - 1:\n init_state.out = model.run(x)\n else:\n model.run(x)\n model.save(init_state)\n else:\n model.load(init_state)\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[:i+1]\n x = x[-ctx_len:]\n if i == src_len:\n out = copy.deepcopy(init_state.out)\n else:\n out = model.run(x)\n if DEBUG_DEBUG:\n print('model', np.array(x), '==>', np.array(" + }, + { + "comment": "This code generates text by sampling logits from a tokenizer, then prints the corresponding character and updates the context. It also measures the time taken for each iteration and outputs it in seconds.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v3/run.py\":89-97", + "content": " out), np.max(out), np.min(out))\n char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,\n top_p_usual=top_p, top_p_newline=top_p_newline)\n char = char.item()\n print(tokenizer.itos[int(char)], end='', flush=True)\n ctx += [char]\n t_end = time.time_ns()\n print(\"\\n----------\", round((t_end - t_begin) / (10 ** 9), 2), end='s ')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e1e4d547-1204-4e38-86a9-0c0300ca243f.json b/docs/doc/e1e4d547-1204-4e38-86a9-0c0300ca243f.json new file mode 100644 index 00000000..5898016a --- /dev/null +++ b/docs/doc/e1e4d547-1204-4e38-86a9-0c0300ca243f.json @@ -0,0 +1,35 @@ +{ + "summary": "This CUDA code optimizes neural network forward pass with shared memory, efficient matrix operations, and parallel computation. It performs convolution using kernel functions, shared memory, synchronization, and unrolled loops. 
Assertions ensure efficient GPU computation.", + "details": [ + { + "comment": "Code implements a CUDA kernel for the forward pass of a neural network layer, where each thread calculates output values based on input data and pre-stored parameters. It uses shared memory to store intermediate results and synchronizes threads with `__syncthreads()`. The loop iterates over time steps, applying element-wise operations to calculate output values.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_cuda.cu\":0-35", + "content": "#include \n#include \n#include \"ATen/ATen.h\"\ntypedef at::BFloat16 bf16;\ntemplate \n__global__ void kernel_forward(const int B, const int T, const int C, const int H,\n const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const F *__restrict__ _u,\n F *__restrict__ const _y)\n{\n const int b = blockIdx.x / H;\n const int h = blockIdx.x % H;\n const int i = threadIdx.x;\n _w += h*_N_;\n _u += h*_N_;\n __shared__ float r[_N_], k[_N_], u[_N_], w[_N_];\n float state[_N_] = {0};\n __syncthreads();\n w[i] = _w[i];\n u[i] = float(_u[i]);\n __syncthreads();\n for (int t = b*T*C + h*_N_ + i; t < (b+1)*T*C + h*_N_ + i; t += C)\n {\n __syncthreads();\n r[i] = float(_r[t]);\n k[i] = float(_k[t]);\n __syncthreads();\n const float v = float(_v[t]);\n float y = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j+=4)" + }, + { + "comment": "This code is performing a matrix multiplication operation using CUDA. It takes in four input matrices, calculates the dot product between two sets of vectors, and updates the state vector accordingly. 
The result is then passed to a function F for further processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_cuda.cu\":36-65", + "content": " {\n const float4& r_ = (float4&)(r[j]);\n const float4& k_ = (float4&)(k[j]);\n const float4& w_ = (float4&)(w[j]);\n const float4& u_ = (float4&)(u[j]);\n float4& s = (float4&)(state[j]);\n float4 x;\n x.x = k_.x * v;\n x.y = k_.y * v;\n x.z = k_.z * v;\n x.w = k_.w * v;\n y += r_.x * (u_.x * x.x + s.x);\n y += r_.y * (u_.y * x.y + s.y);\n y += r_.z * (u_.z * x.z + s.z);\n y += r_.w * (u_.w * x.w + s.w);\n s.x = s.x * w_.x + x.x;\n s.y = s.y * w_.y + x.y;\n s.z = s.z * w_.z + x.z;\n s.w = s.w * w_.w + x.w;\n }\n _y[t] = F(y);\n }\n}\ntemplate \n__global__ void kernel_backward(const int B, const int T, const int C, const int H,\n const F *__restrict__ const _r, const F *__restrict__ const _k, const F *__restrict__ const _v, const float *__restrict__ _w, const float *__restrict__ __w, const F *__restrict__ _u, const F *__restrict__ const _gy," + }, + { + "comment": "This function calculates the recurrent weight updates in a neural network using CUDA. It uses shared memory for efficient parallel computation and synchronizes threads with `__syncthreads()`. The variables `w`, `u` represent input and output tensors, while `v` and `gy` store intermediate results. 
The loop iterates over the time dimension (T) and channel dimension (C).", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_cuda.cu\":66-103", + "content": " F *__restrict__ const _gr, F *__restrict__ const _gk, F *__restrict__ const _gv, F *__restrict__ const _gw, F *__restrict__ const _gu)\n{\n const int b = blockIdx.x / H;\n const int h = blockIdx.x % H;\n const int i = threadIdx.x;\n _w += h*_N_;\n _u += h*_N_;\n __w += h*_N_;\n __shared__ float w_[_N_], u_[_N_];\n __shared__ float r[_N_], k[_N_], v[_N_], gy[_N_];\n __syncthreads();\n w_[i] = _w[i];\n u_[i] = float(_u[i]);\n __syncthreads();\n const float w = w_[i];\n const float ww = __w[i];\n const float u = u_[i];\n float state[_N_] = {0}, saaaa[_N_] = {0}, sbbbb[_N_] = {0}, scccc[_N_] = {0}, sdddd[_N_] = {0};\n float gw = 0, gu = 0;\n const int t000 = b*T*C + h*_N_ + i;\n const int t111 = (b+1)*T*C + h*_N_ + i;\n const int t222 = t111 - 2*C;\n for (int t = t000; t < t111; t += C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t]);\n __syncthreads();\n const float k = float(_k[t]);\n float gr = 0, gu_ = 0;\n #pragma unroll" + }, + { + "comment": "This code appears to be performing element-wise operations on arrays, such as adding and multiplying elements. It uses a for loop to iterate through the array elements, with each iteration updating the state variables and accumulating results for later use. The code also includes synchronization points (__syncthreads) for thread coordination, and it utilizes arrays with names like _gr, _gu, and _gw for storing intermediate and final results. 
Overall, this code seems to be part of a larger computation involving array manipulation and synchronization.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_cuda.cu\":104-148", + "content": " for (int j = 0; j < _N_; j++)\n {\n float& s = state[j];\n float x = k * v[j];\n gr += (u * x + s) * gy[j];\n gu_ += x * gy[j];\n s = s * w + x;\n }\n _gr[t] = F(gr);\n gu += float(_r[t]) * gu_;\n }\n _gu[b*C + h*_N_ + i] = F(gu);\n for (int t = t000; t < t222; t += C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t + 2*C]);\n __syncthreads();\n const float k = float(_k[t]);\n float gw_ = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = saaaa[j];\n float& s2 = sbbbb[j];\n float x = k * v[j];\n float tmp = w * (x + s);\n s = tmp;\n s2 = tmp + w * s2;\n gw_ += s2 * gy[j];\n }\n gw += float(_r[t + 2*C]) * gw_;\n } \n _gw[b*C + h*_N_ + i] = F(ww * gw);\n for (int t = t111 - C; t >= t000; t -= C)\n {\n __syncthreads();\n v[i] = float(_v[t]);\n gy[i] = float(_gy[t]);" + }, + { + "comment": "The code is performing a convolution operation in CUDA. It first executes a kernel function that processes the data in blocks and threads, then calls another kernel function to compute the final results. The code uses shared memory, synchronization, and unrolling of loops for efficient computation. 
The `cuda_forward` function initializes the necessary variables and launches the kernel functions with appropriate dimensions.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_cuda.cu\":149-196", + "content": " __syncthreads();\n const float rr = float(_r[t]);\n float gk = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = scccc[j];\n float x = rr * gy[j];\n gk += (u * x + s) * v[j];\n s = x + s * w;\n }\n _gk[t] = F(gk);\n }\n for (int t = t111 - C; t >= t000; t -= C)\n {\n __syncthreads();\n r[i] = float(_r[t]);\n k[i] = float(_k[t]);\n __syncthreads();\n const float gyy = float(_gy[t]);\n float gv = 0;\n #pragma unroll\n for (int j = 0; j < _N_; j++)\n {\n float& s = sdddd[j];\n float x = gyy * r[j];\n gv += (u_[j] * x + s) * k[j];\n s = x + s * w_[j];\n }\n _gv[t] = F(gv);\n }\n}\nvoid cuda_forward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, bf16 *u, bf16 *y)\n{\n assert(H*_N_ == C);\n assert(_N_%4 == 0);\n kernel_forward<< >>(B, T, C, H, r, k, v, w, u, y);\n}\nvo" + }, + { + "comment": "Function `cuda_backward` is a CUDA kernel launcher, taking input/output parameters and invoking the `kernel_backward` kernel with appropriate grid and block dimensions. 
The assertions ensure correct memory layouts and alignment for efficient GPU computation.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/cuda/wkv5_cuda.cu\":196-201", + "content": "id cuda_backward(int B, int T, int C, int H, bf16 *r, bf16 *k, bf16 *v, float *w, float *ww, bf16 *u, bf16 *gy, bf16 *gr, bf16 *gk, bf16 *gv, bf16 *gw, bf16 *gu)\n{\n assert(H*_N_ == C);\n assert(_N_%4 == 0);\n kernel_backward<< >>(B, T, C, H, r, k, v, w, ww, u, gy, gr, gk, gv, gw, gu);\n}" + } + ] +} \ No newline at end of file diff --git a/docs/doc/e72a1d43-6375-4c30-95f5-6ad99ee7d872.json b/docs/doc/e72a1d43-6375-4c30-95f5-6ad99ee7d872.json new file mode 100644 index 00000000..d49ea02f --- /dev/null +++ b/docs/doc/e72a1d43-6375-4c30-95f5-6ad99ee7d872.json @@ -0,0 +1,40 @@ +{ + "summary": "The code defines a class \"MMapIndexedDataset\" with indexing methods and binary indexing functions for datasets, loading binary data from file paths using numpy's memmap. It supports setting document index and checking file existence.", + "details": [ + { + "comment": "This code is importing necessary libraries and defining functions for indexed datasets. It defines a class \"MMapIndexedDataset\" with an inner class \"Index\". The file includes utility functions like _warmup_mmap_file, print_rank_0, index_file_path, data_file_path, and code which handle reading and manipulating binary data from files. 
It also defines dtypes dictionary mapping numerical types to their respective codes.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/binidx.py\":0-47", + "content": "from lib2to3.pgen2 import token\nimport os\nimport torch\nimport numpy as np\nimport shutil\nimport struct\nfrom functools import lru_cache\nfrom itertools import accumulate\ndef print_rank_0(*message):\n \"\"\"If distributed is initialized print only on rank 0.\"\"\"\n if torch.distributed.is_initialized():\n if torch.distributed.get_rank() == 0:\n print(*message, flush=True)\n else:\n print(*message, flush=True)\ndef _warmup_mmap_file(path):\n pass\n # with open(path, \"rb\") as stream:\n # while stream.read(100 * 1024 * 1024):\n # pass\ndtypes = {\n 1: np.uint8,\n 2: np.int8,\n 3: np.int16,\n 4: np.int32,\n 5: np.int64,\n 6: float,\n 7: np.double,\n 8: np.uint16,\n}\ndef code(dtype):\n for k in dtypes.keys():\n if dtypes[k] == dtype:\n return k\n raise ValueError(dtype)\ndef index_file_path(prefix_path):\n return prefix_path + \".idx\"\ndef data_file_path(prefix_path):\n return prefix_path + \".bin\"\nclass MMapIndexedDataset(torch.utils.data.Dataset):\n class Index(object):" + }, + { + "comment": "This code is initializing a class that reads an index file, checking if it matches the expected format, and storing relevant information such as dtype, dtype size, length, and document count. The skip_warmup parameter allows for optional warming up of the index file.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/binidx.py\":48-71", + "content": " _HDR_MAGIC = b\"MMIDIDX\\x00\\x00\"\n def __init__(self, path, skip_warmup=False):\n with open(path, \"rb\") as stream:\n magic_test = stream.read(9)\n assert self._HDR_MAGIC == magic_test, (\n \"Index file doesn't match expected format. 
\"\n \"Make sure that --dataset-impl is configured properly.\"\n )\n # Little endian unsigned 64 Bit integer\n version = struct.unpack(\" 0:\n # self.data_pile = MMapIndexedDataset('/fsx/pile/pile_20B_tokenizer_text_document')\n self.data_pile = MMapIndexedDataset('/fsx/pile_deduped/pile_0.87_deduped_text_document')\n self.data_pile_size = len(self.data_pile._bin_buffer) // self.data._index._dtype_size" + }, + { + "comment": "The code is defining a dataset class. If data_size and vocab_size are not provided, the data pile is set to None. If my_pile_stage is greater than 0, it asserts that the data size is correct, calculates samples per epoch based on epoch_steps and real_bsz, checks if magic_prime is prime and within certain range, or loads data from file if data type is numpy. The code also provides information about current vocab size for the user.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":42-59", + "content": " else:\n self.data_pile = None\n self.data_pile_size = 0\n if args.my_pile_stage > 0:\n # assert self.data_size == 332115325534 and self.vocab_size == 50277\n self.samples_per_epoch = args.epoch_steps * args.real_bsz\n assert self.samples_per_epoch == 40320\n rank_zero_info(f\"########## Pile 20b-tokenized stage {args.my_pile_stage} ##########\")\n dataset_slot = self.data_size // args.ctx_len\n if args.my_pile_stage != 4:\n assert MaybeIsPrime(args.magic_prime)\n assert args.magic_prime % 3 == 2\n assert args.magic_prime / dataset_slot > 0.99 and args.magic_prime / dataset_slot <= 1\n elif args.data_type == \"numpy\":\n self.data = np.load(args.data_file).astype(\"int\")\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")" + }, + { + "comment": "The code reads data from a file based on the provided data type and builds an array of tokens. 
It checks if the data_type is \"RWKV-LM/RWKV-v5/src/dataset.py\":60-79\n\"RWKV-LM/RWKV-v5/src/dataset.py\":60-79", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":60-79", + "content": " self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens.\")\n elif args.data_type == \"uint16\":\n self.data = np.fromfile(args.data_file, dtype=np.uint16).astype(\"int32\").reshape(-1, args.my_sample_len)\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")\n self.data_size = self.data.shape[0]\n rank_zero_info(f\"Data has {self.data_size} samples.\")\n else:\n if args.data_type == \"dummy\":\n rank_zero_info(\"Building dummy data...\")\n self.data = \"\"\n for i in range(100000):\n aa = (i) % 10000\n bb = (i * i) % 10000\n cc = aa + bb\n self.data += f\".{aa}+{bb}={cc}.\"\n else:\n self.data = open(args.data_file, \"r\", encoding=args.data_type).read()\n rank_zero_info(\"Building token list...\")" + }, + { + "comment": "The code snippet initializes a vocabulary and associated dictionaries, writes the vocabulary to a JSON file, sets the data size and vocab size, and defines two helper methods for converting between indices and tokens. 
It also includes a print statement that displays information about the current epoch, index, and rank in case of distributed training.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":80-106", + "content": " unique = sorted(list(set(self.data)))\n self.vocab_size = len(unique)\n # rank_zero_info()\n # for u in unique:\n # print(u, end=' ')\n # rank_zero_info('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open(f\"{args.proj_dir}/vocab.json\", \"w\", encoding=\"utf-8\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens, {self.vocab_size} vocab size.\")\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n def __len__(self):\n return self.args.epoch_steps * self.args.micro_bsz\n def __getitem__(self, idx):\n args = self.args\n rank = self.global_rank\n epoch = self.real_epoch\n world_size = self.world_size\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size}\")" + }, + { + "comment": "This code retrieves data from a dataset based on the specified data type. If the data type is uint16, it selects a random index and uses the sliced data for training. 
Otherwise, it determines the context length and required data length based on arguments, then picks a random spot in the dataset or cheats by picking a random location when specified.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":108-133", + "content": " if args.data_type == \"uint16\":\n i = np.random.randint(0, self.data_size-1)\n dix = self.data[i]\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n else:\n ctx_len = args.ctx_len\n req_len = ctx_len + 1\n magic_prime = args.magic_prime\n data = self.data\n if args.my_pile_stage > 0:\n ii = 1 + epoch * self.samples_per_epoch + (idx * world_size) + rank\n if args.my_qa_mask > 0:\n ii_orig = ii\n if ii % 2 == 0:\n ii = -1\n data = self.data_pile\n else:\n ii = ii // 2\n if data == self.data_pile:\n i = np.random.randint(0, self.data_pile_size - req_len)\n else:\n if args.my_pile_stage == 4 or ii < args.my_random_steps:\n # cheat: pick a random spot in dataset" + }, + { + "comment": "This code randomly selects a chunk of data from a dataset. If the `args.my_pile_version` is 1, it picks a random position within the remaining data after subtracting `req_len`. For other cases, it calculates the index using a complex formula involving `ii`, `magic_prime`, and `ctx_len`. If `args.data_type` is \"binidx\" and `args.my_pile_version` is 1, it retrieves data from the `data` object at the specified offset and length. 
Otherwise, it simply picks a random position within the remaining data after subtracting `req_len`.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":134-153", + "content": " if args.my_pile_version == 1:\n i = np.random.randint(0, self.data_size - req_len)\n else:\n i = np.random.randint(0, self.data_size)\n else:\n ii = ii - args.my_random_steps\n factor = (math.sqrt(5) - 1) / 2\n factor = int(magic_prime * factor)\n i = ((factor * ii * ii * ii) % magic_prime) * ctx_len\n i = i + args.my_pile_shift\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}\")\n else:\n # cheat: pick a random spot in dataset\n i = np.random.randint(0, self.data_size - req_len)\n if args.data_type == \"binidx\":\n if args.my_pile_version == 1:\n dix = data.get(idx=0, offset=i, length=req_len).astype(int)\n else:\n # self.data : cutoff, chunk_count, data" + }, + { + "comment": "Code is slicing data from a list based on the provided index 'i' and length 'req_len'. If the data type is \"numpy\", it directly assigns the slice to dix. Otherwise, it converts the slice into indices using self.stoi. 
If args.my_qa_mask == 1, it checks for a specific sequence of tokens in the sliced data and assigns either [1] * ctx_len or [0] * ctx_len to 'z'.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":154-176", + "content": " for j in range(len(data)):\n if i < data[j][0]:\n ii = i\n i = (i - (data[j-1][0] if j > 0 else 0)) % data[j][1]\n dix = data[j][2].get(idx=0, offset=i, length=req_len).astype(int)\n # print(ii, j, i)\n break\n elif args.data_type == \"numpy\":\n dix = data[i : i + req_len]\n else:\n dix = [self.stoi[s] for s in data[i : i + req_len]]\n if args.my_qa_mask == 1:\n if data == self.data_pile:\n z = [1] * ctx_len\n else:\n z = [0] * ctx_len\n z_sum = 0\n isGood = False\n for i in range(3, ctx_len):\n if dix[i] == 27 and dix[i-1] == 34 and dix[i-2] == 187 and dix[i-3] == 187:\n isGood = True\n if dix[i] == 0:" + }, + { + "comment": "This code is initializing a context (z) for a dataset. It checks if the context sum is zero and if so, sets all elements of z to 1 randomly selects an index from the data pile, converts indices to tensors, and returns the input (x), output (y), and context (z) unless my_qa_mask is set to 1. 
In that case, it only returns x, y.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v5/src/dataset.py\":177-199", + "content": " isGood = False\n if isGood:\n z[i] = 1\n z_sum += 1\n if z_sum == 0:\n z = [1] * ctx_len\n i = np.random.randint(0, self.data_pile_size - req_len)\n dix = self.data_pile.get(idx=0, offset=i, length=req_len).astype(int)\n z = torch.tensor(z, dtype=torch.bfloat16)\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n # if ii_orig < 50:\n # # if rank == 1:\n # print('rank', rank, 'i', ii_orig, ii, i, 'x', x[:5], '...', x[-5:])\n # else:\n # exit(0)\n if args.my_qa_mask == 1:\n return x, y, z\n return x, y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f3931c54-491d-4513-a050-f5c297119bcf.json b/docs/doc/f3931c54-491d-4513-a050-f5c297119bcf.json new file mode 100644 index 00000000..40a04ea6 --- /dev/null +++ b/docs/doc/f3931c54-491d-4513-a050-f5c297119bcf.json @@ -0,0 +1,65 @@ +{ + "summary": "This code initializes a custom PyTorch dataset, handling data loading and resizing for various scenarios. It manages token lists, WebDatasets, indices, and data selection for centralized or distributed training. The code selects random data based on stage, data type, and my_pile_version, converts indices to string tokens, checks input data patterns, generates random index if not found, converts to tensors, and handles different return values based on `args.my_qa_mask`.", + "details": [ + { + "comment": "This code is initializing a custom PyTorch dataset named MyDataset. It takes arguments and checks if the data type is \"binidx\". If so, it sets the vocabulary size, loads the dataset from a file using MMapIndexedDataset, and calculates the total number of tokens in the dataset. 
It also provides informational messages to the user about the vocabulary size and the total number of tokens in the dataset.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":0-24", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport json, math, random, os, sys\nimport numpy as np\nimport torch\nfrom torch.utils.data import Dataset\nfrom pytorch_lightning.utilities import rank_zero_info\nfrom .binidx import MMapIndexedDataset\nfrom .utils import MaybeIsPrime\nclass MyDataset(Dataset):\n def __init__(self, args):\n self.args = args\n if args.data_type == \"binidx\":\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")\n if args.my_pile_version == 1:\n self.data = MMapIndexedDataset(args.data_file)\n self.data_size = len(self.data._bin_buffer) // self.data._index._dtype_size\n rank_zero_info(f\"Data has {self.data_size} tokens.\")" + }, + { + "comment": "This code block is checking if the `my_pile_version` argument is equal to 2. If so, it reads in the data file and prepares it for use. It splits the data into chunks and asserts that the size of each chunk matches the expected size. 
Finally, if the `my_qa_mask` argument is greater than zero, it assigns a specific dataset to `data_pile`.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":25-41", + "content": " elif args.my_pile_version == 2:\n data_list = open(args.data_file, \"r\", encoding='utf-8').read().strip().split('\\n')\n data_list = [i.strip().split(' ') for i in data_list]\n self.data = []\n self.data_size = int(data_list[-1][-1])\n rank_zero_info(f\"Data has {self.data_size} chunks.\")\n for d in data_list:\n data = MMapIndexedDataset(d[0])\n data_size = len(data._bin_buffer) // data._index._dtype_size\n assert (data_size - args.ctx_len) == int(d[1])\n self.data += [[int(d[-1]), int(d[1]), data]]\n # rank_zero_info(self.data)\n if args.my_qa_mask > 0:\n # self.data_pile = MMapIndexedDataset('/fsx/pile/pile_20B_tokenizer_text_document')\n self.data_pile = MMapIndexedDataset('/fsx/pile_deduped/pile_0.87_deduped_text_document')\n self.data_pile_size = len(self.data_pile._bin_buffer) // self.data._index._dtype_size" + }, + { + "comment": "If the data is not tokenized, self.data_pile is set to None and self.data_pile_size is set to 0. If args.my_pile_stage is greater than 0, it checks if the dataset size is as expected (332115325534 tokens) and vocab size (50277). It calculates samples_per_epoch based on epoch_steps and real_bsz. Asserts that samples_per_epoch is 40320. Prints rank-zero info with stage number if args.my_pile_stage != 4. 
If args.data_type is \"numpy\", loads data from args.data_file, converts it to int, sets self.vocab_size, and prints current vocab size to ensure correctness.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":42-59", + "content": " else:\n self.data_pile = None\n self.data_pile_size = 0\n if args.my_pile_stage > 0:\n # assert self.data_size == 332115325534 and self.vocab_size == 50277\n self.samples_per_epoch = args.epoch_steps * args.real_bsz\n assert self.samples_per_epoch == 40320\n rank_zero_info(f\"########## Pile 20b-tokenized stage {args.my_pile_stage} ##########\")\n dataset_slot = self.data_size // args.ctx_len\n if args.my_pile_stage != 4:\n assert MaybeIsPrime(args.magic_prime)\n assert args.magic_prime % 3 == 2\n assert args.magic_prime / dataset_slot > 0.99 and args.magic_prime / dataset_slot <= 1\n elif args.data_type == \"numpy\":\n self.data = np.load(args.data_file).astype(\"int\")\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")" + }, + { + "comment": "The code handles loading and resizing data based on the specified data type. If no data type is specified, it loads uint16 data from the file, resizes vocab size, calculates number of samples, and notifies rank 0. If data type is \"wds_img\", it sets vocab size and data size to -1, sets data to None, and error count to 0. 
If data type is \"dummy\", it creates dummy data by concatenating numbers and notifies rank 0.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":60-81", + "content": " self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens.\")\n elif args.data_type == \"uint16\":\n self.data = np.fromfile(args.data_file, dtype=np.uint16).astype(\"int32\").reshape(-1, args.my_sample_len)\n self.vocab_size = args.vocab_size\n rank_zero_info(f\"Current vocab size = {self.vocab_size} (make sure it's correct)\")\n self.data_size = self.data.shape[0]\n rank_zero_info(f\"Data has {self.data_size} samples.\")\n elif args.data_type == \"wds_img\":\n self.vocab_size = -1\n self.data_size = -1\n self.data = None\n self.error_count = 0\n else:\n if args.data_type == \"dummy\":\n rank_zero_info(\"Building dummy data...\")\n self.data = \"\"\n for i in range(100000):\n aa = (i) % 10000\n bb = (i * i) % 10000\n cc = aa + bb\n self.data += f\".{aa}+{bb}={cc}.\"" + }, + { + "comment": "The code reads data from a file and builds a token list, storing it in a JSON file. It then creates dictionaries for mapping tokens to indices and indices to tokens. 
Finally, it provides methods for the length of the dataset and accessing specific items within the dataset.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":82-106", + "content": " else:\n self.data = open(args.data_file, \"r\", encoding=args.data_type).read()\n rank_zero_info(\"Building token list...\")\n unique = sorted(list(set(self.data)))\n self.vocab_size = len(unique)\n # rank_zero_info()\n # for u in unique:\n # print(u, end=' ')\n # rank_zero_info('\\n\\n')\n xx = 0\n xxObj = {}\n for u in unique:\n xxObj[xx] = u\n xx += 1\n with open(f\"{args.proj_dir}/vocab.json\", \"w\", encoding=\"utf-8\") as vocab_file:\n vocab_file.write(json.dumps(xxObj, ensure_ascii=False))\n self.data_size = len(self.data)\n rank_zero_info(f\"Data has {self.data_size} tokens, {self.vocab_size} vocab size.\")\n self.stoi = {ch: i for i, ch in enumerate(unique)}\n self.itos = {i: ch for i, ch in enumerate(unique)}\n def __len__(self):\n return self.args.epoch_steps * self.args.micro_bsz\n def __getitem__(self, idx):" + }, + { + "comment": "This code initializes a WebDataset for image data with specified transformation. 
It shuffles the dataset and decodes it into torchrgb format, then maps the tuple of jpg, json, and txt files to image transformations, identity mappings for other file types, and returns the initialized dataset.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":107-127", + "content": " args = self.args\n rank = self.global_rank\n epoch = self.real_epoch\n world_size = self.world_size\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size}\")\n if args.data_type == \"wds_img\":\n def init_wds(self, bias=0):\n def identity(x):\n return x \n import webdataset as wds\n import torchvision.transforms as transforms\n # img_transform = transforms.Compose(\n # [transforms.CenterCrop(256)]\n # )\n img_transform = transforms.Compose([\n transforms.CenterCrop(512),\n transforms.Resize((args.my_img_size))\n ])\n self.data_raw = wds.WebDataset(args.data_file, resampled=True).shuffle(10000, initial=1000, rng=random.Random(epoch*100000+rank+bias*1e9)).decode(\"torchrgb\").to_tuple(\"jpg\", \"json\", \"txt\").map_tuple(img_transform, identity, identity)\n for pp in self.data_raw.pipeline:" + }, + { + "comment": "This code initializes a WebDataset for distributed training, handling potential errors in data loading and maintaining worker seeds for determinism. If the dataset is not initialized, it calls init_wds() to do so. It attempts to load data from the dataset 10 times, printing an error message if there's a failure, then re-initializes the WebDataset before retrying. The code also prints information about the current epoch, rank, and progress when loading data successfully. 
Additionally, it has the potential to append data samples to a file named \"sample_{rank}.txt\".", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":128-149", + "content": " if 'Resampled' in str(pp):\n pp.deterministic = True\n def worker_seed():\n return rank*100000+epoch+bias*1e9\n pp.worker_seed = worker_seed\n self.data = iter(self.data_raw)\n # print(f\"WebDataset loaded for rank {rank} epoch {epoch}\")\n if self.data == None:\n init_wds(self)\n trial = 0\n while trial < 10:\n try:\n dd = next(self.data) # jpg, json, txt\n break\n except:\n print(f'[dataloader error - epoch {epoch} rank {rank} - trying a new shuffle]')\n self.error_count += 1\n init_wds(self, self.error_count)\n trial += 1\n pass\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} {dd[2]}\")\n # with open(f\"sample_{rank}.txt\", \"a\", encoding=\"utf-8\") as tmp:" + }, + { + "comment": "The code initializes a random index 'i' within the data range, creates two tensors 'x' and 'y' with torch.tensor() from the data slice. If the data type is uint16, it selects a random index 'i' from 0 to data_size-1, extracts x from the data at index i, y from the next element. Else, it calculates the context length, required length, and uses magic prime for data selection based on current epoch, idx, rank, world_size. If my_pile_stage is greater than 0, it computes the index 'ii', checks if my_qa_mask > 0 to determine if ii should be -1 or half of itself. 
Depending on data type and certain conditions, it selects data_pile for data tensor.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":150-174", + "content": " # tmp.write(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} {int(dd[1]['key'])}\\n\")\n return dd[0], dd[2]\n else:\n if args.data_type == \"uint16\":\n i = np.random.randint(0, self.data_size-1)\n dix = self.data[i]\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)\n else:\n ctx_len = args.ctx_len\n req_len = ctx_len + 1\n magic_prime = args.magic_prime\n data = self.data\n if args.my_pile_stage > 0:\n ii = 1 + epoch * self.samples_per_epoch + (idx * world_size) + rank\n if args.my_qa_mask > 0:\n ii_orig = ii\n if ii % 2 == 0:\n ii = -1\n data = self.data_pile\n else:\n ii = ii // 2\n if data == self.data_pile:" + }, + { + "comment": "This code randomly selects a chunk of data for model training. If the stage is 4 or random steps are involved, it either picks a random spot in the dataset based on the current version, or uses a formula involving magic prime and pile shift to determine the position. It then adds the context length. Finally, if the input type is \"cheat\", it picks a random spot in the dataset. 
The code also prints some information about epoch, index, rank, world size, iterations, and selected position.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":175-191", + "content": " i = np.random.randint(0, self.data_pile_size - req_len)\n else:\n if args.my_pile_stage == 4 or ii < args.my_random_steps:\n # cheat: pick a random spot in dataset\n if args.my_pile_version == 1:\n i = np.random.randint(0, self.data_size - req_len)\n else:\n i = np.random.randint(0, self.data_size)\n else:\n ii = ii - args.my_random_steps\n factor = (math.sqrt(5) - 1) / 2\n factor = int(magic_prime * factor)\n i = ((factor * ii * ii * ii) % magic_prime) * ctx_len\n i = i + args.my_pile_shift\n # print(f\"epoch {epoch} idx {idx} rank {rank}/{world_size} ii {ii} pos {round(i / self.data_size, 3)}\")\n else:\n # cheat: pick a random spot in dataset" + }, + { + "comment": "The code retrieves a random index (i) within the data size and based on the data type, it selects the appropriate indices (dix) from the provided dataset. If data_type is \"binidx\", it checks if my_pile_version is 1 or not; then it gets the indices using different methods. If data_type is \"numpy\", it directly selects the indices using numpy's slicing. 
Finally, it converts string tokens to indices using self.stoi for non-\"binidx\" and \"numpy\" data types.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":192-212", + "content": " i = np.random.randint(0, self.data_size - req_len)\n if args.data_type == \"binidx\":\n if args.my_pile_version == 1:\n dix = data.get(idx=0, offset=i, length=req_len).astype(int)\n else:\n # self.data : cutoff, chunk_count, data\n for j in range(len(data)):\n if i < data[j][0]:\n ii = i\n i = (i - (data[j-1][0] if j > 0 else 0)) % data[j][1]\n dix = data[j][2].get(idx=0, offset=i, length=req_len).astype(int)\n # print(ii, j, i)\n break\n elif args.data_type == \"numpy\":\n dix = data[i : i + req_len]\n else:\n dix = [self.stoi[s] for s in data[i : i + req_len]]\n if args.my_qa_mask == 1:\n if data == self.data_pile:" + }, + { + "comment": "This code checks if the input data has a specific pattern, and sets corresponding values in the 'z' list. If no such pattern is found, it generates a random index and retrieves the data from the dataset. It then converts the data into tensors for further processing.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":213-233", + "content": " z = [1] * ctx_len\n else:\n z = [0] * ctx_len\n z_sum = 0\n isGood = False\n for i in range(3, ctx_len):\n if dix[i] == 27 and dix[i-1] == 34 and dix[i-2] == 187 and dix[i-3] == 187:\n isGood = True\n if dix[i] == 0:\n isGood = False\n if isGood:\n z[i] = 1\n z_sum += 1\n if z_sum == 0:\n z = [1] * ctx_len\n i = np.random.randint(0, self.data_pile_size - req_len)\n dix = self.data_pile.get(idx=0, offset=i, length=req_len).astype(int)\n z = torch.tensor(z, dtype=torch.bfloat16)\n x = torch.tensor(dix[:-1], dtype=torch.long)\n y = torch.tensor(dix[1:], dtype=torch.long)" + }, + { + "comment": "This code snippet appears to be part of a larger function. 
It checks if `ii_orig` is less than 50, and if it is, it prints some information related to rank, indices, and a portion of the array `x`. If `args.my_qa_mask` is equal to 1, it returns `x`, `y`, and `z`, otherwise it simply returns `x` and `y`. The purpose and functionality of this code may vary depending on the larger context in which it exists.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/src/dataset.py\":235-244", + "content": " # if ii_orig < 50:\n # # if rank == 1:\n # print('rank', rank, 'i', ii_orig, ii, i, 'x', x[:5], '...', x[-5:])\n # else:\n # exit(0)\n if args.my_qa_mask == 1:\n return x, y, z\n return x, y" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f99ad258-84c2-44f2-946f-2429f53eba29.json b/docs/doc/f99ad258-84c2-44f2-946f-2429f53eba29.json new file mode 100644 index 00000000..ea163d4f --- /dev/null +++ b/docs/doc/f99ad258-84c2-44f2-946f-2429f53eba29.json @@ -0,0 +1,35 @@ +{ + "summary": "This code utilizes the RWKV Language Model for text generation, setting up parameters and initializing the model. It tokenizes prompts, performs multiple trials, calculates logits, handles special cases, prints tokens, and saves/loads model states.", + "details": [ + { + "comment": "This code is importing necessary libraries and modules for the RWKV Language Model. It sets the TOKEN_MODE variable to either 'char', 'bpe', or 'pile' depending on whether the model is trained from scratch, pre-trained pile models are being tested, etc. 
This helps set up the appropriate configuration for running the model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/run.py\":0-25", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport numpy as np\nimport math, os\nimport time\nimport types\nimport copy\nimport torch\nfrom torch.nn import functional as F\nfrom src.utils import TOKENIZER, Dataset\ntorch.backends.cudnn.benchmark = True\ntorch.backends.cudnn.allow_tf32 = True\ntorch.backends.cuda.matmul.allow_tf32 = True\nnp.set_printoptions(precision=4, suppress=True, linewidth=200)\n########################################################################################################\n# Step 1: set model\n# \n# Set TOKEN_MODE to 'char' or 'bpe' if the model is trained by 'train.py' from scratch.\n#\n# Set TOKEN_MODE to 'pile' if you want to test pre-trained pile models.\n########################################################################################################\nTOKEN_MODE = 'char' # char / bpe / pile" + }, + { + "comment": "This code is responsible for setting the necessary parameters and model name based on the tokenization mode. The modes include character, byte-pair encoding (BPE), and Pile. If using a pre-trained RWKV model, it allows specifying a fine-tuned model for better performance. Different model architectures like BERT, RoBERTa, and GPT are used depending on the mode. 
The parameters n_layer, n_embd, and ctx_len define the number of layers, embedding dimensions, and context length respectively for the chosen model.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/run.py\":27-59", + "content": "n_layer = 6\nn_embd = 512\nctx_len = 1024\nif TOKEN_MODE == 'char':\n MODEL_NAME = 'trained-500' # your trained model\n WORD_NAME = 'vocab' # the .json vocab (generated by train.py)\n # set UNKNOWN_CHAR to the rarest token in your vocab.json, and all unknown tokens in your prompt will be denoted by it\n UNKNOWN_CHAR = ' ' # here we just set it to ' ' for simplicity\nelif TOKEN_MODE == 'bpe':\n MODEL_NAME = 'trained-500' # your trained model\n WORD_NAME = ['model-vocab.json', 'model-merges.txt'] # [vocab, merge] for your BPE model\n UNKNOWN_CHAR = None\nelif TOKEN_MODE == 'pile':\n WORD_NAME = ['20B_tokenizer.json', '20B_tokenizer.json']\n UNKNOWN_CHAR = None\n #---> you can set MODEL_NAME to your fine-tuned model <---\n MODEL_NAME = 'RWKV-4-Pile-169M-20220807-8023'\n # MODEL_NAME = 'trained-11'\n n_layer = 12\n n_embd = 768\n ctx_len = 1024\n # MODEL_NAME = 'RWKV-4-Pile-430M-20220808-8066'\n # n_layer = 24\n # n_embd = 1024\n # ctx_len = 1024\n # MODEL_NAME = 'RWKV-4-Pile-1B5-20220903-8040'" + }, + { + "comment": "The code is setting the environment variables for the RWKV model, specifying the model type, and providing the prompt and sampling parameters. The model is currently using fp32 (floating-point arithmetic) and running on the CPU. The prompt provided is a text describing the discovery of Chinese-speaking dragons in Tibet. 
The code will perform 999 trials, each producing output of length 333, with temperature set to 1.0 for sampling and top_p set to 0.7 and 0.9 for softmax output.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/run.py\":60-86", + "content": " # n_layer = 24\n # n_embd = 2048\n # ctx_len = 1024 \nos.environ['RWKV_FLOAT_MODE'] = 'fp32' # 'bf16' / 'fp16' / 'fp32' (note: only using fp32 at this moment)\nos.environ['RWKV_RUN_DEVICE'] = 'cpu' # 'cpu' (already very fast) or 'cuda'\nmodel_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre'\n########################################################################################################\n# Step 2: set prompt & sampling stuffs\n########################################################################################################\n# context = 'A'\n# context = \"\\nIn the\"\n# context = '\\nSugar:'\ncontext = '\\nIn a shocking finding, scientist discovered a herd of dragons living in a remote, previously unexplored valley, in Tibet. 
Even more surprising to the researchers was the fact that the dragons spoke perfect Chinese.'\nNUM_TRIALS = 999\nLENGTH_PER_TRIAL = 333\nTEMPERATURE = 1.0\ntop_p = 0.7\ntop_p_newline = 0.9 # only used in TOKEN_MODE = char\nDEBUG_DEBUG = False # True False --> show softmax output\n####" + }, + { + "comment": "Loading the specified RWKV model, creating a tokenizer for text processing, refining context if in character mode, converting context to tokens, printing the number of tokens in prompt, informing about the time taken to process long prompts with RNN or using GPT for faster speed, and starting a loop for multiple trials.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/run.py\":86-108", + "content": "####################################################################################################\nprint(f'Loading {MODEL_NAME}...')\nfrom src.model_run import RWKV_RNN\nmodel = RWKV_RNN(MODEL_NAME, os.environ['RWKV_RUN_DEVICE'], model_type, n_layer, n_embd, ctx_len)\ntokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR)\n########################################################################################################\nif tokenizer.charMode:\n context = tokenizer.refine_context(context)\n ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context]\nelse:\n ctx = tokenizer.tokenizer.encode(context)\nsrc_len = len(ctx)\nsrc_ctx = ctx.copy()\nprint('\\nYour prompt has ' + str(src_len) + ' tokens.')\nprint('\\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\\n')\nfor TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS):\n t_begin = time.time_ns()\n print(('-' * 30) + context, end='')" + }, + { + "comment": "This code initializes a model and its state, then generates text based on the input sequence. 
It saves and loads the model's state for subsequent trials, calculates logits for character sampling, and handles special cases like disabling <|endoftext|>. The DEBUG_DEBUG print statement displays output statistics if enabled.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/run.py\":109-141", + "content": " ctx = src_ctx.copy()\n model.clear()\n if TRIAL == 0:\n init_state = types.SimpleNamespace()\n for i in range(src_len):\n x = ctx[:i+1]\n if i == src_len - 1:\n init_state.out = model.run(x)\n else:\n model.run(x)\n model.save(init_state)\n else:\n model.load(init_state)\n for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)):\n x = ctx[:i+1]\n x = x[-ctx_len:]\n if i == src_len:\n out = copy.deepcopy(init_state.out)\n else:\n out = model.run(x)\n if DEBUG_DEBUG:\n print('model', np.array(x), '==>', np.array(\n out), np.max(out), np.min(out))\n if TOKEN_MODE == 'pile':\n out[0] = -999999999 # disable <|endoftext|>\n char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE,\n top_p_usual=top_p, top_p_newline=top_p_newline)\n char = char.item()\n if tokenizer.charMode:" + }, + { + "comment": "The code is printing each token in a sequence using either the integer representation or the decoded version from the tokenizer, and storing each character in the ctx list. 
It also measures the time taken for this process and prints it in seconds at the end.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/run.py\":142-148", + "content": " print(tokenizer.itos[int(char)], end='', flush=True)\n else:\n print(tokenizer.tokenizer.decode(int(char)), end='', flush=True)\n ctx += [char]\n t_end = time.time_ns()\n print(\"\\n----------\", round((t_end - t_begin) / (10 ** 9), 2), end='s ')" + } + ] +} \ No newline at end of file diff --git a/docs/doc/f9c5f99e-9965-4c10-a791-5db0997733cd.json b/docs/doc/f9c5f99e-9965-4c10-a791-5db0997733cd.json new file mode 100644 index 00000000..89ab43d4 --- /dev/null +++ b/docs/doc/f9c5f99e-9965-4c10-a791-5db0997733cd.json @@ -0,0 +1,80 @@ +{ + "summary": "This code uses CUDA kernels, fancy initialization, and attention mechanism layers for RWKV models. It implements a time-mixing channel model with custom GPT layers, layer normalization, attention, feed-forward layers, and DeepSpeed's optimizer for improved performance.", + "details": [ + { + "comment": "This code imports necessary libraries and defines a function called L2Wrap for calculating the loss. It also initializes RWKV_HEAD_QK_DIM variable and begins defining a class likely to be used in model training. 
The class uses the L2Wrap function to encourage logits to be close to 0.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":0-31", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport math, os\nimport numpy as np\nimport logging\nimport torch\nimport torch.nn as nn\nfrom torch.nn import functional as F\ntry:\n from deepspeed.ops.adam import FusedAdam\nexcept:\n pass # some poor windows users cant install deepspeed\nlogger = logging.getLogger(__name__)\nRWKV_HEAD_QK_DIM = 0\nprint(f'\\nRWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\\n')\nclass L2Wrap(torch.autograd.Function):\n @staticmethod\n def forward(ctx, loss, y):\n ctx.save_for_backward(y)\n return loss\n @staticmethod\n def backward(ctx, grad_output):\n y = ctx.saved_tensors[0]\n # to encourage the logits to be close to 0\n factor = 1e-4 / (y.shape[0] * y.shape[1])\n maxx, ids = torch.max(y, -1, keepdim=True)\n gy = torch.zeros_like(y)" + }, + { + "comment": "This code defines a WKV class that uses the wkv_op.cpp and wkv_cuda.cu CUDA kernel files to perform computations on input B, T, C, w, u, k, and v. The function checks if T is within the maximum allowed value (T_MAX) and if the product of B and C is divisible by the smaller of B and C. 
If these conditions are met, it proceeds with further computations using a 32-bit floating point mode if '32' is present in the RWKV_FLOAT_MODE environment variable.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":32-54", + "content": " gy.scatter_(-1, ids, maxx * factor)\n return (grad_output, gy)\n########################################################################################################\n# CUDA Kernel\n########################################################################################################\nT_MAX = 1024 # increase this if your ctx_len is long [NOTE: TAKES LOTS OF VRAM!]\n# it's possible to go beyond CUDA limitations if you slice the ctx and pass the hidden state in each slice\nfrom torch.utils.cpp_extension import load\nwkv_cuda = load(name=\"wkv\", sources=[\"cuda/wkv_op.cpp\", \"cuda/wkv_cuda.cu\"],\n verbose=True, extra_cuda_cflags=['-res-usage', '--maxrregcount 60', '--use_fast_math', '-O3', '-Xptxas -O3', f'-DTmax={T_MAX}'])\nclass WKV(torch.autograd.Function):\n @staticmethod\n def forward(ctx, B, T, C, w, u, k, v):\n ctx.B = B\n ctx.T = T\n ctx.C = C\n assert T <= T_MAX\n assert B * C % min(C, 1024) == 0\n if '32' in os.environ['RWKV_FLOAT_MODE']:" + }, + { + "comment": "This code snippet initializes tensors and handles different float modes for model forward pass. It saves tensors for backward pass, ensures correct shape and alignment, and returns the result based on the specified float mode. 
The backward method performs cleanup by creating zeros tensors for gradients and asserts that certain conditions are met before proceeding with calculations.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":55-83", + "content": " w = -torch.exp(w.contiguous())\n u = u.contiguous()\n k = k.contiguous()\n v = v.contiguous()\n else:\n w = -torch.exp(w.float().contiguous())\n u = u.float().contiguous()\n k = k.float().contiguous()\n v = v.float().contiguous()\n ctx.save_for_backward(w, u, k, v)\n y = torch.empty((B, T, C), device='cuda', memory_format=torch.contiguous_format)\n wkv_cuda.forward(B, T, C, w, u, k, v, y)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n return y\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n return y.half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n return y.bfloat16()\n @staticmethod\n def backward(ctx, gy):\n B = ctx.B\n T = ctx.T\n C = ctx.C\n assert T <= T_MAX\n assert B * C % min(C, 1024) == 0\n w, u, k, v = ctx.saved_tensors\n gw = torch.zeros((B, C), device='cuda').contiguous()\n gu = torch.zeros((B, C), device='cuda').contiguous()" + }, + { + "comment": "The code defines a function that performs backward pass for the RWKV model on CUDA devices. It initializes gradients for weights and inputs, then applies the backward pass using the provided weights and input tensors. Depending on the RWKV_FLOAT_MODE environment variable, it returns gradients in different floating-point precisions or None if not running on a CUDA device. 
The RUN_CUDA function wraps this logic for convenience by moving model weights and inputs to CUDA devices before applying the backward pass.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":84-102", + "content": " gk = torch.zeros((B, T, C), device='cuda').contiguous()\n gv = torch.zeros((B, T, C), device='cuda').contiguous()\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n wkv_cuda.backward(B, T, C, w, u, k, v, gy.contiguous(), gw, gu, gk, gv)\n else:\n wkv_cuda.backward(B, T, C, w, u, k, v, gy.float().contiguous(), gw, gu, gk, gv)\n gw = torch.sum(gw, dim=0)\n gu = torch.sum(gu, dim=0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n return (None, None, None, gw, gu, gk, gv)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n return (None, None, None, gw.half(), gu.half(), gk.half(), gv.half())\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n return (None, None, None, gw.bfloat16(), gu.bfloat16(), gk.bfloat16(), gv.bfloat16())\ndef RUN_CUDA(B, T, C, w, u, k, v):\n return WKV.apply(B, T, C, w.cuda(), u.cuda(), k.cuda(), v.cuda())\n########################################################################################################" + }, + { + "comment": "This code initializes all linear and embedding layers in a model using fancy initialization. This is done by iterating through all modules of the model, skipping non-linear and non-embedding layers. It finds the weight parameters for these layers and performs some operations to initialize them, including finding their names. 
The code also provides some information about the process, such as it being slow for large models and needing to be run on a single GPU before loading onto others.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":103-125", + "content": "# RWKV: RWKV Time-mix + RWKV Channel-mix\n########################################################################################################\ndef RWKV_Init(model, args): # fancy initialization of all lin & emb layer in the model\n print(\"\\n[--> first run, init model params (very slow for large models) <--]\")\n print(\"[so you shall only do it for 1 single GPU and save the checkpt and load it when using multiple GPU]\\n\")\n for mm in model.modules():\n if \"RecursiveScriptModule\" in str(type(mm)):\n if mm.original_name not in [\"Linear\"]:\n continue\n ww = None\n for name, param in mm.named_parameters():\n if name == \"weight\":\n ww = param\n else:\n m = mm\n if not isinstance(m, (nn.Linear, nn.Embedding)):\n continue\n ww = m.weight\n with torch.no_grad():\n name = \"[unknown weight]\"\n for name, parameter in model.named_parameters(): # find the name of the weight" + }, + { + "comment": "This code is adjusting the weight matrix (`ww`) initializer of various neural network layers based on their shapes and types. It sets the gain and scale factors accordingly to optimize the model's performance. If `scale` is -999, it initializes with eye initialization. If `gain` is 0, it uses zero initialization. 
This process helps in setting up the weight matrices efficiently for RWKV models.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":126-155", + "content": " if id(ww) == id(parameter):\n break\n shape = ww.shape\n gain = 1.0\n scale = 1.0 # extra scale for gain\n if isinstance(m, nn.Embedding):\n gain = math.sqrt(max(shape[0], shape[1]))\n if shape[0] == args.vocab_size and shape[1] == args.n_embd: # token emb?\n scale = 1e-4\n else:\n scale = 0\n if isinstance(m, nn.Linear):\n if shape[0] > shape[1]:\n gain = math.sqrt(shape[0] / shape[1])\n if shape[0] == args.vocab_size and shape[1] == args.n_embd: # final projection?\n scale = 0.5\n if hasattr(m, \"scale_init\"):\n scale = m.scale_init\n # print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {str(scale).ljust(4)} {name}\")\n gain *= scale\n if scale == -999:\n nn.init.eye_(ww)\n elif gain == 0:\n # zero init is great for some RWKV matrices" + }, + { + "comment": "This code is initializing a layer of the RWKV_TimeMix model with fancy initialization for time decay parameter. 
It calculates the time decay speed based on the current layer and attenuation size, and assigns it to the `self.time_decay` parameter in the class.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":156-183", + "content": " nn.init.zeros_(ww)\n elif gain > 0:\n nn.init.orthogonal_(ww, gain=gain)\n else:\n nn.init.normal_(ww, mean=0.0, std=-scale)\nclass RWKV_TimeMix(torch.jit.ScriptModule):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.ctx_len = config.ctx_len\n self.n_embd = config.n_embd\n attn_sz = config.n_embd\n with torch.no_grad(): # fancy init\n ratio_0_to_1 = (layer_id / (config.n_layer - 1)) # 0 to 1\n ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n # fancy time_decay\n decay_speed = torch.ones(attn_sz)\n for h in range(attn_sz):\n decay_speed[h] = -5 + 8 * (h / (attn_sz-1)) ** (0.7 + 1.3 * ratio_0_to_1)\n self.time_decay = nn.Parameter(decay_speed)\n # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy())\n # fancy time_first" + }, + { + "comment": "The code above initializes various layers for an attention mechanism in a transformer model. It defines parameters for time-based mixing, shifting, and linear transformations for keys, values, and output. 
The key, value, and output layers are initialized with zero scaling.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":184-205", + "content": " zigzag = (torch.tensor([(i+1)%3 - 1 for i in range(attn_sz)]) * 0.5)\n self.time_first = nn.Parameter(torch.ones(attn_sz) * math.log(0.3) + zigzag)\n # fancy time_mix\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_v = nn.Parameter(torch.pow(x, ratio_1_to_almost0) + 0.3 * ratio_0_to_1)\n self.time_mix_r = nn.Parameter(torch.pow(x, 0.5 * ratio_1_to_almost0))\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n self.key = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.value = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False)\n self.output = nn.Linear(attn_sz, config.n_embd, bias=False)\n self.key.scale_init = 0\n self.receptance.scale_init = 0\n self.output.scale_init = 0" + }, + { + "comment": "The code defines a model that performs channel-wise mixing and applies a time shift. The jit_func method takes an input tensor x, mixes it with the previous timestep to produce xk, xv, xr, and then calculates k, v, r using these mixed tensors. The forward method takes an input tensor x, calls the jit_func to obtain sr, k, v, applies a time decay, and returns the output from an output layer. 
RWKV_ChannelMix is a ScriptModule class that initializes the time shift and uses fancy initialization for time_mix.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":207-241", + "content": " @torch.jit.script_method\n def jit_func(self, x):\n # Mix x with the previous timestep to produce xk, xv, xr\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xv = x * self.time_mix_v + xx * (1 - self.time_mix_v)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n # Use xk, xv, xr to produce k, v, r\n k = self.key(xk)\n v = self.value(xv)\n r = self.receptance(xr)\n sr = torch.sigmoid(r)\n return sr, k, v\n def forward(self, x):\n B, T, C = x.size() # x = (Batch,Time,Channel)\n sr, k, v = self.jit_func(x)\n rwkv = sr * RUN_CUDA(B, T, C, self.time_decay, self.time_first, k, v)\n rwkv = self.output(rwkv)\n return rwkv\nclass RWKV_ChannelMix(torch.jit.ScriptModule):\n def __init__(self, config, layer_id):\n super().__init__()\n self.layer_id = layer_id\n self.time_shift = nn.ZeroPad2d((0, 0, 1, -1))\n with torch.no_grad(): # fancy init of time_mix" + }, + { + "comment": "Code snippet initializes two parameters for time-mixing, sets hidden size and defines linear layers for key, receptance, and value in a transformer model. 
The forward method applies time-shifting, mixing, passing through key and value layers, and calculates the final output using sigmoid activation and multiplication.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":242-269", + "content": " ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0\n x = torch.ones(1, 1, config.n_embd)\n for i in range(config.n_embd):\n x[0, 0, i] = i / config.n_embd\n self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0))\n hidden_sz = 4 * config.n_embd\n self.key = nn.Linear(config.n_embd, hidden_sz, bias=False)\n self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False)\n self.value = nn.Linear(hidden_sz, config.n_embd, bias=False)\n self.value.scale_init = 0\n self.receptance.scale_init = 0\n @torch.jit.script_method\n def forward(self, x):\n xx = self.time_shift(x)\n xk = x * self.time_mix_k + xx * (1 - self.time_mix_k)\n xr = x * self.time_mix_r + xx * (1 - self.time_mix_r)\n k = self.key(xk)\n k = torch.square(torch.relu(k))\n kv = self.value(k)\n rkv = torch.sigmoid(self.receptance(xr)) * kv" + }, + { + "comment": "The code defines a GPT model with customizable blocks and config parameters. The GPTConfig class holds the vocabulary size, context length, and additional keyword-value pairs. 
The Block class is a module for these customizable blocks, using layer normalization, attention, and feed-forward layers, depending on the block type and position.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":270-302", + "content": " return rkv\n########################################################################################################\n# The GPT Model with our blocks\n########################################################################################################\nclass GPTConfig:\n def __init__(self, vocab_size, ctx_len, **kwargs):\n self.vocab_size = vocab_size\n self.ctx_len = ctx_len\n for k, v in kwargs.items():\n setattr(self, k, v)\nclass Block(nn.Module):\n def __init__(self, config, layer_id):\n super().__init__()\n self.config = config\n self.layer_id = layer_id\n self.ln1 = nn.LayerNorm(config.n_embd)\n self.ln2 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0:\n self.ln0 = nn.LayerNorm(config.n_embd)\n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n self.ffnPre = RWKV_ChannelMix(config, 0)\n else:\n self.att = RWKV_TimeMix(config, layer_id)\n self.ffn = RWKV_ChannelMix(config, layer_id)" + }, + { + "comment": "This code defines a GPT model class with layer normalization, embedding, and multiple blocks. 
The forward function performs feed-forward and attention mechanisms, and the __init__ function initializes the model parameters based on the given configuration.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":304-332", + "content": " def forward(self, x):\n if self.layer_id == 0:\n x = self.ln0(x) \n if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre':\n x = x + self.ffnPre(self.ln1(x)) # better in some cases\n else:\n x = x + self.att(self.ln1(x))\n x = x + self.ffn(self.ln2(x))\n return x\nclass GPT(nn.Module):\n def __init__(self, config):\n super().__init__()\n self.step = 0\n self.config = config\n self.emb = nn.Embedding(config.vocab_size, config.n_embd)\n self.blocks = nn.Sequential(*[Block(config, i)\n for i in range(config.n_layer)])\n self.ln_out = nn.LayerNorm(config.n_embd)\n self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)\n if RWKV_HEAD_QK_DIM > 0:\n self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)\n self.head_q.scale_init = 0\n self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False)" + }, + { + "comment": "This code initializes the model, sets parameters such as head_k scale and copy mask, checks if the model should be loaded, and logs the number of parameters. 
It also defines methods for getting ctx_len, initializing weights with specific values, and configuring optimizers.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":333-363", + "content": " self.head_k.scale_init = 0.1\n self.register_buffer(\"copy_mask\", torch.tril(\n torch.ones(config.ctx_len, config.ctx_len)))\n self.ctx_len = config.ctx_len\n try:\n if os.environ['RWKV_LOAD_MODEL'] == str(False):\n RWKV_Init(self, config) \n except:\n pass\n logger.info(\"number of parameters: %e\", sum(p.numel()\n for p in self.parameters()))\n def get_ctx_len(self):\n return self.ctx_len\n def _init_weights(self, module):\n if isinstance(module, (nn.Linear)):\n module.weight.data.normal_(mean=0.0, std=0.01)\n if isinstance(module, (nn.Embedding)):\n module.weight.data.normal_(mean=0.0, std=1e-5)\n if isinstance(module, nn.Linear) and module.bias is not None:\n module.bias.data.zero_()\n def configure_optimizers(self, train_config):\n no_decay = set()\n for mn, m in self.named_modules(): # here we disable weight_decay\n for pn, p in m.named_parameters():" + }, + { + "comment": "This code initializes an optimizer for a model, either using DeepSpeed's FusedAdam if available or falling back to torch.optim.Adam. It defines the full param name and creates optimization groups based on whether or not weight decay should be applied. 
The forward function performs forward pass of the model and asserts that the input length is less than or equal to the context length.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":364-386", + "content": " fpn = '%s.%s' % (mn, pn) if mn else pn # full param name\n no_decay.add(fpn)\n param_dict = {pn: p for pn, p in self.named_parameters()}\n optim_groups = [\n {\"params\": [param_dict[pn]\n for pn in sorted(list(no_decay))], \"weight_decay\": 0.0},\n ]\n try:\n optimizer = FusedAdam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps, bias_correction=True, adam_w_mode=False, weight_decay=0, amsgrad=False)\n except:\n print('\\n\\nDeepSpeed not found. Using torch optimizer instead (probably slower)\\n\\n')\n optimizer = torch.optim.Adam(optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps)\n return optimizer\n def forward(self, idx, targets=None):\n idx = idx.to(self.emb.weight.device)\n self.step += 1\n B, T = idx.size()\n assert T <= self.ctx_len, \"Cannot forward, because len(input) > model ctx_len.\"" + }, + { + "comment": "This code calculates the attention scores using head layers, and then applies them to the output. If RWKV_HEAD_QK_DIM is greater than 0, it performs multi-head attention by computing the attention scores and scaling them based on the number of heads. 
Then, it adds a one-hot vector to the output depending on the RWKV_FLOAT_MODE environment variable, and finally calculates the cross-entropy loss between the output and targets (if provided), and returns the L2Wrap applied output and the loss.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4/src/model.py\":388-413", + "content": " x = self.emb(idx)\n x = self.blocks(x)\n x = self.ln_out(x)\n if RWKV_HEAD_QK_DIM > 0:\n q = self.head_q(x)[:, :T, :]\n k = self.head_k(x)[:, :T, :]\n c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM)\n c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0)\n if '32' in os.environ['RWKV_FLOAT_MODE']:\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size)\n elif os.environ['RWKV_FLOAT_MODE'] == 'fp16':\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).half()\n elif os.environ['RWKV_FLOAT_MODE'] == 'bf16':\n c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).bfloat16()\n x = self.head(x) + c\n else:\n x = self.head(x)\n loss = None\n if targets is not None:\n loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.to(x.device).view(-1))\n return L2Wrap.apply(loss, x)" + } + ] +} \ No newline at end of file diff --git a/docs/doc/ff9b9b89-c87a-4fcb-9ac6-feba9820a6a8.json b/docs/doc/ff9b9b89-c87a-4fcb-9ac6-feba9820a6a8.json new file mode 100644 index 00000000..8d693958 --- /dev/null +++ b/docs/doc/ff9b9b89-c87a-4fcb-9ac6-feba9820a6a8.json @@ -0,0 +1,100 @@ +{ + "summary": "This code trains an RWKV language model with PyTorch Lightning, supports customizable training parameters, fine-tunes on enwik8 data, and saves models every 5 epochs using 'argparse' for command line arguments. It includes essential setup tasks like version assertions and learning rate schedule configuration.", + "details": [ + { + "comment": "This code is for training an RWKV language model using PyTorch Lightning framework. 
It includes basic configuration, argument parsing, and example usage for training a simple L12-D768 RWKV model on dummy data. The user can specify various parameters like load_model, wandb, proj_dir, data_file, data_type, vocab_size, ctx_len, epoch_steps, epoch_count, epoch_begin, epoch_save, micro_bsz, n_layer, n_embd, pre_ffn, and head_qk.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":0-22", + "content": "########################################################################################################\n# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM\n########################################################################################################\nimport logging\nlogging.basicConfig(level=logging.INFO)\nif __name__ == \"__main__\":\n from argparse import ArgumentParser\n from pytorch_lightning import Trainer\n from pytorch_lightning.utilities import rank_zero_info, rank_zero_only\n import pytorch_lightning as pl\n rank_zero_info(\"########## work in progress ##########\")\n ########################################################################################################\n #\n # example: train a simple L12-D768 RWKV on dummy data\n #\n # python train.py --load_model \"\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"\" --data_type \"dummy\" --vocab_size 0 \\\n # --ctx_len 128 --epoch_steps 1000 --epoch_count 20 --epoch_begin 0 --epoch_save 10 \\\n # --micro_bsz 16 --n_layer 12 --n_embd 768 --pre_ffn 0 --head_qk 0 \\" + }, + { + "comment": "Train a simple L6-D512 RWKV from scratch on enwik8, fine-tune RWKV 1.5B using 8xA100 40G = 1.76it/s = 115k token/s, VRAM 37477M", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":23-38", + "content": " # --lr_init 6e-4 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.99 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 1 --precision bf16 --strategy ddp_find_unused_parameters_false --grad_cp 0\n # example: train a 
simple L6-D512 RWKV from scratch on enwik8\n #\n # python train.py --load_model \"\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"../data/enwik8\" --data_type \"utf-8\" --vocab_size 0 \\\n # --ctx_len 512 --epoch_steps 5000 --epoch_count 500 --epoch_begin 0 --epoch_save 5 \\\n # --micro_bsz 12 --n_layer 6 --n_embd 512 --pre_ffn 0 --head_qk 0 \\\n # --lr_init 8e-4 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.99 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 1 --precision bf16 --strategy ddp_find_unused_parameters_false --grad_cp 0\n # example: fine-tune RWKV 1.5B using 8xA100 40G = 1.76it/s = 115k token/s, VRAM 37477M\n #\n # python train.py --load_model \"/fsx/BlinkDL/CODE/FP16/out_1b2/all-8040.pth\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"../data/train.npy\" --data_type \"numpy\" --vocab_size 50277 \\" + }, + { + "comment": "This code configures a fine-tuning process for RWKV using 8 GPUs with BF16 precision, and saves models every 5 epochs. The command line arguments specify the model path, W&B project directory, training data file, vocabulary size, and various hyperparameters like batch size, layers, embedding dimensions, learning rate, etc. 
The parser is used to parse these command line arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":39-53", + "content": " # --ctx_len 1024 --epoch_steps 1000 --epoch_count 1000 --epoch_begin 0 --epoch_save 5 \\\n # --micro_bsz 8 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \\\n # --lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 8 --precision bf16 --strategy deepspeed_stage_2 --grad_cp 0\n # example: fine-tune RWKV 1.5B using 1 GPU fp16 (VRAM 16G) NOTE: fp16 might overflow\n #\n # python train.py --load_model \"/fsx/BlinkDL/CODE/FP16/out_1b2/all-8040.pth\" --wandb \"\" --proj_dir \"out\" \\\n # --data_file \"../data/train.npy\" --data_type \"numpy\" --vocab_size 50277 \\\n # --ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 1 \\\n # --micro_bsz 11 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \\\n # --lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \\\n # --accelerator gpu --devices 1 --precision fp16 --strategy deepspeed_stage_2_offload --grad_cp 1\n parser = ArgumentParser()" + }, + { + "comment": "This code is using the 'argparse' module to add command line arguments for specifying a model load path, Wandb project name, project directory, random seed, data file, data type, vocabulary size, context length, epoch steps, training epochs count, and epoch begin point. These arguments control how the program behaves during execution.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":55-68", + "content": " parser.add_argument(\"--load_model\", default=\"\", type=str) # full path, with .pth\n parser.add_argument(\"--wandb\", default=\"\", type=str) # wandb project name. 
if \"\" then don't use wandb\n parser.add_argument(\"--proj_dir\", default=\"out\", type=str)\n parser.add_argument(\"--random_seed\", default=\"-1\", type=int)\n parser.add_argument(\"--data_file\", default=\"\", type=str)\n parser.add_argument(\"--data_type\", default=\"utf-8\", type=str)\n parser.add_argument(\"--vocab_size\", default=0, type=int) # vocab_size = 0 means auto (for char-level LM and .txt data)\n parser.add_argument(\"--ctx_len\", default=1024, type=int)\n parser.add_argument(\"--epoch_steps\", default=1000, type=int) # a mini \"epoch\" has [epoch_steps] steps\n parser.add_argument(\"--epoch_count\", default=500, type=int) # train for this many \"epochs\". will continue afterwards with lr = lr_final\n parser.add_argument(\"--epoch_begin\", default=0, type=int) # if you load a model trained for x \"epochs\", set epoch_begin = x\n " + }, + { + "comment": "This code snippet from \"RWKV-LM/RWKV-v4neo/train.py\" provides default values and types for various command line arguments used in model training. These parameters control aspects like epoch save frequency, batch size per GPU, model layers, embedding dimension, activation function settings, and learning rate configurations. 
The code also includes optional features like the \"headQK trick\", tiny attention dimensions, and layer placement.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":68-82", + "content": " parser.add_argument(\"--epoch_save\", default=5, type=int) # save the model every [epoch_save] \"epochs\"\n parser.add_argument(\"--micro_bsz\", default=12, type=int) # micro batch size (batch size per GPU)\n parser.add_argument(\"--n_layer\", default=6, type=int)\n parser.add_argument(\"--n_embd\", default=512, type=int)\n parser.add_argument(\"--dim_att\", default=0, type=int)\n parser.add_argument(\"--dim_ffn\", default=0, type=int)\n parser.add_argument(\"--pre_ffn\", default=0, type=int) # replace first att layer by ffn (sometimes better)\n parser.add_argument(\"--head_qk\", default=0, type=int) # my headQK trick\n parser.add_argument(\"--tiny_att_dim\", default=0, type=int) # tiny attention dim\n parser.add_argument(\"--tiny_att_layer\", default=-999, type=int) # tiny attention @ which layer\n parser.add_argument(\"--lr_init\", default=6e-4, type=float) # 6e-4 for L12-D768, 4e-4 for L24-D1024, 3e-4 for L24-D2048\n parser.add_argument(\"--lr_final\", default=1e-5, type=float)\n parser.add_argument(\"--warmup_steps\", default=-1, type=int) # try 50 if you load a model" + }, + { + "comment": "The code is using the 'argparse' module to add arguments for hyperparameters, such as beta1 and beta2 values for Adam optimizer, Adam epsilon, gradient checkpoint frequency, dropout rate, weight decay, special pile version and stage, text shift for special pile, and layerwise learning rate.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":83-95", + "content": " parser.add_argument(\"--beta1\", default=0.9, type=float)\n parser.add_argument(\"--beta2\", default=0.99, type=float) # use 0.999 when your model is close to convergence\n parser.add_argument(\"--adam_eps\", default=1e-8, type=float)\n 
parser.add_argument(\"--grad_cp\", default=0, type=int) # gradient checkpt: saves VRAM, but slower\n parser.add_argument(\"--dropout\", default=0, type=float) # try 0.01 / 0.02 / 0.05 / 0.1\n parser.add_argument(\"--weight_decay\", default=0, type=float) # try 0.1 / 0.01 / 0.001\n parser.add_argument(\"--weight_decay_final\", default=-1, type=float)\n parser.add_argument(\"--my_pile_version\", default=1, type=int) # my special pile version\n parser.add_argument(\"--my_pile_stage\", default=0, type=int) # my special pile mode\n parser.add_argument(\"--my_pile_shift\", default=-1, type=int) # my special pile mode - text shift\n parser.add_argument(\"--my_pile_edecay\", default=0, type=int)\n parser.add_argument(\"--layerwise_lr\", default=1, type=int) # layerwise lr for faster convergence (but slower it/s)" + }, + { + "comment": "This code snippet contains various command line arguments used in a training process. It defines the default values and types for these arguments, such as --ds_bucket_mb, --cuda_cleanup, --my_img_version, etc. These options control different aspects of the model's behavior or performance during training. For instance, --my_sample_len specifies the length of samples to use while training, and --head_size_a sets the size of attention heads for the model. The code provides default values that should be sufficient for most cases but can be modified if needed.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":96-110", + "content": " parser.add_argument(\"--ds_bucket_mb\", default=200, type=int) # deepspeed bucket size in MB. 
200 seems enough\n # parser.add_argument(\"--cuda_cleanup\", default=0, type=int) # extra cuda cleanup (sometimes helpful)\n parser.add_argument(\"--my_img_version\", default=0, type=str)\n parser.add_argument(\"--my_img_size\", default=0, type=int)\n parser.add_argument(\"--my_img_bit\", default=0, type=int)\n parser.add_argument(\"--my_img_clip\", default='x', type=str)\n parser.add_argument(\"--my_img_clip_scale\", default=1, type=float)\n parser.add_argument(\"--my_img_l1_scale\", default=0, type=float)\n parser.add_argument(\"--my_img_encoder\", default='x', type=str)\n # parser.add_argument(\"--my_img_noise_scale\", default=0, type=float)\n parser.add_argument(\"--my_sample_len\", default=0, type=int)\n parser.add_argument(\"--my_ffn_shift\", default=1, type=int)\n parser.add_argument(\"--my_att_shift\", default=1, type=int)\n parser.add_argument(\"--head_size_a\", default=64, type=int) # can try larger values for larger models" + }, + { + "comment": "This code defines command-line arguments for a program using the 'argparse' module. The options include settings for model training (like head size, loading partial data, magic prime), as well as accelerator configuration in case of running on PyTorch Lightning (PL) with Python 2. 
No comments are needed as this is just defining command-line arguments.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":111-128", + "content": " parser.add_argument(\"--head_size_divisor\", default=8, type=int)\n parser.add_argument(\"--my_pos_emb\", default=0, type=int)\n parser.add_argument(\"--load_partial\", default=0, type=int)\n parser.add_argument(\"--magic_prime\", default=0, type=int)\n parser.add_argument(\"--my_qa_mask\", default=0, type=int)\n parser.add_argument(\"--my_random_steps\", default=0, type=int)\n parser.add_argument(\"--my_testing\", default='', type=str)\n parser.add_argument(\"--my_exit\", default=99999999, type=int)\n parser.add_argument(\"--my_exit_tokens\", default=0, type=int)\n if pl.__version__[0]=='2':\n parser.add_argument(\"--accelerator\", default=\"gpu\", type=str)\n parser.add_argument(\"--strategy\", default=\"auto\", type=str)\n parser.add_argument(\"--devices\", default=1, type=int)\n parser.add_argument(\"--num_nodes\", default=1, type=int)\n parser.add_argument(\"--precision\", default=\"fp16\", type=str)\n parser.add_argument(\"--accumulate_grad_batches\", default=1, type=int)\n else:" + }, + { + "comment": "This code snippet is importing necessary libraries and setting up global seed for multi-GPU sampling. 
It also configures print options, ignores certain warnings, and sets the timestamp for experiment name.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":129-152", + "content": " parser = Trainer.add_argparse_args(parser)\n args = parser.parse_args()\n ########################################################################################################\n import os, warnings, math, datetime, sys, time\n import numpy as np\n import torch\n from torch.utils.data import DataLoader\n if \"deepspeed\" in args.strategy:\n import deepspeed\n from pytorch_lightning import seed_everything\n if args.random_seed >= 0:\n print(f\"########## WARNING: GLOBAL SEED {args.random_seed} THIS WILL AFFECT MULTIGPU SAMPLING ##########\\n\" * 3)\n seed_everything(args.random_seed)\n np.set_printoptions(precision=4, suppress=True, linewidth=200)\n warnings.filterwarnings(\"ignore\", \".*Consider increasing the value of the `num_workers` argument*\")\n warnings.filterwarnings(\"ignore\", \".*The progress bar already tracks a metric with the*\")\n # os.environ[\"WDS_SHOW_SEED\"] = \"1\"\n args.my_timestamp = datetime.datetime.today().strftime(\"%Y-%m-%d-%H-%M-%S\")\n args.enable_checkpointing = False" + }, + { + "comment": "The code sets various arguments for model training, including disabling DDP sampler and logger, setting gradient clip value, and modifying batch size based on the number of nodes and devices. 
It also adjusts the dimensions of attention and feedforward layers if necessary, and customizes run name based on data type or vocabulary size, context length, layer count, and embedding dimension.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":153-177", + "content": " args.replace_sampler_ddp = False\n args.logger = False\n args.gradient_clip_val = 1.0\n args.num_sanity_val_steps = 0\n args.check_val_every_n_epoch = int(1e20)\n args.log_every_n_steps = int(1e20)\n args.max_epochs = -1 # continue forever\n args.betas = (args.beta1, args.beta2)\n args.real_bsz = int(args.num_nodes) * int(args.devices) * args.micro_bsz\n os.environ[\"RWKV_T_MAX\"] = str(args.ctx_len)\n os.environ[\"RWKV_MY_TESTING\"] = args.my_testing\n os.environ[\"RWKV_HEAD_SIZE_A\"] = str(args.head_size_a)\n if args.dim_att <= 0:\n args.dim_att = args.n_embd\n if args.dim_ffn <= 0:\n if 'r3' in args.my_testing:\n args.dim_ffn = int((args.n_embd * 3.5) // 32 * 32)\n else:\n args.dim_ffn = args.n_embd * 4\n if args.data_type == \"wds_img\":\n args.run_name = f\"v{args.my_img_version}-{args.my_img_size}-{args.my_img_bit}bit-{args.my_img_clip}x{args.my_img_clip_scale}\"\n args.proj_dir = f\"{args.proj_dir}-{args.run_name}\"\n else:\n args.run_name = f\"{args.vocab_size} ctx{args.ctx_len} L{args.n_layer} D{args.n_embd}\"" + }, + { + "comment": "This code checks if a directory exists, creates it if not, adjusts magic_prime and my_pile_shift values based on ctx_len, and sets my_pile_shift to 0 if it's negative.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":178-207", + "content": " if not os.path.exists(args.proj_dir):\n os.makedirs(args.proj_dir)\n if args.my_pile_stage > 0:\n magic_prime_bak = args.magic_prime\n if args.my_pile_version == 1:\n if args.ctx_len == 1024:\n args.magic_prime = 324331313\n elif args.ctx_len == 2048:\n args.magic_prime = 162165671\n elif args.ctx_len == 4096:\n args.magic_prime = 81082817\n elif 
args.ctx_len == 8192:\n args.magic_prime = 40541399\n else:\n if args.ctx_len == 1024:\n args.magic_prime = 1670239709\n elif args.ctx_len == 2048:\n args.magic_prime = 835119767\n elif args.ctx_len == 4096:\n args.magic_prime = 417559889\n elif args.ctx_len == 6144:\n args.magic_prime = 278373239\n elif args.ctx_len == 8192:\n args.magic_prime = 208779911\n if args.my_pile_shift < 0:\n args.my_pile_shift = 0\n if magic_prime_bak > 0:" + }, + { + "comment": "This code sets the epoch count based on magic_prime and my_qa_mask, determines epoch steps for batch size, asserts that their product equals 40320, and checks if my_pile_stage is 2 to find the latest saved model. If my_pile_stage >= 2, it lists all models in proj_dir, sorts them, sets max_p as last one, and my_pile_prev_p if there are more than one.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":208-232", + "content": " args.magic_prime = magic_prime_bak\n if args.my_qa_mask == 2:\n args.epoch_count = 2 * args.magic_prime // 40320\n else:\n args.epoch_count = args.magic_prime // 40320\n args.epoch_steps = 40320 // args.real_bsz\n assert args.epoch_steps * args.real_bsz == 40320\n # if args.my_pile_stage == 2:\n # assert args.lr_final == args.lr_init\n if args.my_pile_stage >= 2: # find latest saved model\n list_p = []\n for p in os.listdir(args.proj_dir):\n if p.startswith(\"rwkv\") and p.endswith(\".pth\"):\n p = ((p.split(\"-\"))[1].split(\".\"))[0]\n if p != \"final\":\n if p == \"init\":\n p = -1\n else:\n p = int(p)\n list_p += [p]\n list_p.sort()\n max_p = list_p[-1]\n if len(list_p) > 1:\n args.my_pile_prev_p = list_p[-2] # in case max_p is corrupted" + }, + { + "comment": "If max_p is -1, the model will be loaded from rwkv-init.pth in args.proj_dir. Else, it will load from rwkv-{max_p}.pth in args.proj_dir. If warmup_steps is less than 0, set warmup_steps depending on my_pile_stage. 
Calculate samples_per_epoch and tokens_per_epoch based on epoch_steps and ctx_len respectively. Retrieve deepspeed version. Display rank_zero_info message with relevant information about the model, data, and project directory.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":233-257", + "content": " if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n if args.warmup_steps < 0:\n if args.my_pile_stage == 2:\n args.warmup_steps = 10\n else:\n args.warmup_steps = 30\n args.epoch_begin = max_p + 1\n samples_per_epoch = args.epoch_steps * args.real_bsz\n tokens_per_epoch = samples_per_epoch * args.ctx_len\n try:\n deepspeed_version = deepspeed.__version__\n except:\n deepspeed_version = None\n pass\n rank_zero_info(\n f\"\"\"\n############################################################################\n#\n# RWKV-4 {args.precision.upper()} on {args.num_nodes}x{args.devices} {args.accelerator.upper()}, bsz {args.num_nodes}x{args.devices}x{args.micro_bsz}={args.real_bsz}, {args.strategy} {'with grad_cp' if args.grad_cp > 0 else ''}\n#\n# Data = {args.data_file} ({args.data_type}), ProjDir = {args.proj_dir}" + }, + { + "comment": "Code snippet defines variables for epochs, steps, samples, tokens, model layers, embedding size, context length, learning rate schedule, warmup steps, beta, and epsilon. It also mentions required Python libraries versions and their recommendations. 
The code asserts the data type and provides a note if either final or initial learning rate is zero, suggesting that it will use a linear learning rate schedule instead.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":258-279", + "content": "#\n# Epoch = {args.epoch_begin} to {args.epoch_begin + args.epoch_count - 1} (will continue afterwards), save every {args.epoch_save} epoch\n#\n# Each \"epoch\" = {args.epoch_steps} steps, {samples_per_epoch} samples, {tokens_per_epoch} tokens\n#\n# Model = {args.n_layer} n_layer, {args.n_embd} n_embd, {args.ctx_len} ctx_len\n#\n# Adam = lr {args.lr_init} to {args.lr_final}, warmup {args.warmup_steps} steps, beta {args.betas}, eps {args.adam_eps}\n#\n# Found torch {torch.__version__}, recommend 1.13.1+cu117 or newer\n# Found deepspeed {deepspeed_version}, recommend 0.7.0 (faster than newer versions)\n# Found pytorch_lightning {pl.__version__}, recommend 1.9.5\n#\n############################################################################\n\"\"\"\n )\n rank_zero_info(str(vars(args)) + \"\\n\")\n assert args.data_type in [\"utf-8\", \"utf-16le\", \"numpy\", \"binidx\", \"dummy\", \"wds_img\", \"uint16\"]\n if args.lr_final == 0 or args.lr_init == 0:\n rank_zero_info(\"\\n\\nNote: lr_final = 0 or lr_init = 0. Using linear LR schedule instead.\\n\\n\")" + }, + { + "comment": "This code sets the precision argument, adjusts relevant environment variables and configurations for faster training with different precision types. If using fp32, it provides a note suggesting to use bf16 or tf32 for better performance. 
It also sets up cudnn settings and allows tf32 in certain precision cases.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":281-307", + "content": " assert args.precision in [\"fp32\", \"tf32\", \"fp16\", \"bf16\"]\n os.environ[\"RWKV_FLOAT_MODE\"] = args.precision\n if args.precision == \"fp32\":\n for i in range(10):\n rank_zero_info(\"\\n\\nNote: you are using fp32 (very slow). Try bf16 / tf32 for faster training.\\n\\n\")\n if args.precision == \"fp16\":\n rank_zero_info(\"\\n\\nNote: you are using fp16 (might overflow). Try bf16 / tf32 for stable training.\\n\\n\")\n os.environ[\"RWKV_JIT_ON\"] = \"1\"\n if \"deepspeed_stage_3\" in args.strategy:\n os.environ[\"RWKV_JIT_ON\"] = \"0\"\n torch.backends.cudnn.benchmark = True\n torch.backends.cudnn.enabled = True\n if args.precision == \"fp32\":\n torch.backends.cudnn.allow_tf32 = False\n torch.backends.cuda.matmul.allow_tf32 = False\n else:\n torch.backends.cudnn.allow_tf32 = True\n torch.backends.cuda.matmul.allow_tf32 = True\n if \"32\" in args.precision:\n args.precision = 32\n elif args.precision == \"fp16\":\n args.precision = 16\n else:\n args.precision = \"bf16\"" + }, + { + "comment": "Initializing and loading the RWKV model with specified arguments, generating initial weights if none are loaded or if at MyPile stage 1. 
Saving initial weights in specified directory and then attempting to load pre-trained model from given path.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":309-334", + "content": " ########################################################################################################\n from src.trainer import train_callback, generate_init_weight\n from src.dataset import MyDataset\n train_data = MyDataset(args)\n args.vocab_size = train_data.vocab_size\n if args.data_type == 'wds_img':\n from src.model_img import RWKV_IMG\n model = RWKV_IMG(args)\n else:\n from src.model import RWKV\n model = RWKV(args)\n if len(args.load_model) == 0 or args.my_pile_stage == 1: # shall we build the initial weights?\n init_weight_name = f\"{args.proj_dir}/rwkv-init.pth\"\n generate_init_weight(model, init_weight_name) # save initial weights\n args.load_model = init_weight_name\n rank_zero_info(f\"########## Loading {args.load_model}... ##########\")\n try:\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n load_keys = list(load_dict.keys())\n for k in load_keys:\n if k.startswith('_forward_module.'):" + }, + { + "comment": "This code attempts to load a model checkpoint. It first checks if the provided checkpoint file exists, and if not, it tries another one. Then, it loads the dictionary of state parameters from the checkpoint into memory. If loading only part of the model, it also adds missing keys from the original model's state dictionary to the loaded dictionary. 
Finally, it loads the state dictionary into the model for training or inference.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":335-357", + "content": " load_dict[k.replace('_forward_module.','')] = load_dict[k]\n del load_dict[k]\n except:\n rank_zero_info(f\"Bad checkpoint {args.load_model}\")\n if args.my_pile_stage >= 2: # try again using another checkpoint\n max_p = args.my_pile_prev_p\n if max_p == -1:\n args.load_model = f\"{args.proj_dir}/rwkv-init.pth\"\n else:\n args.load_model = f\"{args.proj_dir}/rwkv-{max_p}.pth\"\n args.epoch_begin = max_p + 1\n rank_zero_info(f\"Trying {args.load_model}\")\n load_dict = torch.load(args.load_model, map_location=\"cpu\")\n if args.load_partial == 1:\n load_keys = load_dict.keys()\n for k in model.state_dict():\n if k not in load_keys:\n load_dict[k] = model.state_dict()[k]\n model.load_state_dict(load_dict)\n if pl.__version__[0]=='2':\n trainer = Trainer(accelerator=args.accelerator,strategy=args.strategy,devices=args.devices,num_nodes=args.num_nodes,precision=args.precision," + }, + { + "comment": "The code creates a trainer object with specified arguments, including a callback for training. If the model has state dictionaries with shapes larger than 1D, it prints the shape and name of each such dictionary. 
The code then checks if the strategy used is \"deepspeed\" and sets the bucket size accordingly.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":358-377", + "content": " logger=args.logger,callbacks=[train_callback(args)],max_epochs=args.max_epochs,check_val_every_n_epoch=args.check_val_every_n_epoch,num_sanity_val_steps=args.num_sanity_val_steps,\n log_every_n_steps=args.log_every_n_steps,enable_checkpointing=args.enable_checkpointing,accumulate_grad_batches=args.accumulate_grad_batches,gradient_clip_val=args.gradient_clip_val)\n else:\n trainer = Trainer.from_argparse_args(\n args,\n callbacks=[train_callback(args)],\n )\n if trainer.global_rank == 0:\n for n in model.state_dict():\n shape = model.state_dict()[n].shape\n shape = [i for i in shape if i != 1]\n if len(shape) > 1:\n print(f\"{str(shape[0]).ljust(5)} {str(shape[1]).ljust(5)} {n}\")\n else:\n print(f\"{str(shape[0]).ljust(5)} {n}\")\n if \"deepspeed\" in args.strategy:\n trainer.strategy.config[\"zero_optimization\"][\"allgather_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n " + }, + { + "comment": "This code sets the bucket size for zero optimization and configures a data loader with specific parameters before fitting the model in a trainer.", + "location": "\"/media/root/Prima/works/RWKV-LM/docs/src/RWKV-v4neo/train.py\":377-382", + "content": " trainer.strategy.config[\"zero_optimization\"][\"reduce_bucket_size\"] = args.ds_bucket_mb * 1000 * 1000\n # must set shuffle=False, persistent_workers=False (because worker is in another thread)\n data_loader = DataLoader(train_data, shuffle=False, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, drop_last=True)\n trainer.fit(model, data_loader)" + } + ] +} \ No newline at end of file diff --git a/docs/github-markdown.css b/docs/github-markdown.css new file mode 100755 index 00000000..96a4f29e --- /dev/null +++ b/docs/github-markdown.css @@ -0,0 +1,1197 @@ +@media 
(prefers-color-scheme: dark) { + + .markdown-body, + [data-theme="dark"] { + /*dark*/ + color-scheme: dark; + --color-prettylights-syntax-comment: #8b949e; + --color-prettylights-syntax-constant: #79c0ff; + --color-prettylights-syntax-entity: #d2a8ff; + --color-prettylights-syntax-storage-modifier-import: #c9d1d9; + --color-prettylights-syntax-entity-tag: #7ee787; + --color-prettylights-syntax-keyword: #ff7b72; + --color-prettylights-syntax-string: #a5d6ff; + --color-prettylights-syntax-variable: #ffa657; + --color-prettylights-syntax-brackethighlighter-unmatched: #f85149; + --color-prettylights-syntax-invalid-illegal-text: #f0f6fc; + --color-prettylights-syntax-invalid-illegal-bg: #8e1519; + --color-prettylights-syntax-carriage-return-text: #f0f6fc; + --color-prettylights-syntax-carriage-return-bg: #b62324; + --color-prettylights-syntax-string-regexp: #7ee787; + --color-prettylights-syntax-markup-list: #f2cc60; + --color-prettylights-syntax-markup-heading: #1f6feb; + --color-prettylights-syntax-markup-italic: #c9d1d9; + --color-prettylights-syntax-markup-bold: #c9d1d9; + --color-prettylights-syntax-markup-deleted-text: #ffdcd7; + --color-prettylights-syntax-markup-deleted-bg: #67060c; + --color-prettylights-syntax-markup-inserted-text: #aff5b4; + --color-prettylights-syntax-markup-inserted-bg: #033a16; + --color-prettylights-syntax-markup-changed-text: #ffdfb6; + --color-prettylights-syntax-markup-changed-bg: #5a1e02; + --color-prettylights-syntax-markup-ignored-text: #c9d1d9; + --color-prettylights-syntax-markup-ignored-bg: #1158c7; + --color-prettylights-syntax-meta-diff-range: #d2a8ff; + --color-prettylights-syntax-brackethighlighter-angle: #8b949e; + --color-prettylights-syntax-sublimelinter-gutter-mark: #484f58; + --color-prettylights-syntax-constant-other-reference-link: #a5d6ff; + --color-fg-default: #e6edf3; + --color-fg-muted: #848d97; + --color-fg-subtle: #6e7681; + --color-canvas-default: #0d1117; + --color-canvas-subtle: #161b22; + 
--color-border-default: #30363d; + --color-border-muted: #21262d; + --color-neutral-muted: rgba(110, 118, 129, 0.4); + --color-accent-fg: #2f81f7; + --color-accent-emphasis: #1f6feb; + --color-success-fg: #3fb950; + --color-success-emphasis: #238636; + --color-attention-fg: #d29922; + --color-attention-emphasis: #9e6a03; + --color-attention-subtle: rgba(187, 128, 9, 0.15); + --color-danger-fg: #f85149; + --color-danger-emphasis: #da3633; + --color-done-fg: #a371f7; + --color-done-emphasis: #8957e5; + } +} + +@media (prefers-color-scheme: light) { + + .markdown-body, + [data-theme="light"] { + /*light*/ + color-scheme: light; + --color-prettylights-syntax-comment: #57606a; + --color-prettylights-syntax-constant: #0550ae; + --color-prettylights-syntax-entity: #6639ba; + --color-prettylights-syntax-storage-modifier-import: #24292f; + --color-prettylights-syntax-entity-tag: #116329; + --color-prettylights-syntax-keyword: #cf222e; + --color-prettylights-syntax-string: #0a3069; + --color-prettylights-syntax-variable: #953800; + --color-prettylights-syntax-brackethighlighter-unmatched: #82071e; + --color-prettylights-syntax-invalid-illegal-text: #f6f8fa; + --color-prettylights-syntax-invalid-illegal-bg: #82071e; + --color-prettylights-syntax-carriage-return-text: #f6f8fa; + --color-prettylights-syntax-carriage-return-bg: #cf222e; + --color-prettylights-syntax-string-regexp: #116329; + --color-prettylights-syntax-markup-list: #3b2300; + --color-prettylights-syntax-markup-heading: #0550ae; + --color-prettylights-syntax-markup-italic: #24292f; + --color-prettylights-syntax-markup-bold: #24292f; + --color-prettylights-syntax-markup-deleted-text: #82071e; + --color-prettylights-syntax-markup-deleted-bg: #ffebe9; + --color-prettylights-syntax-markup-inserted-text: #116329; + --color-prettylights-syntax-markup-inserted-bg: #dafbe1; + --color-prettylights-syntax-markup-changed-text: #953800; + --color-prettylights-syntax-markup-changed-bg: #ffd8b5; + 
--color-prettylights-syntax-markup-ignored-text: #eaeef2; + --color-prettylights-syntax-markup-ignored-bg: #0550ae; + --color-prettylights-syntax-meta-diff-range: #8250df; + --color-prettylights-syntax-brackethighlighter-angle: #57606a; + --color-prettylights-syntax-sublimelinter-gutter-mark: #8c959f; + --color-prettylights-syntax-constant-other-reference-link: #0a3069; + --color-fg-default: #1F2328; + --color-fg-muted: #656d76; + --color-fg-subtle: #6e7781; + --color-canvas-default: #ffffff; + --color-canvas-subtle: #f6f8fa; + --color-border-default: #d0d7de; + --color-border-muted: hsla(210, 18%, 87%, 1); + --color-neutral-muted: rgba(175, 184, 193, 0.2); + --color-accent-fg: #0969da; + --color-accent-emphasis: #0969da; + --color-success-fg: #1a7f37; + --color-success-emphasis: #1f883d; + --color-attention-fg: #9a6700; + --color-attention-emphasis: #9a6700; + --color-attention-subtle: #fff8c5; + --color-danger-fg: #d1242f; + --color-danger-emphasis: #cf222e; + --color-done-fg: #8250df; + --color-done-emphasis: #8250df; + } +} + +.markdown-body { + -ms-text-size-adjust: 100%; + -webkit-text-size-adjust: 100%; + margin: 0; + color: var(--color-fg-default); + background-color: var(--color-canvas-default); + font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "Noto Sans", Helvetica, Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji"; + font-size: 16px; + line-height: 1.5; + word-wrap: break-word; +} + +.markdown-body .octicon { + display: inline-block; + fill: currentColor; + vertical-align: text-bottom; +} + +.markdown-body h1:hover .anchor .octicon-link:before, +.markdown-body h2:hover .anchor .octicon-link:before, +.markdown-body h3:hover .anchor .octicon-link:before, +.markdown-body h4:hover .anchor .octicon-link:before, +.markdown-body h5:hover .anchor .octicon-link:before, +.markdown-body h6:hover .anchor .octicon-link:before { + width: 16px; + height: 16px; + content: ' '; + display: inline-block; + background-color: currentColor; + 
-webkit-mask-image: url("data:image/svg+xml,"); + mask-image: url("data:image/svg+xml,"); +} + +.markdown-body details, +.markdown-body figcaption, +.markdown-body figure { + display: block; +} + +.markdown-body summary { + display: list-item; +} + +.markdown-body [hidden] { + display: none !important; +} + +.markdown-body a { + background-color: transparent; + color: var(--color-accent-fg); + text-decoration: none; +} + +.markdown-body abbr[title] { + border-bottom: none; + -webkit-text-decoration: underline dotted; + text-decoration: underline dotted; +} + +.markdown-body b, +.markdown-body strong { + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body dfn { + font-style: italic; +} + +.markdown-body h1 { + margin: .67em 0; + font-weight: var(--base-text-weight-semibold, 600); + padding-bottom: .3em; + font-size: 2em; + border-bottom: 1px solid var(--color-border-muted); +} + +.markdown-body mark { + background-color: var(--color-attention-subtle); + color: var(--color-fg-default); +} + +.markdown-body small { + font-size: 90%; +} + +.markdown-body sub, +.markdown-body sup { + font-size: 75%; + line-height: 0; + position: relative; + vertical-align: baseline; +} + +.markdown-body sub { + bottom: -0.25em; +} + +.markdown-body sup { + top: -0.5em; +} + +.markdown-body img { + border-style: none; + max-width: 100%; + box-sizing: content-box; + background-color: var(--color-canvas-default); +} + +.markdown-body code, +.markdown-body kbd, +.markdown-body pre, +.markdown-body samp { + font-family: monospace; + font-size: 1em; +} + +.markdown-body figure { + margin: 1em 40px; +} + +.markdown-body hr { + box-sizing: content-box; + overflow: hidden; + background: transparent; + border-bottom: 1px solid var(--color-border-muted); + height: .25em; + padding: 0; + margin: 24px 0; + background-color: var(--color-border-default); + border: 0; +} + +.markdown-body input { + font: inherit; + margin: 0; + overflow: visible; + font-family: inherit; + 
font-size: inherit; + line-height: inherit; +} + +.markdown-body [type=button], +.markdown-body [type=reset], +.markdown-body [type=submit] { + -webkit-appearance: button; + appearance: button; +} + +.markdown-body [type=checkbox], +.markdown-body [type=radio] { + box-sizing: border-box; + padding: 0; +} + +.markdown-body [type=number]::-webkit-inner-spin-button, +.markdown-body [type=number]::-webkit-outer-spin-button { + height: auto; +} + +.markdown-body [type=search]::-webkit-search-cancel-button, +.markdown-body [type=search]::-webkit-search-decoration { + -webkit-appearance: none; + appearance: none; +} + +.markdown-body ::-webkit-input-placeholder { + color: inherit; + opacity: .54; +} + +.markdown-body ::-webkit-file-upload-button { + -webkit-appearance: button; + appearance: button; + font: inherit; +} + +.markdown-body a:hover { + text-decoration: underline; +} + +.markdown-body ::placeholder { + color: var(--color-fg-subtle); + opacity: 1; +} + +.markdown-body hr::before { + display: table; + content: ""; +} + +.markdown-body hr::after { + display: table; + clear: both; + content: ""; +} + +.markdown-body table { + border-spacing: 0; + border-collapse: collapse; + display: block; + width: max-content; + max-width: 100%; + overflow: auto; +} + +.markdown-body td, +.markdown-body th { + padding: 0; +} + +.markdown-body details summary { + cursor: pointer; +} + +.markdown-body details:not([open])>*:not(summary) { + display: none !important; +} + +.markdown-body a:focus, +.markdown-body [role=button]:focus, +.markdown-body input[type=radio]:focus, +.markdown-body input[type=checkbox]:focus { + outline: 2px solid var(--color-accent-fg); + outline-offset: -2px; + box-shadow: none; +} + +.markdown-body a:focus:not(:focus-visible), +.markdown-body [role=button]:focus:not(:focus-visible), +.markdown-body input[type=radio]:focus:not(:focus-visible), +.markdown-body input[type=checkbox]:focus:not(:focus-visible) { + outline: solid 1px transparent; +} + 
+.markdown-body a:focus-visible, +.markdown-body [role=button]:focus-visible, +.markdown-body input[type=radio]:focus-visible, +.markdown-body input[type=checkbox]:focus-visible { + outline: 2px solid var(--color-accent-fg); + outline-offset: -2px; + box-shadow: none; +} + +.markdown-body a:not([class]):focus, +.markdown-body a:not([class]):focus-visible, +.markdown-body input[type=radio]:focus, +.markdown-body input[type=radio]:focus-visible, +.markdown-body input[type=checkbox]:focus, +.markdown-body input[type=checkbox]:focus-visible { + outline-offset: 0; +} + +.markdown-body kbd { + display: inline-block; + padding: 3px 5px; + font: 11px ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + line-height: 10px; + color: var(--color-fg-default); + vertical-align: middle; + background-color: var(--color-canvas-subtle); + border: solid 1px var(--color-neutral-muted); + border-bottom-color: var(--color-neutral-muted); + border-radius: 6px; + box-shadow: inset 0 -1px 0 var(--color-neutral-muted); +} + +.markdown-body h1, +.markdown-body h2, +.markdown-body h3, +.markdown-body h4, +.markdown-body h5, +.markdown-body h6 { + margin-top: 24px; + margin-bottom: 16px; + font-weight: var(--base-text-weight-semibold, 600); + line-height: 1.25; +} + +.markdown-body h2 { + font-weight: var(--base-text-weight-semibold, 600); + padding-bottom: .3em; + font-size: 1.5em; + border-bottom: 1px solid var(--color-border-muted); +} + +.markdown-body h3 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: 1.25em; +} + +.markdown-body h4 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: 1em; +} + +.markdown-body h5 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: .875em; +} + +.markdown-body h6 { + font-weight: var(--base-text-weight-semibold, 600); + font-size: .85em; + color: var(--color-fg-muted); +} + +.markdown-body p { + margin-top: 0; + margin-bottom: 10px; +} + +.markdown-body blockquote { + 
margin: 0; + padding: 0 1em; + color: var(--color-fg-muted); + border-left: .25em solid var(--color-border-default); +} + +.markdown-body ul, +.markdown-body ol { + margin-top: 0; + margin-bottom: 0; + padding-left: 2em; +} + +.markdown-body ol ol, +.markdown-body ul ol { + list-style-type: lower-roman; +} + +.markdown-body ul ul ol, +.markdown-body ul ol ol, +.markdown-body ol ul ol, +.markdown-body ol ol ol { + list-style-type: lower-alpha; +} + +.markdown-body dd { + margin-left: 0; +} + +.markdown-body tt, +.markdown-body code, +.markdown-body samp { + font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + font-size: 12px; +} + +.markdown-body pre { + margin-top: 0; + margin-bottom: 0; + font-family: ui-monospace, SFMono-Regular, SF Mono, Menlo, Consolas, Liberation Mono, monospace; + font-size: 12px; + word-wrap: normal; +} + +.markdown-body .octicon { + display: inline-block; + overflow: visible !important; + vertical-align: text-bottom; + fill: currentColor; +} + +.markdown-body input::-webkit-outer-spin-button, +.markdown-body input::-webkit-inner-spin-button { + margin: 0; + -webkit-appearance: none; + appearance: none; +} + +.markdown-body .mr-2 { + margin-right: var(--base-size-8, 8px) !important; +} + +.markdown-body::before { + display: table; + content: ""; +} + +.markdown-body::after { + display: table; + clear: both; + content: ""; +} + +.markdown-body>*:first-child { + margin-top: 0 !important; +} + +.markdown-body>*:last-child { + margin-bottom: 0 !important; +} + +.markdown-body a:not([href]) { + color: inherit; + text-decoration: none; +} + +.markdown-body .absent { + color: var(--color-danger-fg); +} + +.markdown-body .anchor { + float: left; + padding-right: 4px; + margin-left: -20px; + line-height: 1; +} + +.markdown-body .anchor:focus { + outline: none; +} + +.markdown-body p, +.markdown-body blockquote, +.markdown-body ul, +.markdown-body ol, +.markdown-body dl, +.markdown-body table, 
+.markdown-body pre, +.markdown-body details { + margin-top: 0; + margin-bottom: 16px; +} + +.markdown-body blockquote>:first-child { + margin-top: 0; +} + +.markdown-body blockquote>:last-child { + margin-bottom: 0; +} + +.markdown-body h1 .octicon-link, +.markdown-body h2 .octicon-link, +.markdown-body h3 .octicon-link, +.markdown-body h4 .octicon-link, +.markdown-body h5 .octicon-link, +.markdown-body h6 .octicon-link { + color: var(--color-fg-default); + vertical-align: middle; + visibility: hidden; +} + +.markdown-body h1:hover .anchor, +.markdown-body h2:hover .anchor, +.markdown-body h3:hover .anchor, +.markdown-body h4:hover .anchor, +.markdown-body h5:hover .anchor, +.markdown-body h6:hover .anchor { + text-decoration: none; +} + +.markdown-body h1:hover .anchor .octicon-link, +.markdown-body h2:hover .anchor .octicon-link, +.markdown-body h3:hover .anchor .octicon-link, +.markdown-body h4:hover .anchor .octicon-link, +.markdown-body h5:hover .anchor .octicon-link, +.markdown-body h6:hover .anchor .octicon-link { + visibility: visible; +} + +.markdown-body h1 tt, +.markdown-body h1 code, +.markdown-body h2 tt, +.markdown-body h2 code, +.markdown-body h3 tt, +.markdown-body h3 code, +.markdown-body h4 tt, +.markdown-body h4 code, +.markdown-body h5 tt, +.markdown-body h5 code, +.markdown-body h6 tt, +.markdown-body h6 code { + padding: 0 .2em; + font-size: inherit; +} + +.markdown-body summary h1, +.markdown-body summary h2, +.markdown-body summary h3, +.markdown-body summary h4, +.markdown-body summary h5, +.markdown-body summary h6 { + display: inline-block; +} + +.markdown-body summary h1 .anchor, +.markdown-body summary h2 .anchor, +.markdown-body summary h3 .anchor, +.markdown-body summary h4 .anchor, +.markdown-body summary h5 .anchor, +.markdown-body summary h6 .anchor { + margin-left: -40px; +} + +.markdown-body summary h1, +.markdown-body summary h2 { + padding-bottom: 0; + border-bottom: 0; +} + +.markdown-body ul.no-list, +.markdown-body 
ol.no-list { + padding: 0; + list-style-type: none; +} + +.markdown-body ol[type="a s"] { + list-style-type: lower-alpha; +} + +.markdown-body ol[type="A s"] { + list-style-type: upper-alpha; +} + +.markdown-body ol[type="i s"] { + list-style-type: lower-roman; +} + +.markdown-body ol[type="I s"] { + list-style-type: upper-roman; +} + +.markdown-body ol[type="1"] { + list-style-type: decimal; +} + +.markdown-body div>ol:not([type]) { + list-style-type: decimal; +} + +.markdown-body ul ul, +.markdown-body ul ol, +.markdown-body ol ol, +.markdown-body ol ul { + margin-top: 0; + margin-bottom: 0; +} + +.markdown-body li>p { + margin-top: 16px; +} + +.markdown-body li+li { + margin-top: .25em; +} + +.markdown-body dl { + padding: 0; +} + +.markdown-body dl dt { + padding: 0; + margin-top: 16px; + font-size: 1em; + font-style: italic; + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body dl dd { + padding: 0 16px; + margin-bottom: 16px; +} + +.markdown-body table th { + font-weight: var(--base-text-weight-semibold, 600); +} + +.markdown-body table th, +.markdown-body table td { + padding: 6px 13px; + border: 1px solid var(--color-border-default); +} + +.markdown-body table td>:last-child { + margin-bottom: 0; +} + +.markdown-body table tr { + background-color: var(--color-canvas-default); + border-top: 1px solid var(--color-border-muted); +} + +.markdown-body table tr:nth-child(2n) { + background-color: var(--color-canvas-subtle); +} + +.markdown-body table img { + background-color: transparent; +} + +.markdown-body img[align=right] { + padding-left: 20px; +} + +.markdown-body img[align=left] { + padding-right: 20px; +} + +.markdown-body .emoji { + max-width: none; + vertical-align: text-top; + background-color: transparent; +} + +.markdown-body span.frame { + display: block; + overflow: hidden; +} + +.markdown-body span.frame>span { + display: block; + float: left; + width: auto; + padding: 7px; + margin: 13px 0 0; + overflow: hidden; + border: 1px 
solid var(--color-border-default); +} + +.markdown-body span.frame span img { + display: block; + float: left; +} + +.markdown-body span.frame span span { + display: block; + padding: 5px 0 0; + clear: both; + color: var(--color-fg-default); +} + +.markdown-body span.align-center { + display: block; + overflow: hidden; + clear: both; +} + +.markdown-body span.align-center>span { + display: block; + margin: 13px auto 0; + overflow: hidden; + text-align: center; +} + +.markdown-body span.align-center span img { + margin: 0 auto; + text-align: center; +} + +.markdown-body span.align-right { + display: block; + overflow: hidden; + clear: both; +} + +.markdown-body span.align-right>span { + display: block; + margin: 13px 0 0; + overflow: hidden; + text-align: right; +} + +.markdown-body span.align-right span img { + margin: 0; + text-align: right; +} + +.markdown-body span.float-left { + display: block; + float: left; + margin-right: 13px; + overflow: hidden; +} + +.markdown-body span.float-left span { + margin: 13px 0 0; +} + +.markdown-body span.float-right { + display: block; + float: right; + margin-left: 13px; + overflow: hidden; +} + +.markdown-body span.float-right>span { + display: block; + margin: 13px auto 0; + overflow: hidden; + text-align: right; +} + +.markdown-body code, +.markdown-body tt { + padding: .2em .4em; + margin: 0; + font-size: 85%; + white-space: break-spaces; + background-color: var(--color-neutral-muted); + border-radius: 6px; +} + +.markdown-body code br, +.markdown-body tt br { + display: none; +} + +.markdown-body del code { + text-decoration: inherit; +} + +.markdown-body samp { + font-size: 85%; +} + +.markdown-body pre code { + font-size: 100%; +} + +.markdown-body pre>code { + padding: 0; + margin: 0; + word-break: normal; + white-space: pre; + background: transparent; + border: 0; +} + +.markdown-body .highlight { + margin-bottom: 16px; +} + +.markdown-body .highlight pre { + margin-bottom: 0; + word-break: normal; +} + 
+.markdown-body .highlight pre, +.markdown-body pre { + padding: 16px; + overflow: auto; + font-size: 85%; + line-height: 1.45; + color: var(--color-fg-default); + background-color: var(--color-canvas-subtle); + border-radius: 6px; +} + +.markdown-body pre code, +.markdown-body pre tt { + display: inline; + max-width: auto; + padding: 0; + margin: 0; + overflow: visible; + line-height: inherit; + word-wrap: normal; + background-color: transparent; + border: 0; +} + +.markdown-body .csv-data td, +.markdown-body .csv-data th { + padding: 5px; + overflow: hidden; + font-size: 12px; + line-height: 1; + text-align: left; + white-space: nowrap; +} + +.markdown-body .csv-data .blob-num { + padding: 10px 8px 9px; + text-align: right; + background: var(--color-canvas-default); + border: 0; +} + +.markdown-body .csv-data tr { + border-top: 0; +} + +.markdown-body .csv-data th { + font-weight: var(--base-text-weight-semibold, 600); + background: var(--color-canvas-subtle); + border-top: 0; +} + +.markdown-body [data-footnote-ref]::before { + content: "["; +} + +.markdown-body [data-footnote-ref]::after { + content: "]"; +} + +.markdown-body .footnotes { + font-size: 12px; + color: var(--color-fg-muted); + border-top: 1px solid var(--color-border-default); +} + +.markdown-body .footnotes ol { + padding-left: 16px; +} + +.markdown-body .footnotes ol ul { + display: inline-block; + padding-left: 16px; + margin-top: 16px; +} + +.markdown-body .footnotes li { + position: relative; +} + +.markdown-body .footnotes li:target::before { + position: absolute; + top: -8px; + right: -8px; + bottom: -8px; + left: -24px; + pointer-events: none; + content: ""; + border: 2px solid var(--color-accent-emphasis); + border-radius: 6px; +} + +.markdown-body .footnotes li:target { + color: var(--color-fg-default); +} + +.markdown-body .footnotes .data-footnote-backref g-emoji { + font-family: monospace; +} + +.markdown-body .pl-c { + color: var(--color-prettylights-syntax-comment); +} + 
+.markdown-body .pl-c1, +.markdown-body .pl-s .pl-v { + color: var(--color-prettylights-syntax-constant); +} + +.markdown-body .pl-e, +.markdown-body .pl-en { + color: var(--color-prettylights-syntax-entity); +} + +.markdown-body .pl-smi, +.markdown-body .pl-s .pl-s1 { + color: var(--color-prettylights-syntax-storage-modifier-import); +} + +.markdown-body .pl-ent { + color: var(--color-prettylights-syntax-entity-tag); +} + +.markdown-body .pl-k { + color: var(--color-prettylights-syntax-keyword); +} + +.markdown-body .pl-s, +.markdown-body .pl-pds, +.markdown-body .pl-s .pl-pse .pl-s1, +.markdown-body .pl-sr, +.markdown-body .pl-sr .pl-cce, +.markdown-body .pl-sr .pl-sre, +.markdown-body .pl-sr .pl-sra { + color: var(--color-prettylights-syntax-string); +} + +.markdown-body .pl-v, +.markdown-body .pl-smw { + color: var(--color-prettylights-syntax-variable); +} + +.markdown-body .pl-bu { + color: var(--color-prettylights-syntax-brackethighlighter-unmatched); +} + +.markdown-body .pl-ii { + color: var(--color-prettylights-syntax-invalid-illegal-text); + background-color: var(--color-prettylights-syntax-invalid-illegal-bg); +} + +.markdown-body .pl-c2 { + color: var(--color-prettylights-syntax-carriage-return-text); + background-color: var(--color-prettylights-syntax-carriage-return-bg); +} + +.markdown-body .pl-sr .pl-cce { + font-weight: bold; + color: var(--color-prettylights-syntax-string-regexp); +} + +.markdown-body .pl-ml { + color: var(--color-prettylights-syntax-markup-list); +} + +.markdown-body .pl-mh, +.markdown-body .pl-mh .pl-en, +.markdown-body .pl-ms { + font-weight: bold; + color: var(--color-prettylights-syntax-markup-heading); +} + +.markdown-body .pl-mi { + font-style: italic; + color: var(--color-prettylights-syntax-markup-italic); +} + +.markdown-body .pl-mb { + font-weight: bold; + color: var(--color-prettylights-syntax-markup-bold); +} + +.markdown-body .pl-md { + color: var(--color-prettylights-syntax-markup-deleted-text); + background-color: 
var(--color-prettylights-syntax-markup-deleted-bg); +} + +.markdown-body .pl-mi1 { + color: var(--color-prettylights-syntax-markup-inserted-text); + background-color: var(--color-prettylights-syntax-markup-inserted-bg); +} + +.markdown-body .pl-mc { + color: var(--color-prettylights-syntax-markup-changed-text); + background-color: var(--color-prettylights-syntax-markup-changed-bg); +} + +.markdown-body .pl-mi2 { + color: var(--color-prettylights-syntax-markup-ignored-text); + background-color: var(--color-prettylights-syntax-markup-ignored-bg); +} + +.markdown-body .pl-mdr { + font-weight: bold; + color: var(--color-prettylights-syntax-meta-diff-range); +} + +.markdown-body .pl-ba { + color: var(--color-prettylights-syntax-brackethighlighter-angle); +} + +.markdown-body .pl-sg { + color: var(--color-prettylights-syntax-sublimelinter-gutter-mark); +} + +.markdown-body .pl-corl { + text-decoration: underline; + color: var(--color-prettylights-syntax-constant-other-reference-link); +} + +.markdown-body g-emoji { + display: inline-block; + min-width: 1ch; + font-family: "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol"; + font-size: 1em; + font-style: normal !important; + font-weight: var(--base-text-weight-normal, 400); + line-height: 1; + vertical-align: -0.075em; +} + +.markdown-body g-emoji img { + width: 1em; + height: 1em; +} + +.markdown-body .task-list-item { + list-style-type: none; +} + +.markdown-body .task-list-item label { + font-weight: var(--base-text-weight-normal, 400); +} + +.markdown-body .task-list-item.enabled label { + cursor: pointer; +} + +.markdown-body .task-list-item+.task-list-item { + margin-top: 4px; +} + +.markdown-body .task-list-item .handle { + display: none; +} + +.markdown-body .task-list-item-checkbox { + margin: 0 .2em .25em -1.4em; + vertical-align: middle; +} + +.markdown-body .contains-task-list:dir(rtl) .task-list-item-checkbox { + margin: 0 -1.6em .25em .2em; +} + +.markdown-body .contains-task-list { + position: 
relative; +} + +.markdown-body .contains-task-list:hover .task-list-item-convert-container, +.markdown-body .contains-task-list:focus-within .task-list-item-convert-container { + display: block; + width: auto; + height: 24px; + overflow: visible; + clip: auto; +} + +.markdown-body ::-webkit-calendar-picker-indicator { + filter: invert(50%); +} + +.markdown-body .markdown-alert { + padding: var(--base-size-8) var(--base-size-16); + margin-bottom: 16px; + color: inherit; + border-left: .25em solid var(--color-border-default); +} + +.markdown-body .markdown-alert>:first-child { + margin-top: 0; +} + +.markdown-body .markdown-alert>:last-child { + margin-bottom: 0; +} + +.markdown-body .markdown-alert .markdown-alert-title { + display: flex; + font-weight: var(--base-text-weight-medium, 500); + align-items: center; + line-height: 1; +} + +.markdown-body .markdown-alert.markdown-alert-note { + border-left-color: var(--color-accent-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-note .markdown-alert-title { + color: var(--color-accent-fg); +} + +.markdown-body .markdown-alert.markdown-alert-important { + border-left-color: var(--color-done-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-important .markdown-alert-title { + color: var(--color-done-fg); +} + +.markdown-body .markdown-alert.markdown-alert-warning { + border-left-color: var(--color-attention-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-warning .markdown-alert-title { + color: var(--color-attention-fg); +} + +.markdown-body .markdown-alert.markdown-alert-tip { + border-left-color: var(--color-success-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-tip .markdown-alert-title { + color: var(--color-success-fg); +} + +.markdown-body .markdown-alert.markdown-alert-caution { + border-left-color: var(--color-danger-emphasis); +} + +.markdown-body .markdown-alert.markdown-alert-caution .markdown-alert-title { + color: var(--color-danger-fg); +} \ No 
newline at end of file diff --git a/docs/index.html b/docs/index.html new file mode 100755 index 00000000..d1154b4d --- /dev/null +++ b/docs/index.html @@ -0,0 +1,1250 @@ + + + + + + + + + +Search Code By Comment + + + + + + + + + + + + + + + + + + + + + + + +++ + ++ +++ ++
++ Document index of: ++ ++ + +++ + + + + + \ No newline at end of file diff --git a/docs/metadata.json b/docs/metadata.json new file mode 100644 index 00000000..028a9215 --- /dev/null +++ b/docs/metadata.json @@ -0,0 +1,320 @@ +{ + "url": { + "full": "https://github.com/BlinkDL/RWKV-LM", + "partial": "BlinkDL/RWKV-LM" + }, + "file_mapping": { + "0": { + "filepath": "/README.md", + "entry_id": 0, + "language_id": "markdown" + }, + "1": { + "filepath": "/RWKV-v1/src/model.py", + "entry_id": 88, + "language_id": "python" + }, + "2": { + "filepath": "/RWKV-v1/src/trainer.py", + "entry_id": 132, + "language_id": "python" + }, + "3": { + "filepath": "/RWKV-v1/src/utils.py", + "entry_id": 146, + "language_id": "python" + }, + "4": { + "filepath": "/RWKV-v1/train.py", + "entry_id": 152, + "language_id": "python" + }, + "5": { + "filepath": "/RWKV-v2-RNN/cuda/timex_cuda.cu", + "entry_id": 168, + "language_id": "cuda" + }, + "6": { + "filepath": "/RWKV-v2-RNN/cuda/timex_op.cpp", + "entry_id": 182, + "language_id": "c++" + }, + "7": { + "filepath": "/RWKV-v2-RNN/run.py", + "entry_id": 188, + "language_id": "python" + }, + "8": { + "filepath": "/RWKV-v2-RNN/src/model.py", + "entry_id": 200, + "language_id": "python" + }, + "9": { + "filepath": "/RWKV-v2-RNN/src/model_run.py", + "entry_id": 228, + "language_id": "python" + }, + "10": { + "filepath": "/RWKV-v2-RNN/src/trainer.py", + "entry_id": 240, + "language_id": "python" + }, + "11": { + "filepath": "/RWKV-v2-RNN/src/utils.py", + "entry_id": 256, + "language_id": "python" + }, + "12": { + "filepath": "/RWKV-v2-RNN/train.py", + "entry_id": 266, + "language_id": "python" + }, + "13": { + "filepath": "/RWKV-v3/cuda/timex_cuda.cu", + "entry_id": 276, + "language_id": "cuda" + }, + "14": { + "filepath": "/RWKV-v3/cuda/timex_op.cpp", + "entry_id": 290, + "language_id": "c++" + }, + "15": { + "filepath": "/RWKV-v3/run.py", + "entry_id": 296, + "language_id": "python" + }, + "16": { + "filepath": "/RWKV-v3/src/model.py", + 
"entry_id": 306, + "language_id": "python" + }, + "17": { + "filepath": "/RWKV-v3/src/model_run.py", + "entry_id": 336, + "language_id": "python" + }, + "18": { + "filepath": "/RWKV-v3/src/trainer.py", + "entry_id": 360, + "language_id": "python" + }, + "19": { + "filepath": "/RWKV-v3/src/utils.py", + "entry_id": 376, + "language_id": "python" + }, + "20": { + "filepath": "/RWKV-v3/train.py", + "entry_id": 386, + "language_id": "python" + }, + "21": { + "filepath": "/RWKV-v3/verify.py", + "entry_id": 400, + "language_id": "python" + }, + "22": { + "filepath": "/RWKV-v4/cuda/wkv_cuda.cu", + "entry_id": 408, + "language_id": "cuda" + }, + "23": { + "filepath": "/RWKV-v4/cuda/wkv_op.cpp", + "entry_id": 420, + "language_id": "c++" + }, + "24": { + "filepath": "/RWKV-v4/run.py", + "entry_id": 426, + "language_id": "python" + }, + "25": { + "filepath": "/RWKV-v4/src/binidx.py", + "entry_id": 440, + "language_id": "python" + }, + "26": { + "filepath": "/RWKV-v4/src/model.py", + "entry_id": 456, + "language_id": "python" + }, + "27": { + "filepath": "/RWKV-v4/src/model_run.py", + "entry_id": 488, + "language_id": "python" + }, + "28": { + "filepath": "/RWKV-v4/src/trainer.py", + "entry_id": 518, + "language_id": "python" + }, + "29": { + "filepath": "/RWKV-v4/src/utils.py", + "entry_id": 536, + "language_id": "python" + }, + "30": { + "filepath": "/RWKV-v4/train.py", + "entry_id": 550, + "language_id": "python" + }, + "31": { + "filepath": "/RWKV-v4/verify.py", + "entry_id": 576, + "language_id": "python" + }, + "32": { + "filepath": "/RWKV-v4neo/chat.py", + "entry_id": 586, + "language_id": "python" + }, + "33": { + "filepath": "/RWKV-v4neo/cuda/wkv5_cuda.cu", + "entry_id": 612, + "language_id": "cuda" + }, + "34": { + "filepath": "/RWKV-v4neo/cuda/wkv5_op.cpp", + "entry_id": 626, + "language_id": "c++" + }, + "35": { + "filepath": "/RWKV-v4neo/cuda/wkv_cuda.cu", + "entry_id": 632, + "language_id": "cuda" + }, + "36": { + "filepath": "/RWKV-v4neo/cuda/wkv_cuda_bf16.cu", + 
"entry_id": 644, + "language_id": "cuda" + }, + "37": { + "filepath": "/RWKV-v4neo/cuda/wkv_op.cpp", + "entry_id": 656, + "language_id": "c++" + }, + "38": { + "filepath": "/RWKV-v4neo/cuda/wkv_op_bf16.cpp", + "entry_id": 662, + "language_id": "c++" + }, + "39": { + "filepath": "/RWKV-v4neo/img_demoAE.py", + "entry_id": 668, + "language_id": "python" + }, + "40": { + "filepath": "/RWKV-v4neo/math_demo/run.py", + "entry_id": 682, + "language_id": "python" + }, + "41": { + "filepath": "/RWKV-v4neo/run.py", + "entry_id": 696, + "language_id": "python" + }, + "42": { + "filepath": "/RWKV-v4neo/src/binidx.py", + "entry_id": 714, + "language_id": "python" + }, + "43": { + "filepath": "/RWKV-v4neo/src/dataset.py", + "entry_id": 734, + "language_id": "python" + }, + "44": { + "filepath": "/RWKV-v4neo/src/model.py", + "entry_id": 760, + "language_id": "python" + }, + "45": { + "filepath": "/RWKV-v4neo/src/model_img.py", + "entry_id": 844, + "language_id": "python" + }, + "46": { + "filepath": "/RWKV-v4neo/src/model_run.py", + "entry_id": 878, + "language_id": "python" + }, + "47": { + "filepath": "/RWKV-v4neo/src/trainer.py", + "entry_id": 898, + "language_id": "python" + }, + "48": { + "filepath": "/RWKV-v4neo/src/utils.py", + "entry_id": 922, + "language_id": "python" + }, + "49": { + "filepath": "/RWKV-v4neo/train.py", + "entry_id": 934, + "language_id": "python" + }, + "50": { + "filepath": "/RWKV-v4neo/verify.py", + "entry_id": 974, + "language_id": "python" + }, + "51": { + "filepath": "/RWKV-v5/cuda/wkv5_cuda.cu", + "entry_id": 984, + "language_id": "cuda" + }, + "52": { + "filepath": "/RWKV-v5/cuda/wkv5_op.cpp", + "entry_id": 998, + "language_id": "c++" + }, + "53": { + "filepath": "/RWKV-v5/demo.jsonl", + "entry_id": 1004, + "language_id": "text" + }, + "54": { + "filepath": "/RWKV-v5/make_data.py", + "entry_id": 1604, + "language_id": "python" + }, + "55": { + "filepath": "/RWKV-v5/src/binidx.py", + "entry_id": 1616, + "language_id": "python" + }, + "56": { + 
"filepath": "/RWKV-v5/src/dataset.py", + "entry_id": 1636, + "language_id": "python" + }, + "57": { + "filepath": "/RWKV-v5/src/model.py", + "entry_id": 1656, + "language_id": "python" + }, + "58": { + "filepath": "/RWKV-v5/src/trainer.py", + "entry_id": 1706, + "language_id": "python" + }, + "59": { + "filepath": "/RWKV-v5/src/utils.py", + "entry_id": 1730, + "language_id": "python" + }, + "60": { + "filepath": "/RWKV-v5/tokenizer/rwkv_tokenizer.py", + "entry_id": 1742, + "language_id": "python" + }, + "61": { + "filepath": "/RWKV-v5/train.py", + "entry_id": 1750, + "language_id": "python" + } + }, + "project_name": "RWKV-LM", + "split_count": 18 +} \ No newline at end of file diff --git a/docs/metadata_title.json b/docs/metadata_title.json new file mode 100644 index 00000000..fc9a5c4a --- /dev/null +++ b/docs/metadata_title.json @@ -0,0 +1 @@ +{"split_count": 3} \ No newline at end of file diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 00000000..c551224c --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,385 @@ + + +
+ + + + \ No newline at end of file diff --git a/docs/src/README.md b/docs/src/README.md new file mode 100644 index 00000000..b24dd10c --- /dev/null +++ b/docs/src/README.md @@ -0,0 +1,785 @@ +# The RWKV Language Model (and my LM tricks) + +> RWKV homepage: https://www.rwkv.com/ https://wiki.rwkv.com/ + +### HOW TO TEST TRAINING RWKV-5 on MiniPile (1.5G tokens) ## +Use python 3.10 and cuda 11.7.1 / 11.7 (note torch2 + cuda12 has weird bugs and hurts model performance). +``` +pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117 +pip install pytorch-lightning==1.9.5 deepspeed==0.7.0 wandb ninja +cd RWKV-v5/ +./demo-training-prepare.sh +./demo-training-run.sh +(you may want to log in to wandb first) +``` +Your loss curve should look almost exactly the same as this, with the same ups and downs (if you use the same bsz & config): + + + +You can run your model using https://pypi.org/project/rwkv/ (use "rwkv_vocab_v20230424" instead of "20B_tokenizer.json") + +### HOW TO FINETUNE RWKV-5 MODELS ## + +Use .jsonl format for your data (see https://huggingface.co/BlinkDL/rwkv-5-world for formats). + +Use https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v5/make_data.py to tokenize it into binidx suitable for training. + +## RWKV: Parallelizable RNN with Transformer-level LLM Performance (pronounced as "RwaKuv", from 4 major params: R W K V) + +RWKV is an RNN with Transformer-level LLM performance, which can also be directly trained like a GPT transformer (parallelizable). And it's 100% attention-free. You only need the hidden state at position t to compute the state at position t+1. You can use the "GPT" mode to quickly compute the hidden state for the "RNN" mode. + +So it's combining the best of RNN and transformer - **great performance, fast inference, saves VRAM, fast training, "infinite" ctx_len, and free sentence embedding** (using the final hidden state). 
+ +Our latest version is **RWKV-6**, which is easily Mamba level, and simpler ;) https://twitter.com/BlinkDL_AI/status/1732791817073229881 https://twitter.com/BlinkDL_AI/status/1713967928209752128 (Preview models: https://huggingface.co/BlinkDL/temp ) + +**RWKV-5 World v2 1.5B** Demo: https://huggingface.co/spaces/BlinkDL/RWKV-Gradio-1 + +**RWKV-5 World v2 3B** Demo: https://huggingface.co/spaces/BlinkDL/RWKV-Gradio-2 + +![RWKV-v5-benchmark-1](RWKV-v5-benchmark-1.png) + +**RWKV Runner GUI** https://github.com/josStorer/RWKV-Runner with one-click install and API + +**Raw cutting-edge RWKV weights:** https://huggingface.co/BlinkDL + +**HF-compatible RWKV weights:** https://huggingface.co/RWKV + +**RWKV pip package**: https://pypi.org/project/rwkv/ + +```python +os.environ["RWKV_JIT_ON"] = '1' +os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster) +from rwkv.model import RWKV # pip install rwkv +model = RWKV(model='/fsx/BlinkDL/HF-MODEL/rwkv-4-pile-1b5/RWKV-4-Pile-1B5-20220903-8040', strategy='cuda fp16') + +out, state = model.forward([187, 510, 1563, 310, 247], None) # use 20B_tokenizer.json +print(out.detach().cpu().numpy()) # get logits +out, state = model.forward([187, 510], None) +out, state = model.forward([1563], state) # RNN has state (use deepcopy if you want to clone it) +out, state = model.forward([310, 247], state) +print(out.detach().cpu().numpy()) # same result as above +``` + +**nanoRWKV**: https://github.com/BlinkDL/nanoRWKV (does not require custom CUDA kernel to train, works for any GPU/CPU) + +## RWKV Discord: https://discord.gg/bDSBUMeFpc (7k+ members) + +**Twitter**: https://twitter.com/BlinkDL_AI + +**Homepage**: https://www.rwkv.com/ + +**Cool Community RWKV Projects**: + +All (200+) RWKV projects: https://github.com/search?o=desc&q=rwkv&s=updated&type=Repositories + +https://github.com/cgisky1980/ai00_rwkv_server Fastest GPU inference API with vulkan (good for nvidia/amd/intel), supports rwkv5 + 
+https://github.com/cryscan/web-rwkv backend for ai00_rwkv_server, supports rwkv5 + +https://github.com/saharNooby/rwkv.cpp Fast CPU/cuBLAS/CLBlast inference: int4/int8/fp16/fp32, supports rwkv5 + +https://github.com/daquexian/faster-rwkv supports rwkv5 + +https://github.com/mlc-ai/mlc-llm/pull/1275 supports rwkv5 + +https://github.com/RWKV/RWKV-infctx-trainer Infctx trainer + +https://github.com/Blealtan/RWKV-LM-LoRA LoRA finetuning + +https://github.com/TheRamU/Fay/blob/main/README_EN.md Digital Assistant with RWKV + +https://github.com/harrisonvanderbyl/rwkv-cpp-cuda Fast GPU inference with cuda/amd/vulkan + +**RWKV v4 in 150 lines** (model, inference, text generation): https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_in_150_lines.py + +**🔥 RWKV v5 in 250 lines 🔥** (with tokenizer too): https://github.com/BlinkDL/ChatRWKV/blob/main/RWKV_v5_demo.py + +**RWKV v4 preprint** https://arxiv.org/abs/2305.13048 + +![RWKV-paper](RWKV-paper.png) + +**RWKV v4 introduction, and in 100 lines of numpy**: https://johanwind.github.io/2023/03/23/rwkv_overview.html https://johanwind.github.io/2023/03/23/rwkv_details.html + +RWKV v6 illustrated: + +![RWKV-v6](rwkv-x060.png) + +A cool paper (Spiking Neural Network) using RWKV: https://github.com/ridgerchu/SpikeGPT + +You are welcome to join the RWKV discord https://discord.gg/bDSBUMeFpc to build upon it. We have plenty of potential compute (A100 40Gs) now (thanks to Stability and EleutherAI), so if you have interesting ideas I can run them. + +![RWKV-eval2](RWKV-eval2.png) + +RWKV [loss vs token position] for 10000 ctx4k+ documents in Pile. RWKV 1B5-4k is mostly flat after ctx1500, but 3B-4k and 7B-4k and 14B-4k have some slopes, and they are getting better. This debunks the old view that RNNs cannot model long ctxlens. 
We can predict that RWKV 100B will be great, and RWKV 1T is probably all you need :) + +![RWKV-ctxlen](RWKV-ctxlen.png) + +ChatRWKV with RWKV 14B ctx8192: + +![RWKV-chat](RWKV-chat.png) + +I believe RNN is a better candidate for fundamental models, because: (1) It's more friendly for ASICs (no kv cache). (2) It's more friendly for RL. (3) When we write, our brain is more similar to RNN. (4) The universe is like an RNN too (because of locality). Transformers are non-local models. + +RWKV-3 1.5B on A40 (tf32) = always 0.015 sec/token, tested using simple pytorch code (no CUDA), GPU utilization 45%, VRAM 7823M + +GPT2-XL 1.3B on A40 (tf32) = 0.032 sec/token (for ctxlen 1000), tested using HF, GPU utilization 45% too (interesting), VRAM 9655M + +Training speed: (new training code) RWKV-4 14B BF16 ctxlen4096 = 114K tokens/s on 8x8 A100 80G (ZERO2+CP). (old training code) RWKV-4 1.5B BF16 ctxlen1024 = 106K tokens/s on 8xA100 40G. + +I am doing image experiments too (For example: https://huggingface.co/BlinkDL/clip-guided-binary-autoencoder) and RWKV will be able to do txt2img diffusion :) My idea: 256x256 rgb image -> 32x32x13bit latents -> apply RWKV to compute transition probability for each of the 32x32 grid -> pretend the grids are independent and "diffuse" using these probabilities. + +Smooth training - no loss spikes! (lr & bsz change around 15G tokens) +![RWKV-loss](RWKV-loss.png) + +![RWKV-eval](RWKV-eval.png) + +All of the trained models will be open-source. Inference is very fast (only matrix-vector multiplications, no matrix-matrix multiplications) even on CPUs, so you can even run a LLM on your phone. + +How it works: RWKV gathers information to a number of channels, which are also decaying with different speeds as you move to the next token. It's very simple once you understand it. + +**RWKV is parallelizable because the time-decay of each channel is data-independent (and trainable)**. 
For example, in usual RNN you can adjust the time-decay of a channel from say 0.8 to 0.5 (these are called "gates"), while in RWKV you simply move the information from a W-0.8-channel to a W-0.5-channel to achieve the same effect. Moreover, you can fine-tune RWKV into a non-parallelizable RNN (then you can use outputs of later layers of the previous token) if you want extra performance. + +![RWKV-formula](RWKV-formula.png) + +Here are some of my TODOs. Let's work together :) + +* HuggingFace integration (check https://github.com/huggingface/transformers/issues/17230 +), and optimized CPU & iOS & Android & WASM & WebGL inference. RWKV is a RNN and very friendly for edge devices. Let's make it possible to run a LLM on your phone. + +* Test it on bidirectional & MLM tasks, and image & audio & video tokens. I think RWKV can support Encoder-Decoder via this: for each decoder token, use a learned mixture of [decoder previous hidden state] & [encoder final hidden state]. Hence all decoder tokens will have access to the encoder output. + +* Now training RWKV-4a with one single tiny extra attention (just a few extra lines comparing with RWKV-4) to further improve some difficult zeroshot tasks (such as LAMBADA) for smaller models. See https://github.com/BlinkDL/RWKV-LM/commit/a268cd2e40351ee31c30c5f8a5d1266d35b41829 + +User feedback: +> *I've so far toyed around the character-based model on our relatively small pre-training dataset (around 10GB of text), and the results are extremely good - similar ppl to models taking much, much longer to train.* + +> *dear god rwkv is fast. 
i switched to another tab after starting training it from scratch & when i returned it was emitting plausible english & maori words, i left to go microwave some coffee & when i came back it was producing fully grammatically correct sentences.* + +Tweet from Sepp Hochreiter (thank you!): https://twitter.com/HochreiterSepp/status/1524270961314484227 + +You can find me (BlinkDL) in the EleutherAI Discord too: https://www.eleuther.ai/get-involved/ + +![RWKV-demo](RWKV-demo.png) + +## Quick start + +**IMPORTANT: Use deepspeed==0.7.0 pytorch-lightning==1.9.5 torch==1.13.1+cu117 and cuda 11.7.1 or 11.7 (note torch2 + deepspeed has weird bugs and hurts model performance)** + +Use https://github.com/BlinkDL/RWKV-LM/tree/main/RWKV-v4neo (latest code, compatible with v4). + +Here is a great prompt for testing Q&A of LLMs. Works for any model: (found by minimizing ChatGPT ppls for RWKV 1.5B) +```python +prompt = f'\nQ & A\n\nQuestion:\n{qq}\n\nDetailed Expert Answer:\n' # let the model generate after this +``` + +### Inference + +**Run RWKV-4 Pile models:** Download models from https://huggingface.co/BlinkDL. Set TOKEN_MODE = 'pile' in run.py and run it. It's fast even on CPU (the default mode). + +**Colab for RWKV-4 Pile 1.5B**: https://colab.research.google.com/drive/1F7tZoPZaWJf1fsCmZ5tjw6sYHiFOYVWM + +Run RWKV-4 Pile models in your browser (and onnx version): see this issue https://github.com/BlinkDL/RWKV-LM/issues/7 + +RWKV-4 Web Demo: https://josephrocca.github.io/rwkv-v4-web/demo/ (note: only greedy sampling for now) + +For the old RWKV-2: see the release here for a 27M params model on enwik8 with 0.72 BPC(dev). Run run.py in https://github.com/BlinkDL/RWKV-LM/tree/main/RWKV-v2-RNN. You can even run it in your browser: https://github.com/BlinkDL/AI-Writer/tree/main/docs/eng https://blinkdl.github.io/AI-Writer/eng/ (this is using tf.js WASM single-thread mode). 
+ +### Training / Fine-tuning + +pip install deepspeed==0.7.0 // pip install pytorch-lightning==1.9.5 // torch 1.13.1+cu117 + +NOTE: add weight decay (0.1 or 0.01) and dropout (0.1 or 0.01) when training on small amt of data. try x=x+dropout(att(x)) x=x+dropout(ffn(x)) x=dropout(x+att(x)) x=dropout(x+ffn(x)) etc. + +**Training RWKV-4 from scratch:** run train.py, which by default is using the enwik8 dataset (unzip https://data.deepai.org/enwik8.zip). + +You will be training the "GPT" version because it's parallelizable and faster to train. RWKV-4 can extrapolate, so training with ctxLen 1024 can work for ctxLen of 2500+. You can fine-tune the model with longer ctxLen and it can quickly adapt to longer ctxLens. + +**Fine-tuning RWKV-4 Pile models:** use 'prepare-data.py' in https://github.com/BlinkDL/RWKV-v2-RNN-Pile/tree/main/RWKV-v3 to tokenize .txt into train.npy data. Then use https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v4neo/train.py to train it. + +Read the inference code in src/model.py and try using the final hidden state(.xx .aa .bb) as a faithful sentence embedding for other tasks. Probably you should begin with .xx and .aa/.bb (.aa divided by .bb). + +Colab for fine-tuning RWKV-4 Pile models: https://colab.research.google.com/github/resloved/RWKV-notebooks/blob/master/RWKV_v4_RNN_Pile_Fine_Tuning.ipynb + +**Large corpus:** Use https://github.com/Abel2076/json2binidx_tool to convert .jsonl into .bin and .idx + +The jsonl format sample (one line for each document): +``` +{"text": "This is the first document."} +{"text": "Hello\nWorld"} +{"text": "1+1=2\n1+2=3\n2+2=4"} +``` +generated by code like this: +``` +ss = json.dumps({"text": text}, ensure_ascii=False) +out.write(ss + "\n") +``` + +**Infinite ctxlen training (WIP):** https://github.com/Blealtan/RWKV-LM-LoRA/tree/dev-infctx + +### How to use RWKV hidden state as text embedding + +Consider RWKV 14B. 
The state has 200 vectors, that is, 5 vectors for each block: fp16 (xx), fp32 (aa), fp32 (bb), fp32 (pp), fp16 (xx). + +Do not avg pool because different vectors (xx aa bb pp xx) in the state have very different meanings and ranges. You can probably remove pp. + +I suggest firstly collect the mean+stdev statistics of each channel of each vector, and normalize all of them (note: the normalization should be data-independent and collected from various texts). Then train a linear classifier. + +## Towards RWKV-5 (just to record some new ideas) + +### Latest Design + +RWKV-5 is multi-head and here shows one head. There is also a LayerNorm for each head (hence actually GroupNorm). + +$` +\begin{array}{|l|l|l|} +\hline & \text { RWKV-4 with real-valued } k \,\&\, v \,\&\, u \,\&\, w & \text { RWKV-5 with matrix-valued } \mathrm{k}^{\dagger} \mathrm{v} \,\&\, \mathrm{u} \,\&\, \mathrm{w} \\ +\hline \mathrm{y}_0 & \mathrm{r}_0 \frac{\mathrm{uk}_0 \mathrm{v}_0}{\mathrm{uk}_0} & \mathrm{r}_0\left(\mathrm{uk}_0^{\dagger} \mathrm{v}_0\right) \\ +\hline \mathrm{y}_1 & \mathrm{r}_1 \frac{\mathrm{uk}_1 \mathrm{v}_1+\mathrm{k}_0 \mathrm{v}_0}{\mathrm{uk}_1+\mathrm{k}_0} & \mathrm{r}_1\left(\mathrm{uk}_1^{\dagger} \mathrm{v}_1+\mathrm{k}_0^{\dagger} \mathrm{v}_0\right) \\ +\hline \mathrm{y}_2 & \mathrm{r}_2 \frac{\mathrm{uk}_2 \mathrm{v}_2+\mathrm{k}_1 \mathrm{v}_1+\mathrm{wk}_0 \mathrm{v}_0}{\mathrm{uk}_2+\mathrm{k}_1+\mathrm{wk}_0} & \mathrm{r}_2\left(\mathrm{uk}_2^{\dagger} \mathrm{v}_2+\mathrm{k}_1^{\dagger} \mathrm{v}_1+\mathrm{wk}_0^{\dagger} \mathrm{v}_0\right) \\ +\hline \mathrm{y}_3 & \mathrm{r}_3 \frac{\mathrm{uk}_3 \mathrm{v}_3+\mathrm{k}_2 \mathrm{v}_2+\mathrm{wk}_1 \mathrm{v}_1+\mathrm{w}^2 \mathrm{k}_0 \mathrm{v}_0}{\mathrm{uk}_3+\mathrm{k}_2+\mathrm{wk}_1+\mathrm{w}^2 \mathrm{k}_0} & \mathrm{r}_3\left(\mathrm{uk}_3^{\dagger} \mathrm{v}_3+\mathrm{k}_2^{\dagger} \mathrm{v}_2+\mathrm{wk}_1^{\dagger} \mathrm{v}_1+\mathrm{w}^2 \mathrm{k}_0^{\dagger} \mathrm{v}_0\right) \\ 
+\hline +\end{array}`$ + +$`\left[\begin{array}{ll} +\mathrm{y}_{20} & \cdots \mathrm{y}_{2 \mathrm{c}} +\end{array}\right]=\left[\begin{array}{lll} +\mathrm{r}_{20} & \cdots & \mathrm{r}_{2 \mathrm{c}} +\end{array}\right]`$ +$`\left(\left[\begin{array}{ccc} +\mathrm{u}_{00} & \cdots & \mathrm{u}_{0 \mathrm{c}} \\ +\vdots & \ddots & \vdots \\ +\mathrm{u}_{\mathrm{c} 0} & \cdots & \mathrm{u}_{\mathrm{cc}} +\end{array}\right]\left[\begin{array}{ccc} +\mathrm{k}_{20} \mathrm{v}_{20} & \cdots & \mathrm{k}_{20} \mathrm{v}_{2 \mathrm{c}} \\ +\vdots & \ddots & \vdots \\ +\mathrm{k}_{2 \mathrm{c}} \mathrm{v}_{20} & \cdots & \mathrm{k}_{2 \mathrm{c}} \mathrm{v}_{2 \mathrm{c}} +\end{array}\right]+\left[\begin{array}{ccc} +\mathrm{k}_{10} \mathrm{v}_{10} & \cdots & \mathrm{k}_{10} \mathrm{v}_{1 \mathrm{c}} \\ +\vdots & \ddots & \vdots \\ +\mathrm{k}_{1 \mathrm{c}} \mathrm{v}_{10} & \cdots & \mathrm{k}_{1 \mathrm{c}} \mathrm{v}_{1 \mathrm{c}} +\end{array}\right]+\left[\begin{array}{ccc} +\mathrm{w}_{00} & \cdots & \mathrm{w}_{0 \mathrm{c}} \\ +\vdots & \ddots & \vdots \\ +\mathrm{w}_{\mathrm{c} 0} & \cdots & \mathrm{w}_{\mathrm{cc}} +\end{array}\right]\left[\begin{array}{ccc} +\mathrm{k}_{00} \mathrm{v}_{00} & \cdots & \mathrm{k}_{00} \mathrm{v}_{0 c} \\ +\vdots & \ddots & \vdots \\ +\mathrm{k}_{0 \mathrm{c}} \mathrm{v}_{00} & \cdots & \mathrm{k}_{0 \mathrm{c}} \mathrm{v}_{0 c} +\end{array}\right] +\right)`$ + +### RWKV-6 + +Dynamic Mix & Dynamic Decay. 
Example (do this for both TimeMix & ChannelMix): +``` +TIME_MIX_EXTRA_DIM = 32 +self.time_mix_k_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01)) +self.time_mix_k_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd)) +self.time_mix_v_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01)) +self.time_mix_v_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd)) +self.time_mix_r_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01)) +self.time_mix_r_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd)) +self.time_mix_g_w1 = nn.Parameter(torch.empty(args.n_embd, TIME_MIX_EXTRA_DIM).uniform_(-0.01, 0.01)) +self.time_mix_g_w2 = nn.Parameter(torch.zeros(TIME_MIX_EXTRA_DIM, args.n_embd)) +... +time_mix_k = self.time_mix_k.view(1,1,-1) + (x @ self.time_mix_k_w1) @ self.time_mix_k_w2 +time_mix_v = self.time_mix_v.view(1,1,-1) + (x @ self.time_mix_v_w1) @ self.time_mix_v_w2 +time_mix_r = self.time_mix_r.view(1,1,-1) + (x @ self.time_mix_r_w1) @ self.time_mix_r_w2 +time_mix_g = self.time_mix_g.view(1,1,-1) + (x @ self.time_mix_g_w1) @ self.time_mix_g_w2 + +xx = self.time_shift(x) +xk = x * time_mix_k + xx * (1 - time_mix_k) +xv = x * time_mix_v + xx * (1 - time_mix_v) +xr = x * time_mix_r + xx * (1 - time_mix_r) +xg = x * time_mix_g + xx * (1 - time_mix_g) +``` + +![RWKV-v6](RWKV-v6.png) + +### RWKV-7 + +Use parallelized mode to quickly generate the state, then use a finetuned full RNN (the layers of token n can use outputs of all layer of token n-1) for sequential generation. + +### Some old ideas + +1. Now time decay is like 0.999^T (0.999 is learnable). Change it to something like (0.999^T + 0.1) where 0.1 is learnable too. The 0.1 part will be kept forever. Or, A^T + B^T + C = fast-decay + slow-decay + constant. Can even use different formulas (for example, K^2 instead of e^K for a decay component, or, without normalization). + +2. 
Use complex-valued decay (so, rotation instead of decay) in some channels. + +3. Inject some trainable and extrapolatable positional encoding? + +4. Aside from 2d rotation, we can try other Lie groups such as 3d rotation ( SO(3) ). Non-abelian RWKV lol. + +5. RWKV might be great on analog devices (search for Analog Matrix-vector multiplication & Photonic Matrix-vector multiplication). The RNN mode is very hardware-friendly (processing-in-memory). Can be a SNN too (https://github.com/ridgerchu/SpikeGPT). I wonder if it can be optimized for quantum computation. + +6. Trainable initial hidden state (xx aa bb pp xx). + +7. Layerwise (or even row/column-wise, elementwise) LR, and test Lion optimizer. + +### Vision Tasks + +1. I find it's good to add a 2d pos encoding: +``` +self.pos_emb_x = nn.Parameter(torch.zeros((1,args.my_pos_emb,args.n_embd))) +self.pos_emb_y = nn.Parameter(torch.zeros((args.my_pos_emb,1,args.n_embd))) +... +x = x + pos_emb_x + pos_emb_y +``` + +2. In a BPE language model, it's the best to use [tokenShift of 1 token] (you can mix more tokens in a char-level English model). However you can try [tokenShift of N (or N-1) (or N+1) tokens] if the image size is N x N, because that will be like mixing [the token above the current position (or the token above the to-be-predicted position)] with [current token]. You can try different tokenShift styles for "ATT" & "FFN", or mixing different tokenShift styles - such as mixing [token A] with [token A-1] and [token A-(N-1)] etc. + +### Misc + +Maybe we can improve memorization by simply repeating the context (I guess 2 times is enough). Example: Reference -> Reference(again) -> Question -> Answer + +#### Idea: Bytes-aware Embedding + +The idea is to make sure each token in vocab understands its length and raw UTF-8 bytes. + +Let a = max(len(token)) for all token in vocab. Define AA : float[a][d_emb] + +Let b = max(len_in_utf8_bytes(token)) for all token in vocab. 
Define BB : float[b][256][d_emb] + +For each token X in vocab, let [x0, x1, ..., xn] be its raw UTF-8 bytes. We will add some extra values to its embedding EMB(X): + +EMB(X) += AA[len(X)] + BB[0][x0] + BB[1][x1] + ... + BB[n][xn] (note: AA BB are learnable weights) + +* We can do this for the final Linear(d_emb, n_vocab) projection too. +* We can use some small networks to generate AA and BB, for some extra regularization (for example, BB[m][xi] and BB[n][xi] should be related). + +#### Old Idea + +I have an idea to improve tokenization. We can hardcode some channels to have meanings. Example: + +Channel 0 = "space" + +Channel 1 = "capitalize first letter" + +Channel 2 = "capitalize all letters" + +Therefore: + +Embedding of "abc": [0, 0, 0, x0, x1, x2 , ..] + +Embedding of " abc": [1, 0, 0, x0, x1, x2, ..] + +Embedding of " Abc": [1, 1, 0, x0, x1, x2, ..] + +Embedding of "ABC": [0, 0, 1, x0, x1, x2, ...] + +...... + +so they will share most of the embedding. And we can rapidly compute the output probability of all variations of "abc". + +Note: the above method is assuming that p(" xyz") / p("xyz") is the same for any "xyz", which can be wrong. + +Better: define emb_space emb_capitalize_first emb_capitalize_all to be a function of emb. + +Maybe the Best: let 'abc' ' abc' etc. to share the last 90% of their embeddings. + +At this moment, all our tokenizers spend too many items to represent all variations of 'abc' ' abc' ' Abc' etc. Moreover the model cannot discover that these are actually similar if some of these variations are rare in the dataset. The method here can improve this. I plan to test this in a new version of RWKV. + +#### Idea: Better Initial States + +Example (single-round Q & A): + +1. Generate the final state of all wiki documents. + +2. For any user Q, find the best wiki document, and use its final state as the initial state. + +3. Train a model to directly generate the optimal initial state for any user Q. 
+ +However this can be a bit more tricky for multi-round Q & A :) + +## How it works + +RWKV is inspired by Apple's AFT (https://arxiv.org/abs/2105.14103). + +Moreover it's using a number of my tricks, such as: + +* SmallInitEmb: https://github.com/BlinkDL/SmallInitEmb (applicable to all transformers) which helps the embedding quality, and stabilizes Post-LN (which is what I am using). + +* Token-shift: https://github.com/BlinkDL/RWKV-LM#token-shift-time-shift-mixing (applicable to all transformers), especially helpful for char-level models. + +* Head-QK: https://github.com/BlinkDL/RWKV-LM#the-head-qk-trick-learning-to-copy-and-avoid-tokens (applicable to all transformers). Note: it's helpful, but I disabled it in the Pile model to keep it 100% RNN. + +* Extra R-gate in the FFN (applicable to all transformers). I am also using reluSquared from Primer. + +* Better initialization: I init most of the matrices to ZERO (see RWKV_Init in https://github.com/BlinkDL/RWKV-LM/blob/main/RWKV-v2-RNN/src/model.py). + +* You can transfer some parameters from a small model to a large model (note: I sort & smooth them too), for faster and better convergence (see https://www.reddit.com/r/MachineLearning/comments/umq908/r_rwkvv2rnn_a_parallelizable_rnn_with/). + +* My CUDA kernel: https://github.com/BlinkDL/RWKV-CUDA to speed up training. + +## The pseudocode (execution from top to bottom): + +![RWKV-v2-RNN](RWKV-v2-RNN.png) + +The a b c d factors work together to build a time-decay curve: [X, 1, W, W^2, W^3, ...]. + +Write out the formulas for "token at pos 2" and "token at pos 3" and you will get the idea: +* a and b: EMAs of kv and k. +* c and d: these are a and b combined with "self-attention". + +kv / k is the memory mechanism. The token with high k can be remembered for a long duration, if W is close to 1 in the channel. + +The R-gate is important for performance. k = info strength of this token (to be passed to future tokens). r = whether to apply the info to this token. 
+ +## RWKV-3 improvements + +Use different trainable TimeMix factors for R / K / V in SA and FF layers. Example: +```python +xx = self.time_shift(x) +xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) +xv = x * self.time_mix_v + xx * (1 - self.time_mix_v) +xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) +``` + +Use preLN instead of postLN (more stable & faster convergence): +```python +if self.layer_id == 0: + x = self.ln0(x) +x = x + self.att(self.ln1(x)) +x = x + self.ffn(self.ln2(x)) +``` + +## Explaining the code for RWKV-3 GPT mode + +### The GPT mode - overview + +The building blocks of RWKV-3 GPT mode are similar to that of a usual preLN GPT. + +The only difference is an extra LN after embedding. Note you can absorb this LN into the embedding after finishing the training. +```python +x = self.emb(idx) # input: idx = token indices +x = self.ln_emb(x) # extra LN after embedding +x = x + self.att_0(self.ln_att_0(x)) # preLN +x = x + self.ffn_0(self.ln_ffn_0(x)) +... +x = x + self.att_n(self.ln_att_n(x)) +x = x + self.ffn_n(self.ln_ffn_n(x)) +x = self.ln_head(x) # final LN before projection +x = self.head(x) # output: x = logits +``` +It is important to initialize emb to tiny values, such as nn.init.uniform_(a=-1e-4, b=1e-4), to utilize my trick https://github.com/BlinkDL/SmallInitEmb. + +For the 1.5B RWKV-3, I use Adam (no wd, no dropout) optimizer on 8 * A100 40G. + +batchSz = 32 * 896, ctxLen = 896. I am using tf32 so the batchSz is a bit small. + +For the first 15B tokens, LR is fixed at 3e-4, and beta=(0.9, 0.99). + +Then I set beta=(0.9, 0.999), and do an exponential decay of LR, reaching 1e-5 at 332B tokens. + +### The GPT mode - ATT block + +The RWKV-3 does not have any attention in the usual sense, but we will call this block ATT anyway. 
+```python +B, T, C = x.size() # x = (Batch,Time,Channel) + +# Mix x with the previous timestep to produce xk, xv, xr +xx = self.time_shift(x) # self.time_shift = nn.ZeroPad2d((0,0,1,-1)) +xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) +xv = x * self.time_mix_v + xx * (1 - self.time_mix_v) +xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) + +# Use xk, xv, xr to produce k, v, r +k = self.key(xk).transpose(-1, -2) +v = self.value(xv).transpose(-1, -2) +r = self.receptance(xr) +k = torch.clamp(k, max=60) # clamp k to avoid overflow +k = torch.exp(k) +kv = k * v + +# Compute the W-curve = [e^(-n * e^time_decay), e^(-(n-1) * e^time_decay), ..., 1, e^(time_first)] +self.time_w = torch.cat([torch.exp(self.time_decay) * self.time_curve.to(x.device), self.time_first], dim=-1) +w = torch.exp(self.time_w) + +# Use W to mix kv and k respectively. Add K_EPS to wk to avoid divide-by-zero +if RUN_DEVICE == 'cuda': + wkv = TimeX.apply(w, kv, B,C,T, 0) + wk = TimeX.apply(w, k, B,C,T, K_EPS) +else: + w = w[:,-T:].unsqueeze(1) + wkv = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(kv), w, groups=C) + wk = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w, groups=C) + K_EPS + +# The RWKV formula +rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2) +rwkv = self.output(rwkv) # final output projection +``` + +The self.key, self.receptance, self.output matrices are all initialized to zero. + +The time_mix, time_decay, time_first vectors are transferred from a smaller trained model (note: I sort & smooth them too). + +### The GPT mode - FFN block + +The FFN block has three tricks comparing with the usual GPT: + +1. My time_mix trick. + +2. The sqReLU from the Primer paper. + +3. An extra receptance-gate (similar to the receptance-gate in ATT block). 
+```python +# Mix x with the previous timestep to produce xk, xr +xx = self.time_shift(x) +xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) +xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) + +# The usual FFN operation +k = self.key(xk) +k = torch.square(torch.relu(k)) # from the Primer paper +kv = self.value(k) + +# Apply an extra receptance-gate to kv +rkv = torch.sigmoid(self.receptance(xr)) * kv +return rkv +``` +The self.value, self.receptance matrices are all initialized to zero. + +## RWKV-4 improvements + +![RWKV-v3-plan](RWKV-v3-plan.png) + +## From GPT to RWKV (the formulas) + +Let F[t] be the system state at t. + +Let x[t] be the new external input at t. + +In GPT, predicting F[t+1] requires considering F[0], F[1], .. F[t]. So it takes O(T^2) to generate a length T sequence. + +The **simplified formula** for GPT: + +![F[\mathrm{t}+1]=\frac{\sum_{\mathrm{i}=0}^{\mathrm{t}} \exp (\mathbf{Q}x[\mathrm{t}] * \mathbf{K}F[\mathrm{i}]) \cdot(\mathbf{V}F[\mathrm{i}])}{\sum_{\mathrm{i}=0}^{\mathrm{t}} \exp (\mathbf{Q}x[\mathrm{t}] * \mathbf{K}F[\mathrm{i}])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B%5Cmathrm%7Bt%7D%2B1%5D%3D%5Cfrac%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BQ%7Dx%5B%5Cmathrm%7Bt%7D%5D+%2A+%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+%5Ccdot%28%5Cmathbf%7BV%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BQ%7Dx%5B%5Cmathrm%7Bt%7D%5D+%2A+%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D) + +It's very capable in theory, however that **does not mean we can fully utilize its capability with usual optimizers**. I suspect the loss landscape is too difficult for our current methods. 
+ +Compare with the **simplified formula** for RWKV (the parallel mode, looks similar to Apple's AFT): + +![F[\mathrm{t}+1]=\sigma(\mathbf{R}x[\mathrm{t}]) \cdot \frac{\sum_{\mathrm{i}=0}^{\mathrm{t}} \exp (\mathbf{W} \cdot(\mathrm{t}-\mathrm{i})) \cdot \exp (\mathbf{K}F[\mathrm{i}]) \cdot(\mathbf{V}F[\mathrm{i}])}{\sum_{\mathrm{i}=0}^{\mathrm{t}} \exp (\mathbf{W} \cdot(\mathrm{t}-\mathrm{i})) \cdot \exp (\mathbf{K }F[\mathrm{i}])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B%5Cmathrm%7Bt%7D%2B1%5D%3D%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29+%5Ccdot+%5Cfrac%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+%5Ccdot%28%5Cmathbf%7BV%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D%7B%5Csum_%7B%5Cmathrm%7Bi%7D%3D0%7D%5E%7B%5Cmathrm%7Bt%7D%7D+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B%5Cmathrm%7Bi%7D%5D%29%7D) + +The R, K, V are trainable matrices, and W is a trainable vector (time-decay factor for each channel). + +In GPT, the contribution of F[i] to F[t+1] is weighted by ![ \exp (\mathbf{Q}x[\mathrm{t}] * \mathbf{K}F[\mathrm{i}]) ](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle++%5Cexp+%28%5Cmathbf%7BQ%7Dx%5B%5Cmathrm%7Bt%7D%5D+%2A+%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+). + +In RWKV-2, the contribution of F[i] to F[t+1] is weighted by ![\sigma(\mathbf{R}x[\mathrm{t}]) \cdot \exp (\mathbf{W} \cdot(\mathrm{t}-\mathrm{i})) \cdot \exp (\mathbf{K}F[\mathrm{i}]) ](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bi%7D%5D%29+). 
+* The ![\sigma](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma) is a non-linearity and we can use sigmoid. +* Note ![\sigma(\mathbf{R}x[\mathrm{t}])](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Csigma%28%5Cmathbf%7BR%7Dx%5B%5Cmathrm%7Bt%7D%5D%29) is not in the denominator, and I call R the "receptance". +* The ![\exp (\mathbf{W} \cdot(\mathrm{t}-\mathrm{i}))](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+%5Cexp+%28%5Cmathbf%7BW%7D+%5Ccdot%28%5Cmathrm%7Bt%7D-%5Cmathrm%7Bi%7D%29%29) is the time-decay factor. I proposed the same idea (scaling the attention by distance) in Aug 2020 and called it the "time-weighting" (check the commit history of https://github.com/BlinkDL/minGPT-tuned). + +Here comes the punchline: we can rewrite it into a RNN (recursive formula). Note: + +![F[1]=\sigma(\mathbf{R }x[0]) \cdot \frac{ \exp (\mathbf{K }F[0]) \cdot(\mathbf{V }F[0])}{\exp (\mathbf{K }F[0])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B1%5D%3D%5Csigma%28%5Cmathbf%7BR+%7Dx%5B0%5D%29+%5Ccdot+%5Cfrac%7B+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29+%5Ccdot%28%5Cmathbf%7BV+%7DF%5B0%5D%29%7D%7B%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29%7D) + +![F[2]=\sigma(\mathbf{R }x[1]) \cdot \frac{ \exp (\mathbf{K }F[1]) \cdot(\mathbf{V }F[1])+\exp (\mathbf{W} ) \cdot \exp (\mathbf{K }F[0]) \cdot(\mathbf{V }F[0])}{ \exp (\mathbf{K }F[1])+\exp (\mathbf{W} ) \cdot \exp (\mathbf{K 
}F[0])}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5B2%5D%3D%5Csigma%28%5Cmathbf%7BR+%7Dx%5B1%5D%29+%5Ccdot+%5Cfrac%7B+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B1%5D%29+%5Ccdot%28%5Cmathbf%7BV+%7DF%5B1%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D+%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29+%5Ccdot%28%5Cmathbf%7BV+%7DF%5B0%5D%29%7D%7B+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B1%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D+%29+%5Ccdot+%5Cexp+%28%5Cmathbf%7BK+%7DF%5B0%5D%29%7D) + +Therefore it's straightforward to verify: + +![F[t+1]=\sigma(\mathbf{R }x[t]) \cdot \frac{\exp (\mathbf{K}F[\mathrm{t}]) \cdot(\mathbf{V}F[\mathrm{t}])+\exp (\mathbf{W}) \cdot A[\mathrm{t}]}{ \exp (\mathbf{K}F[\mathrm{t}])+\exp (\mathbf{W}) \cdot B[\mathrm{t}]}](https://render.githubusercontent.com/render/math?math=%5Ccolor%7Bblack%7D%5Cdisplaystyle+F%5Bt%2B1%5D%3D%5Csigma%28%5Cmathbf%7BR+%7Dx%5Bt%5D%29+%5Ccdot+%5Cfrac%7B%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bt%7D%5D%29+%5Ccdot%28%5Cmathbf%7BV%7DF%5B%5Cmathrm%7Bt%7D%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D%29+%5Ccdot+A%5B%5Cmathrm%7Bt%7D%5D%7D%7B+%5Cexp+%28%5Cmathbf%7BK%7DF%5B%5Cmathrm%7Bt%7D%5D%29%2B%5Cexp+%28%5Cmathbf%7BW%7D%29+%5Ccdot+B%5B%5Cmathrm%7Bt%7D%5D%7D) + +where A[t] and B[t] are the numerator and denominator of the previous step, respectively. + +I believe RWKV is performant because W is like repeatedly applying a diagonal matrix. Note (P^{-1} D P)^n = P^{-1} D^n P, so it is similar to repeatedly applying a general diagonalizable matrix. + +Moreover it's possible to turn it into a continuous ODE (a bit similar to State Space Models). I will write about it later. + +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=BlinkDL/RWKV-LM&type=Date)](https://star-history.com/#BlinkDL/RWKV-LM&Date) + +## Multimodal ideas + +I have an idea for [text --> 32x32 RGB image] using a LM (transformer, RWKV, etc.). Will test it soon. 
+ +Firstly, LM loss (instead of L2 loss), so the image will not be blurry. + +Secondly, color quantization. For example, only allowing 8 levels for R/G/B. Then the image vocab size is 8x8x8 = 512 (for each pixel), instead of 2^24. +Therefore, a 32x32 RGB image = a len1024 sequence of vocab512 (image tokens), which is a typical input for usual LMs. +(Later we can use diffusion models to upsample and generate RGB888 images. We might be able to use a LM for this too.) + +Thirdly, 2D positional embeddings that are easy for the model to understand. +For example, add one-hot X & Y coords to the first 64(=32+32) channels. Say if the pixel is at x=8, y=20, then we will add 1 to channel 8 and channel 52 (=32+20). +Moreover probably we can add the float X & Y coords (normalized to 0~1 range) to another 2 channels. And other periodic pos. encoding might help too (will test). + +Finally, RandRound when doing the color quantization in the DataLoader. +For example, if the float level is 4.578, then there is a 57.8% chance to use 5, and (1-57.8%) chance to use 4. +And we can allow both 4 and 5 in the prediction, but the loss will be higher if the prediction is 4. + +Multi-task training might help too. I will try this dataset format: +[TxtFirst] [Desc of Img (txt tokens)] [Img] [img tokens] +and sometimes +[ImgFirst] [img tokens] [Txt] [Desc of Img (txt tokens)] +... the order of the imgs should be randomized in the DataLoader, and [TxtFirst] [ImgFirst] [Img] [Txt] are special tokens +and do random sampling of the full dataset. So sometimes the model will see the img tokens first and then the corresponding txt tokens, which is a [img -> txt] task. And the model will see some partial imgs and partial txts. I think a char-level LM might help the model to write correct text on images. + +## How to sample a large dataset (for training) + +I am using a trick to sample the Pile deterministically yet randomly enough. + +Let's say the pile has x chunks (a chunk = ctx_len tokens). 
+ +pick a prime number p just less than x, and make sure p = 2 (mod 3). + +Use (step * step * step) mod p to sample it. Add some bias to step for extra randomness. + +## The top-p-x sampling method (for inference) + +We propose a new sampling method called top-p-x: + +it's like top-p, and the only difference is you also keep all tokens whose prob > x. + +Try x = 0.01 first. + +## Better Learning Rate Schedule via Variational Method of Loss Curve + +I propose a simple new method to find better LR schedules. The method is cost-efficient and practical for large LMs. The takeaway is we can model the loss curve dynamics (phenomenology) w.r.t. the LR, and a nice closed-form LR curve can be directly computed from it using the variational method. Moreover we can predict the final loss with reasonable accuracy. + +UPDATE: In "Conclusion 1.", use the best-fitting regime (ignore the initial steps where our approximations break down) to fit the parameters. + +Try this: fixed lr for 1 hr, then exponential decay to 0.2 * lr in 12 hrs, and choose the t=[1hr, 13hr] segment. + +In the last three plots, black = predicted loss curve of the new LR schedule, blue = original (unoptimized) real loss curve, orange = new LR schedule. + +![better_lr_schedule](Research/better_lr_schedule.png) + +# RWKV v1 + +We propose the RWKV language model, with alternating time-mix and channel-mix layers: + + + +* The R, K, V are generated by linear transforms of input, and W is a parameter. The idea of RWKV is to decompose attention into R(target) * W(src, target) * K(src). So we can call R "receptance", and sigmoid means it's in 0~1 range. + +* The Time-mix is similar to AFT (https://arxiv.org/abs/2105.14103). There are two differences. + +(1) We changed the normalization (denominator). For masked language models, we define: + + + +**(UPDATE: We are using the original AFT normalization in v2)** + +Initialize K and R matrices (and the output projection matrix) to ZERO for fast & stable convergence. 
+ +(2) We decompose W_{t,u,c} and introduce multi-head W (here h is the corresponding head of c): + + + +Moreover we multiply the final output of Time-mix layer by γ(t). The reason for the α β γ factors, is because the context size is smaller when t is small, and this can be compensated using the α β γ factors. + +**(UPDATE: We remove α β γ factors in v2-RNN and restrict W to be of a simple form and hence able to rewrite it as RNN)** + +* The Channel-mix is similar to GeGLU (https://arxiv.org/abs/2002.05202) with an extra R factor. Initialize R and W matrices to ZERO for fast & stable convergence. + +* Finally, we add extra token-shift (time-shift mixing) as in (https://github.com/BlinkDL/minGPT-tuned). + +# Token-shift (time-shift mixing) + +The token-shift explicitly uses (half the channels of this token) & (half the channels of prev token) to generate all vectors (QKV, RWKV, ...). + +``` +self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + +x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) +``` + +Dividing channels by 2 and shift-1 works great for char-level English and char-level Chinese LM. + +However for BPE-level English LM, it's only effective if your embedding is large enough (at least 1024 - so the usual small L12-D768 model is not enough). + +My theory on the effectiveness of token-shift: + +When we train a GPT, the hidden representation of a token has to accomplish two different objects: + +1. Predict the next token. Sometimes this is easy (obvious next token). + +2. Collect all previous context info, so later tokens can use it. This is always hard. + +The shifted channels can focus on (2), so we have good propagation of info. It's like some kind of residual connection, or a small RNN inside the transformer. + +You can use token-shift in usual QKV self-attention too. I looked at the weights, and found V really likes the shifted channels, less so for Q. Makes sense if you think about it. 
I also found you may want to use less mixing in higher layers. + +p.s. There is a MHA_pro model in this repo with strong performance. Give it a try :) + +# The Head-QK Trick: learning to copy and avoid tokens + +In usual transformer, a small model has difficulty copying tokens (such as person names) in the context. We add extra Q & K to the final output such that the model can directly copy (or avoid) tokens in the context. Afterwards the model will teach itself NER (named entity recognition) if you look at the learned weights. +``` +q = self.head_q(x)[:,:T,:] # projecting to 256-d +k = self.head_k(x)[:,:T,:] # projecting to 256-d +c = (q @ k.transpose(-2, -1)) * (1.0 / 256) +c = c.masked_fill(self.copy_mask[:T,:T] == 0, 0) +c = c @ F.one_hot(idx, num_classes = self.config.vocab_size).float() +x = self.head(x) + c +``` +Note: when a token occurs multiple times in the context, it might be better to use max(prob) instead of sum(prob). + +# The top-a sampling method + +We also propose a new sampling method called top-a (as in src/utils.py): + +(1) Find the max probability p_max after softmax. + +(2) Remove all entries whose probability is lower than 0.2 * pow(p_max, 2). So it's adaptive, hence "top-a". + +(3) Feel free to tune the 0.2 and 2 factor. Tune 0.2 first. + +The idea of top-a: +1. If max_prob=0.9, then remove all tokens with prob < 0.162 (so, removing all alternatives) +2. If max_prob=0.5, then remove all tokens with prob < 0.05 (so, allowing more choices) +3. If max_prob=0.1, then remove all tokens with prob < 0.002 (so, allowing lots of possibilities) + +``` +probs = F.softmax(logits, dim=-1) + +limit = torch.pow(torch.max(probs), 2) * 0.02 +logits[probs < limit] = -float('Inf') +``` + +# Performance + +Character-level loss on simplebooks-92 dataset https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip + +![RWKV-vs-MHA](RWKV-vs-MHA.png) + +Gray: usual MHA+Rotary+GeGLU - performance not as good. 17.2M params. 
+ +Red: RWKV ("linear" attention) - VRAM friendly - quite faster when ctx window is long - good performance. 16.6M params. + +Green: MHA+Rotary+GeGLU+Token_shift. 17.2M params. + +Blue: MHA_pro (MHA with various tweaks & RWKV-type-FFN) - slow - needs more VRAM - good performance. 16.6M params. + +``` +@software{peng_bo_2021_5196578, + author = {PENG Bo}, + title = {BlinkDL/RWKV-LM: 0.01}, + month = aug, + year = 2021, + publisher = {Zenodo}, + version = {0.01}, + doi = {10.5281/zenodo.5196577}, + url = {https://doi.org/10.5281/zenodo.5196577} +} +``` + +# Initialization + +We use careful initialization for RWKV to get fast convergence - orthogonal matrices with proper scaling, and special time_w curves. Check model.py for details. + +Some learned time_w examples: + +![RWKV-time-w](RWKV-time-w.png) diff --git a/docs/src/RWKV-v1/src/model.py b/docs/src/RWKV-v1/src/model.py new file mode 100644 index 00000000..1eeb0868 --- /dev/null +++ b/docs/src/RWKV-v1/src/model.py @@ -0,0 +1,517 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import math +import logging +import torch +import torch.nn as nn +from torch.nn import functional as F +logger = logging.getLogger(__name__) + +######################################################################################################## +# RWKV: RWKV Time-mix + RWKV Channel-mix +######################################################################################################## + +def RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module + for m in module.modules(): + if not isinstance(m, (nn.Linear, nn.Embedding)): + continue + with torch.no_grad(): + name = '[unknown weight]' + for name, parameter in module.named_parameters(): # find the name of the weight + if 
id(m.weight) == id(parameter): + break + + shape = m.weight.data.shape + gain = 1.0 # positive: gain for orthogonal, negative: std for normal + scale = 1.0 # extra scale for gain + + if isinstance(m, nn.Linear): + if m.bias is not None: + m.bias.data.zero_() + if shape[0] > shape[1]: + gain = math.sqrt(shape[0] / shape[1]) + if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection? + scale = config.rwkv_emb_scale + + if isinstance(m, nn.Embedding): + gain = math.sqrt(max(shape[0], shape[1])) + if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb? + scale = config.rwkv_emb_scale + + if hasattr(m, 'scale_init'): + scale = m.scale_init + + print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name) + + gain *= scale + if gain == 0: + nn.init.zeros_(m.weight) # zero init is great for some RWKV matrices + elif gain > 0: + nn.init.orthogonal_(m.weight, gain=gain) + else: + nn.init.normal_(m.weight, mean=0, std=-gain) + +class RWKV_TimeMix(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + assert config.n_attn % config.n_head == 0 + self.layer_id = layer_id + self.ctx_len = config.ctx_len + self.n_head = config.n_head + self.head_size = config.n_attn // config.n_head + + with torch.no_grad(): # initial time_w curves for better convergence + ww = torch.ones(config.n_head, config.ctx_len) + curve = torch.tensor([-(config.ctx_len - 1 - i) for i in range(config.ctx_len)]) # the distance + for h in range(config.n_head): + if h < config.n_head - 1: + decay_speed = math.pow(config.ctx_len, -(h+1)/(config.n_head-1)) + else: + decay_speed = 0.0 + ww[h] = torch.exp(curve * decay_speed) + # print('layer', layer_id, 'head', h, 'decay_speed', round(decay_speed, 4), ww[h][:5].numpy(), '...', ww[h][-5:].numpy()) + self.time_w = nn.Parameter(ww) + + self.time_alpha = nn.Parameter(torch.ones(self.n_head, 1, config.ctx_len)) + self.time_beta = nn.Parameter(torch.ones(self.n_head, 
config.ctx_len, 1)) + self.time_gamma = nn.Parameter(torch.ones(config.ctx_len, 1)) + + self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + + self.key = nn.Linear(config.n_embd, config.n_attn) + self.value = nn.Linear(config.n_embd, config.n_attn) + self.receptance = nn.Linear(config.n_embd, config.n_attn) + + # if config.rwkv_tiny_attn > 0: + # self.tiny_att = RWKV_TinyAttn(config) + + self.output = nn.Linear(config.n_attn, config.n_embd) + + self.key.scale_init = 0 + self.receptance.scale_init = 0 + self.output.scale_init = 0 + + def forward(self, x): + B, T, C = x.size() + TT = self.ctx_len + w = F.pad(self.time_w, (0, TT)) + w = torch.tile(w, [TT]) + w = w[:, :-TT].reshape(-1, TT, 2 * TT - 1) + w = w[:, :, TT-1:] # w is now a circulant matrix + w = w[:, :T, :T] * self.time_alpha[:, :, :T] * self.time_beta[:, :T, :] + + x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) + # if hasattr(self, 'tiny_att'): + # tiny_att = self.tiny_att(x, self.mask) + + k = self.key(x) + v = self.value(x) + r = self.receptance(x) + + k = torch.clamp(k, max=30, min=-60) # clamp extreme values. 
e^30 = 10^13 + k = torch.exp(k) + sum_k = torch.cumsum(k, dim=1) + + kv = (k * v).view(B, T, self.n_head, self.head_size) + + wkv = (torch.einsum('htu,buhc->bthc', w, kv)).contiguous().view(B, T, -1) + + rwkv = torch.sigmoid(r) * wkv / sum_k + + rwkv = self.output(rwkv) + # if hasattr(self, 'tiny_att'): + # rwkv += tiny_att + + return rwkv * self.time_gamma[:T, :] + +class RWKV_ChannelMix(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.layer_id = layer_id + self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + + hidden_sz = 5 * config.n_ffn // 2 # can use smaller hidden_sz because of receptance gating + self.key = nn.Linear(config.n_embd, hidden_sz) + self.value = nn.Linear(config.n_embd, hidden_sz) + self.weight = nn.Linear(hidden_sz, config.n_embd) + self.receptance = nn.Linear(config.n_embd, config.n_embd) + + self.receptance.scale_init = 0 + self.weight.scale_init = 0 + + def forward(self, x): + B, T, C = x.size() + + x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) + k = self.key(x) + v = self.value(x) + r = self.receptance(x) + + wkv = self.weight(F.mish(k) * v) # i find mish is a bit better than gelu + + rwkv = torch.sigmoid(r) * wkv + + return rwkv + +class RWKV_TinyAttn(nn.Module): # extra tiny attention + def __init__(self, config): + super().__init__() + self.d_attn = config.rwkv_tiny_attn + self.n_head = config.rwkv_tiny_head + self.head_size = self.d_attn // self.n_head + + self.qkv = nn.Linear(config.n_embd, self.d_attn * 3) + self.out = nn.Linear(self.d_attn, config.n_embd) + + def forward(self, x, mask): + B, T, C = x.size() + qkv = self.qkv(x) + q, k, v = qkv.chunk(3, dim = -1) + + if self.n_head > 1: + q = q.view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + k = k.view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + v = v.view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + + qk = (q @ 
k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_size)) # (B, nh, T, hs) * (B, nh, hs, T) -> (B, nh, T, T) + qk = qk.masked_fill(mask == 0, float('-inf')) + qk = F.softmax(qk, dim = -1) + qkv = qk @ v # (B, nh, T, T) * (B, nh, T, hs) -> (B, nh, T, hs) + + if self.n_head > 1: + qkv = qkv.transpose(1, 2).contiguous().view(B, T, -1) # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C) + + return self.out(qkv) + +######################################################################################################## +# MHA_rotary: Multi-head Attention + Rotary Encoding + GeGLU FFN +######################################################################################################## + +class RotaryEmbedding(torch.nn.Module): + def __init__(self, dim, base=10000): + super().__init__() + inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + self.seq_len_cached = None + self.cos_cached = None + self.sin_cached = None + + def forward(self, x, seq_len=None): + if seq_len != self.seq_len_cached: + self.seq_len_cached = seq_len + t = torch.arange(seq_len, device=x.device) + freqs = torch.einsum('i,j->ij', t, self.inv_freq) + emb = torch.cat((freqs, freqs), dim=-1).to(x.device) + self.cos_cached = emb.cos() + self.sin_cached = emb.sin() + return self.cos_cached, self.sin_cached + +def rotate_half(x): + x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), -1) + +@torch.jit.script +def apply_rotary_pos_emb(q, k, cos, sin): + cos, sin = cos[...,:q.shape[-2],:], sin[...,:q.shape[-2],:] + return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin) + +class MHA_rotary(nn.Module): + def __init__(self, config, layer_id, time_shift = False): + super().__init__() + self.layer_id = layer_id + assert config.n_attn % config.n_head == 0 + self.n_head = config.n_head + self.ctx_len = config.ctx_len + self.head_size = config.n_attn // config.n_head + + if time_shift: + 
self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + + self.query = nn.Linear(config.n_embd, config.n_attn) + self.key = nn.Linear(config.n_embd, config.n_attn) + self.value = nn.Linear(config.n_embd, config.n_attn) + + self.register_buffer("mask", torch.tril(torch.ones(config.ctx_len, config.ctx_len))) + + self.rotary_ndims = int(self.head_size * 0.5) + self.rotary_emb = RotaryEmbedding(self.rotary_ndims) + + self.output = nn.Linear(config.n_attn, config.n_embd) + + def forward(self, x): + B, T, C = x.size() + + if hasattr(self, 'time_shift'): + x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) + + q = self.query(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + k = self.key(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + v = self.value(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + + q, query_pass = q[..., :self.rotary_ndims], q[..., self.rotary_ndims:] + k, key_pass = k[..., :self.rotary_ndims], k[..., self.rotary_ndims:] + cos, sin = self.rotary_emb(q, seq_len=T) + q, k = apply_rotary_pos_emb(q, k, cos, sin) # rotary encoding + q = torch.cat((q, query_pass), dim=-1) + k = torch.cat((k, key_pass), dim=-1) + + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) # self-attention: (B, nh, T, hs) * (B, nh, hs, T) -> (B, nh, T, T) + att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf')) # causal mask + att = F.softmax(att, dim = -1) # softmax + + x = att @ v # (B, nh, T, T) * (B, nh, T, hs) -> (B, nh, T, hs) + x = x.transpose(1, 2).contiguous().view(B, T, -1) # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C) + + x = self.output(x) + return x + +class GeGLU(torch.nn.Module): + def __init__(self, config, layer_id, time_shift = False): + super().__init__() + self.layer_id = layer_id + + if time_shift: + self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + + hidden_sz = 3 * config.n_ffn + self.key = 
nn.Linear(config.n_embd, hidden_sz) + self.value = nn.Linear(config.n_embd, hidden_sz) + self.weight = nn.Linear(hidden_sz, config.n_embd) + + def forward(self, x): + B, T, C = x.size() + if hasattr(self, 'time_shift'): + x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) + + k = self.key(x) + v = self.value(x) + y = self.weight(F.gelu(k) * v) + return y + +######################################################################################################## +# MHA_pro: with more tricks +######################################################################################################## + +class MHA_pro(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.layer_id = layer_id + assert config.n_attn % config.n_head == 0 + self.n_head = config.n_head + self.ctx_len = config.ctx_len + self.head_size = config.n_attn // config.n_head + + self.time_w = nn.Parameter(torch.ones(self.n_head, config.ctx_len)) + self.time_alpha = nn.Parameter(torch.ones(self.n_head, 1, config.ctx_len)) + self.time_beta = nn.Parameter(torch.ones(self.n_head, config.ctx_len, 1)) + self.time_gamma = nn.Parameter(torch.ones(config.ctx_len, 1)) + self.register_buffer("mask", torch.tril(torch.ones(config.ctx_len, config.ctx_len))) + + self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + self.query = nn.Linear(config.n_embd, config.n_attn) + self.key = nn.Linear(config.n_embd, config.n_attn) + self.value = nn.Linear(config.n_embd, config.n_attn) + + self.rotary_ndims = int(self.head_size * 0.5) + self.rotary_emb = RotaryEmbedding(self.rotary_ndims) + + self.head_mix = nn.Conv2d(self.n_head, self.n_head, kernel_size=1, bias=False) # talking heads + + self.output = nn.Linear(config.n_attn, config.n_embd) + + def forward(self, x): + B, T, C = x.size() + TT = self.ctx_len + w = F.pad(self.time_w, (0, TT)) + w = torch.tile(w, [TT]) + w = w[:, :-TT].reshape(-1, TT, 2 * TT - 1) + w = w[:, :, TT-1:] # w is now a circulant matrix + w = w[:, :T, :T] * 
self.time_alpha[:, :, :T] * self.time_beta[:, :T, :] + + x = torch.cat([self.time_shift(x[:, :, :C//2]), x[:, :, C//2:]], dim = -1) # time-shift mixing + q = self.query(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + k = self.key(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + v = self.value(x).view(B, T, self.n_head, self.head_size).transpose(1, 2) # (B, T, C) -> (B, nh, T, hs) + + q, query_pass = q[..., :self.rotary_ndims], q[..., self.rotary_ndims:] + k, key_pass = k[..., :self.rotary_ndims], k[..., self.rotary_ndims:] + cos, sin = self.rotary_emb(q, seq_len=T) + q, k = apply_rotary_pos_emb(q, k, cos, sin) # rotary encoding + q = torch.cat((q, query_pass), dim=-1) + k = torch.cat((k, key_pass), dim=-1) + + att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1))) # self-attention: (B, nh, T, hs) * (B, nh, hs, T) -> (B, nh, T, T) + att = att.masked_fill(self.mask[:T,:T] == 0, float('-inf')) # causal mask + att = F.softmax(att, dim = -1) # softmax + att = att * w # time-weighting + att = self.head_mix(att) # talking heads + + x = att @ v # (B, nh, T, T) * (B, nh, T, hs) -> (B, nh, T, hs) + x = x.transpose(1, 2).contiguous().view(B, T, -1) # (B, nh, T, hs) -> (B, T, nh, hs) -> (B, T, C) + + x = self.output(x) * self.time_gamma[:T, :] + return x + +######################################################################################################## +# The GPT Model with our blocks +######################################################################################################## + +class RMSNorm(nn.Module): + def __init__(self, d): + super().__init__() + self.dd = d ** (-1. / 2) + self.weight = nn.Parameter(torch.ones(d)) + + def forward(self, x): + norm_x = x.norm(2, dim=-1, keepdim=True) + x_normed = x / (norm_x * self.dd + 1e-12) + return self.weight * x_normed + +class FixedNorm(nn.Module): + def __init__(self, d): + super().__init__() + self.dd = d ** (-1. 
/ 2) + + def forward(self, x): + norm_x = x.norm(2, dim=-1, keepdim=True) + x_normed = x / (norm_x * self.dd + 1e-12) + return x_normed + +######################################################################################################## + +class GPTConfig: + def __init__(self, vocab_size, ctx_len, **kwargs): + self.vocab_size = vocab_size + self.ctx_len = ctx_len + for k,v in kwargs.items(): + setattr(self, k, v) + +class Block(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.config = config + + self.ln1 = nn.LayerNorm(config.n_embd) + self.ln2 = nn.LayerNorm(config.n_embd) + + if config.model_type == 'RWKV': + # self.ln1 = FixedNorm(config.n_embd) + # self.ln2 = FixedNorm(config.n_embd) + self.attn = RWKV_TimeMix(config, layer_id) + self.mlp = RWKV_ChannelMix(config, layer_id) + + elif config.model_type == 'MHA_rotary': + self.attn = MHA_rotary(config, layer_id) + self.mlp = GeGLU(config, layer_id) + + elif config.model_type == 'MHA_shift': + self.attn = MHA_rotary(config, layer_id, time_shift=True) + self.mlp = GeGLU(config, layer_id, time_shift=True) + + elif config.model_type == 'MHA_pro': + self.attn = MHA_pro(config, layer_id) + self.mlp = RWKV_ChannelMix(config, layer_id) + + def forward(self, x): + + x = x + self.attn(self.ln1(x)) + x = x + self.mlp(self.ln2(x)) + + return x + +class GPT(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + + self.tok_emb = nn.Embedding(config.vocab_size, config.n_embd) + + self.blocks = nn.Sequential(*[Block(config, i) for i in range(config.n_layer)]) + + self.ln_f = nn.LayerNorm(config.n_embd) + self.time_out = nn.Parameter(torch.ones(1,config.ctx_len,1)) # reduce confidence of early tokens + self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.head_q = nn.Linear(config.n_embd, 256) + self.head_q.scale_init = 0.01 + self.head_k = nn.Linear(config.n_embd, 256) + self.head_k.scale_init = 0.01 + self.register_buffer("copy_mask", 
torch.tril(torch.ones(config.ctx_len, config.ctx_len))) + + self.ctx_len = config.ctx_len + + if self.config.model_type == 'RWKV': + RWKV_Init(self, config) + else: + self.apply(self._init_weights) + + logger.info("number of parameters: %e", sum(p.numel() for p in self.parameters())) + + def get_ctx_len(self): + return self.ctx_len + + def _init_weights(self, module): + if isinstance(module, (nn.Linear, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.01) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def configure_optimizers(self, train_config): + # separate out all parameters to those that will and won't experience regularizing weight decay + decay = set() + no_decay = set() + + whitelist_weight_modules = (nn.Linear, ) + blacklist_weight_modules = (RMSNorm, nn.LayerNorm, nn.Embedding) + for mn, m in self.named_modules(): + for pn, p in m.named_parameters(): + fpn = '%s.%s' % (mn, pn) if mn else pn # full param name + + if pn.endswith('bias') or ('time' in fpn) or ('head' in fpn): + no_decay.add(fpn) + elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules): + decay.add(fpn) + elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules): + no_decay.add(fpn) + + # validate that we considered every parameter + param_dict = {pn: p for pn, p in self.named_parameters()} + inter_params = decay & no_decay + union_params = decay | no_decay + assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) + assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" 
\ + % (str(param_dict.keys() - union_params), ) + + optim_groups = [ + {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay}, + {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0}, + ] + optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps) + return optimizer + + def forward(self, idx, targets=None): + B, T = idx.size() + assert T <= self.ctx_len, "Cannot forward, because len(input) > model ctx_len." + + x = self.tok_emb(idx) + + x = self.blocks(x) + + x = self.ln_f(x) + + q = self.head_q(x)[:,:T,:] + k = self.head_k(x)[:,:T,:] + c = (q @ k.transpose(-2, -1)) * (1.0 / 256) + c = c.masked_fill(self.copy_mask[:T,:T] == 0, 0) + c = c @ F.one_hot(idx, num_classes = self.config.vocab_size).float() + + x = x * self.time_out[:, :T, :] # reduce confidence of early tokens + x = self.head(x) + c + + loss = None + if targets is not None: + loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1)) + + return x, loss diff --git a/docs/src/RWKV-v1/src/trainer.py b/docs/src/RWKV-v1/src/trainer.py new file mode 100644 index 00000000..5f88fcc7 --- /dev/null +++ b/docs/src/RWKV-v1/src/trainer.py @@ -0,0 +1,130 @@ +import math, sys, datetime +import logging +import numpy as np +from tqdm.auto import tqdm +import torch +import torch.optim as optim +from torch.optim.lr_scheduler import LambdaLR +from torch.utils.data.dataloader import DataLoader +logger = logging.getLogger(__name__) + +# print('logging to wandb... 
(comment it if you don\'t have wandb)') +# import wandb # comment this if you don't have wandb + +class TrainerConfig: + max_epochs = 10 + batch_size = 64 + learning_rate = 4e-4 + betas = (0.9, 0.99) + eps = 1e-8 + grad_norm_clip = 1.0 + weight_decay = 0.01 + lr_decay = False # linear warmup followed by cosine decay + warmup_tokens = 375e6 # these two numbers come from the GPT-3 paper + final_tokens = 260e9 # at which point do we reach lr_final + epoch_save_frequency = 0 + epoch_save_path = 'trained-' + num_workers = 0 # for DataLoader + + def __init__(self, **kwargs): + for k,v in kwargs.items(): + setattr(self, k, v) + +class Trainer: + + def __init__(self, model, train_dataset, test_dataset, config): + self.model = model + self.train_dataset = train_dataset + self.test_dataset = test_dataset + self.config = config + self.avg_loss = -1 + self.steps = 0 + + if 'wandb' in sys.modules: + cfg = model.config + for k in config.__dict__: + setattr(cfg, k, config.__dict__[k]) # combine cfg + wandb.init(project="RWKV-LM", name=self.get_run_name() + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False) + + self.device = 'cpu' + if torch.cuda.is_available(): # take over whatever gpus are on the system + self.device = torch.cuda.current_device() + self.model = torch.nn.DataParallel(self.model).to(self.device) + + def get_run_name(self): + raw_model = self.model.module if hasattr(self.model, "module") else self.model + cfg = raw_model.config + run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd) + return run_name + + def train(self): + model, config = self.model, self.config + raw_model = model.module if hasattr(self.model, "module") else model + optimizer = raw_model.configure_optimizers(config) + + def run_epoch(split): + is_train = split == 'train' + model.train(is_train) + data = self.train_dataset if is_train else self.test_dataset + loader = DataLoader(data, 
shuffle=True, pin_memory=True, + batch_size=config.batch_size, + num_workers=config.num_workers) + + pbar = tqdm(enumerate(loader), total=len(loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader) + + for it, (x, y) in pbar: + x = x.to(self.device) # place data on the correct device + y = y.to(self.device) + + with torch.set_grad_enabled(is_train): + _, loss = model(x, y) # forward the model + loss = loss.mean() # collapse all losses if they are scattered on multiple gpus + + if is_train: # backprop and update the parameters + model.zero_grad() + loss.backward() + + torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip) + optimizer.step() + + if config.lr_decay: # decay the learning rate based on our progress + self.tokens += (y >= 0).sum() # number of tokens processed this step (i.e. label is not -100) + lr_final_factor = config.lr_final / config.learning_rate + if self.tokens < config.warmup_tokens: + # linear warmup + lr_mult = lr_final_factor + (1 - lr_final_factor) * float(self.tokens) / float(config.warmup_tokens) + progress = 0 + else: + # cosine learning rate decay + progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) + # progress = min(progress * 1.1, 1.0) # more fine-tuning with low LR + lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor / 2) * math.cos(math.pi * progress) # better 1.0 ~ 0.1 + lr = config.learning_rate * lr_mult + for param_group in optimizer.param_groups: + param_group['lr'] = lr + else: + lr = config.learning_rate + + now_loss = loss.item() # report progress + + if 'wandb' in sys.modules: + wandb.log({"loss": now_loss}, step = self.steps * self.config.batch_size) + self.steps += 1 + + if self.avg_loss < 0: + self.avg_loss = now_loss + else: + # factor = max(1.0 / 300, 1.0 / math.sqrt(it + 1)) + factor = 1 / (it + 1) + self.avg_loss = self.avg_loss * (1.0 - factor) + now_loss * factor + 
pbar.set_description(f"epoch {epoch+1} progress {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}") + + while True: + self.tokens = 0 # counter used for learning rate decay + for epoch in range(config.max_epochs): + + run_epoch('train') + + if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1): + raw_model = self.model.module if hasattr(self.model, "module") else self.model # DataParallel wrappers keep raw model object in .module + torch.save(raw_model, self.config.epoch_save_path + str(epoch+1) + '.pth') diff --git a/docs/src/RWKV-v1/src/utils.py b/docs/src/RWKV-v1/src/utils.py new file mode 100644 index 00000000..5f9bb650 --- /dev/null +++ b/docs/src/RWKV-v1/src/utils.py @@ -0,0 +1,50 @@ +import random +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F + +def top_k_logits(logits, k): + v, ix = torch.topk(logits, k) + out = logits.clone() + out[out < v[:, [-1]]] = -float('Inf') + return out + +def top_p_probs(probs, p): + out = probs.clone() + + sorted_probs, sorted_indices = torch.sort(out, descending=True) + cumulative_probs = torch.cumsum(sorted_probs, dim=-1) + sorted_indices_to_remove = cumulative_probs > p + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + indices_to_remove = sorted_indices[sorted_indices_to_remove] + out[indices_to_remove] = 0 + + return out + +# top-p + top-k + pow&ratio sampling +def sample_logits(logits, pos, temperature=1.0, top_k=None, top_p=None, min_p_pow=None, min_p_ratio=None): + logits = logits[:, pos, :] / temperature + probs = F.softmax(logits, dim=-1) + + if min_p_ratio is not None: + limit = torch.pow(torch.max(probs), min_p_pow) * min_p_ratio + logits[probs < limit] = -float('Inf') + + if top_k is not None: + logits = top_k_logits(logits, top_k) + + probs = F.softmax(logits, dim=-1) + + if 
top_p is not None: + probs[0] = top_p_probs(probs[0], top_p) + + ix = torch.multinomial(probs, num_samples=1) + return ix[0][0].cpu() + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) diff --git a/docs/src/RWKV-v1/train.py b/docs/src/RWKV-v1/train.py new file mode 100644 index 00000000..ab370e1b --- /dev/null +++ b/docs/src/RWKV-v1/train.py @@ -0,0 +1,142 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import os, sys, time, math, random, json, datetime, logging +import numpy as np +import torch +from torch.utils.data import Dataset +from src.trainer import Trainer, TrainerConfig +from src.model import GPT, GPTConfig +from src.utils import set_seed + +set_seed(42) +np.set_printoptions(precision=4, suppress=True, linewidth=200) +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO,) + +# RWKV : our new model - fastest when ctx_len is long - VRAM friendly - good performance +# MHA_rotary : usual MultiheadAttention+Rotary+GeGLU - not as good +# MHA_shift : with time-shift - good performance +# MHA_pro : slow (lots of tricks) - VRAM hungry - very good performance +model_type = 'RWKV' + +# datafile = u"V:\\NLP\\text8" +# datafile = u"V:\\NLP\\enwik8" +datafile = u"V:\\NLP\\simplebooks\\simplebooks-92-raw\\train.txt" +datafile_encoding = 'utf-8' +# datafile = u"D:\\NLP-Data\\ww100M.txt" +# datafile = u"D:\\NLP-Data\\__2019.txt" +# datafile = u"Y:\\BlinkNLP\\_txt_\\txt\\_all.txt" +# datafile = u"V:\\NLP\\enwik8-shift-300.bpe" +# datafile_encoding = 'utf-16' +# datafile = u"V:\\NLP\\simplebooks-shift-utf32.word" +# datafile_encoding = 'utf-32' + +datafile_type = 0 # use 0 for char-level 
english. use 1 for chinese. only affects some RWKV hyperparametrs + +#################################### VERY IMPORTANT #################################### +epoch_save_frequency = 10 # 0 = never, 1 = every 'epoch', 2 = every two 'epoch', etc. +epoch_save_path = 'trained-' + +batch_size = 32 # if you see "CUDA out of memory", reduce this. + # if you have good GPU, increase this. + # use GPU-Z to find the highest value for your VRAM. + +n_epoch = 100 # the 'epoch' here is actually very short (and of fixed length) +######################################################################################## + +model_level = 'character' # 'character' (recommended) or 'word' + +ctx_len = 256 # context length, try 512 or 1024 if you have good GPU +n_layer = 6 # try 12 for 100M, 24 for 300M +n_head = 8 # try 12 for 100M, 16 for 300M + +n_embd = n_head * 64 +n_attn = n_embd +n_ffn = n_embd + +lr_init = 6e-4 if model_type == 'RWKV' else 4e-4 # RWKV can use higher lr. 8e-4 = 0.0008 4e-4 = 0.0004 +lr_final = 4e-5 + +betas = (0.9, 0.99) if model_type == 'RWKV' else (0.9, 0.99) +eps = 4e-9 +weight_decay = 0 if model_type == 'RWKV' else 0.01 # wd is not useful when we have enough data + +epoch_length_fixed = 10000 # make an 'epoch' very short, so we can see the training progress + +######## special hyperparameters for RWKV model ######## +rwkv_emb_scale = 0.4 # scale of initial embedding. 0.4 is a good choice +rwkv_tiny_attn = 0#64 if (datafile_type == 0 and ctx_len > 600) else 0 # extra tiny attention dim, useful for long ctx char-level english +rwkv_tiny_head = 1 # 1 is good enough. 8 is slow +# n_side_proj = 512 # extra 'side projection', quite useful for BPE models + +######################################################################################################## +# Load data +######################################################################################################## + +print('loading data... 
' + datafile) + +class Dataset(Dataset): + def __init__(self, data, model_level, ctx_len): + print('building token list...', end=' ') + if model_level == 'word': + import re + data = re.sub(r'(\n|\.|\,|\?|\!|\:|\;|\-|\—|\||\'|\"|\`|\(|\)|[0-9]|\[|\]|\{|\}|\=|\+|\*|\\|\/|\~|\&|\$|\#|\%)', r' \g<0> ', data) + data = re.sub(' +',' ',data) + print('splitting token...') + data = data.lower().split(' ') + unique = sorted(list(set(data))) + # print() + # for u in unique: + # print(u, end=' ') + # print('\n\n') + + xx = 0 + xxObj = {} + for u in unique: + xxObj[xx] = u + xx += 1 + with open('vocab.json', "w", encoding="utf-16") as vocab_file: + vocab_file.write(json.dumps(xxObj, ensure_ascii=False)) + + data_size, vocab_size = len(data), len(unique) + print('data has %d %ss, %d unique.' % (data_size, model_level, vocab_size)) + self.stoi = { ch:i for i,ch in enumerate(unique) } + self.itos = { i:ch for i,ch in enumerate(unique) } + self.ctx_len = ctx_len + self.vocab_size = vocab_size + self.data = data + + def __len__(self): + return epoch_length_fixed + + def __getitem__(self, idx): + i = np.random.randint(0, len(self.data) - (self.ctx_len + 1)) # cheat: pick a random spot in dataset + chunk = self.data[i:i+self.ctx_len+1] + dix = [self.stoi[s] for s in chunk] + x = torch.tensor(dix[:-1], dtype=torch.long) + y = torch.tensor(dix[1:], dtype=torch.long) + return x, y + +train_dataset = Dataset(open(datafile, "r", encoding=datafile_encoding).read(), model_level, ctx_len) + +######################################################################################################## +# Train model +######################################################################################################## + +model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type, + rwkv_emb_scale=rwkv_emb_scale, rwkv_tiny_attn=rwkv_tiny_attn, rwkv_tiny_head=rwkv_tiny_head, + n_layer=n_layer, n_head=n_head, n_embd=n_embd, n_attn=n_attn, n_ffn=n_ffn)) + +# load a 
trained model +# model.load_state_dict(torch.load('trained-xxx.pth').state_dict()) + +print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas', betas, 'eps', eps, 'wd', weight_decay, 'ctx', ctx_len, 'layer', n_layer, 'head', n_head, 'embd', n_embd, 'attn', n_attn, 'ffn', n_ffn) +tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size, weight_decay=weight_decay, + learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, + warmup_tokens=0, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=0, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path) +trainer = Trainer(model, train_dataset, None, tconf) + +trainer.train() + +torch.save(model, 'trained-' + trainer.get_run_name() + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth') diff --git a/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu b/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu new file mode 100644 index 00000000..647eddcd --- /dev/null +++ b/docs/src/RWKV-v2-RNN/cuda/timex_cuda.cu @@ -0,0 +1,172 @@ +#include+ + +https://james4ever0.github.io/RWKV-LM?q=/README.md +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v1/src/model.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v1/src/trainer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v1/src/utils.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v1/train.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/cuda/timex_cuda.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/cuda/timex_op.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/src/model.py +2023-12-28T09:21:02+00:00 
+1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/src/model_run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/src/trainer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/src/utils.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v2-RNN/train.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/cuda/timex_cuda.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/cuda/timex_op.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/src/model.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/src/model_run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/src/trainer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/src/utils.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/train.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v3/verify.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/cuda/wkv_cuda.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/cuda/wkv_op.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/src/binidx.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/src/model.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/src/model_run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + 
+https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/src/trainer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/src/utils.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/train.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4/verify.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/chat.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/cuda/wkv5_cuda.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/cuda/wkv5_op.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/cuda/wkv_cuda.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/cuda/wkv_cuda_bf16.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/cuda/wkv_op.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/cuda/wkv_op_bf16.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/img_demoAE.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/math_demo/run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/binidx.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/dataset.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/model.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/model_img.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/model_run.py +2023-12-28T09:21:02+00:00 +1.00 ++ + 
+https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/trainer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/src/utils.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/train.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v4neo/verify.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/cuda/wkv5_cuda.cu +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/cuda/wkv5_op.cpp +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/demo.jsonl +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/make_data.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/src/binidx.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/src/dataset.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/src/model.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/src/trainer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/src/utils.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/tokenizer/rwkv_tokenizer.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM?q=/RWKV-v5/train.py +2023-12-28T09:21:02+00:00 +1.00 ++ + +https://james4ever0.github.io/RWKV-LM/tree.html?full=true +2023-12-28T09:21:02+00:00 +1.00 ++ +// require T <= Tmax, T % 4 == 0, B % BF == 0, B % BB === 0 (Tmax and BF and BB are passed by compiler) + +#define F4(A, B) ((float4 *)(A))[(B) >> 2] + +template +__global__ void kernel_forward(const F *__restrict__ const __w, const F *__restrict__ const __k, F *__restrict__ const x, + const F eps, const int B, const int C, const int T) { + const int i = 
blockIdx.y; + const int ij = (B * C) / BF; + const int t = threadIdx.x << 2; + + __shared__ F ww[Tmax]; + __shared__ F kk[Tmax * BF]; + F4(ww, t) = F4(__w, t + T * (i % C)); + + #pragma unroll + for (int j = 0; j < BF; j++) { + F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j)); + } + __syncthreads(); + + float4 s[BF]; + #pragma unroll + for (int j = 0; j < BF; j++) { + s[j] = {eps, eps, eps, eps}; + } + const F *__restrict__ const w = ww + T - t - 4; + for (int u = 0; u <= t; u++) { + #pragma unroll + for (int j = 0; j < BF; j++) { + const F x = kk[u + Tmax * j]; + s[j].x += w[u + 3] * x; + s[j].y += w[u + 2] * x; + s[j].z += w[u + 1] * x; + s[j].w += w[u + 0] * x; + } + } + #pragma unroll + for (int j = 0; j < BF; j++) { + const F *__restrict__ const k = kk + Tmax * j; + s[j].y += w[t + 3] * k[t + 1]; + s[j].z += w[t + 2] * k[t + 1]; + s[j].z += w[t + 3] * k[t + 2]; + s[j].w += w[t + 1] * k[t + 1]; + s[j].w += w[t + 2] * k[t + 2]; + s[j].w += w[t + 3] * k[t + 3]; + F4(x, t + T * (i + ij * j)) = s[j]; + } +} + +template +__global__ void kernel_backward_W(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk, + F *__restrict__ const gw, F *__restrict__ const gk, + const int B, const int C, const int T) { + const int i = blockIdx.y; + const int t = threadIdx.x << 2; + + __shared__ F k[Tmax]; + __shared__ F gg[Tmax]; + F4(k, t) = F4(__k, t + T * i); + F4(gg, t) = F4(__gwk, t + T * i); + __syncthreads(); + + float4 s = {0, 0, 0, 0}; + + const F *__restrict__ const g = gg + T - t - 4; + for (int u = 0; u <= t; u++) { + F x = k[u]; + s.x += g[u + 3] * x; + s.y += g[u + 2] * x; + s.z += g[u + 1] * x; + s.w += g[u + 0] * x; + } + s.y += g[t + 3] * k[t + 1]; + s.z += g[t + 2] * k[t + 1]; + s.z += g[t + 3] * k[t + 2]; + s.w += g[t + 1] * k[t + 1]; + s.w += g[t + 2] * k[t + 2]; + s.w += g[t + 3] * k[t + 3]; + F4(gw, t + T * i) = s; +} +void cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T) { 
+ dim3 gridDim(1, B * C / BF); + dim3 blockDim(T >> 2); + kernel_forward<< >>(w, k, x, eps, B, C, T); +} + +template +__global__ void kernel_backward(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk, + F *__restrict__ const gw, F *__restrict__ const gk, + const int B, const int C, const int T) { + const int i = blockIdx.y; + const int ij = (B * C) / BB; + const int t = threadIdx.x << 2; + + __shared__ F w[Tmax]; + __shared__ F kk[Tmax * BB]; + __shared__ F gg[Tmax * BB]; + F4(w, t) = F4(__w, t + T * (i % C)); + + #pragma unroll + for (int j = 0; j < BB; j++) { + F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j)); + F4(gg, t + Tmax * j) = F4(__gwk, t + T * (i + ij * j)); + } + __syncthreads(); + + float4 s[BB]; + #pragma unroll + for (int j = 0; j < BB; j++) { + s[j] = {0, 0, 0, 0}; + } + + for (int u = 0; u <= t; u++) { + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const g = gg + Tmax * j + T - t - 4; + F x = kk[u + Tmax * j]; + s[j].x += g[u + 3] * x; + s[j].y += g[u + 2] * x; + s[j].z += g[u + 1] * x; + s[j].w += g[u + 0] * x; + } + } + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const k = kk + Tmax * j; + const F *__restrict__ const g = gg + Tmax * j + T - t - 4; + s[j].y += g[t + 3] * k[t + 1]; + s[j].z += g[t + 2] * k[t + 1]; + s[j].z += g[t + 3] * k[t + 2]; + s[j].w += g[t + 1] * k[t + 1]; + s[j].w += g[t + 2] * k[t + 2]; + s[j].w += g[t + 3] * k[t + 3]; + F4(gw, t + T * (i + ij * j)) = s[j]; + } + + #pragma unroll + for (int j = 0; j < BB; j++) { + s[j] = {0, 0, 0, 0}; + } + + for (int u = t + 3; u < T; u++) { + F x = w[u]; + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const g = gg + Tmax * j + T + t - 3; + s[j].x += g[2 - u] * x; + s[j].y += g[3 - u] * x; + s[j].z += g[4 - u] * x; + s[j].w += g[5 - u] * x; + } + } + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const g = gg + Tmax * j + T + t - 3; 
+ s[j].x += g[2 - t] * w[t + 0]; + s[j].x += g[1 - t] * w[t + 1]; + s[j].x += g[0 - t] * w[t + 2]; + s[j].y += g[2 - t] * w[t + 1]; + s[j].y += g[1 - t] * w[t + 2]; + s[j].z += g[2 - t] * w[t + 2]; + F4(gk, t + T * (i + ij * j)) = s[j]; + } +} +void cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T) { + dim3 gridDim(1, B * C / BB); + dim3 blockDim(T >> 2); + kernel_backward<< >>(w, k, gwk, gw, gk, B, C, T); +} diff --git a/docs/src/RWKV-v2-RNN/cuda/timex_op.cpp b/docs/src/RWKV-v2-RNN/cuda/timex_op.cpp new file mode 100644 index 00000000..fd389071 --- /dev/null +++ b/docs/src/RWKV-v2-RNN/cuda/timex_op.cpp @@ -0,0 +1,21 @@ +#include + +void cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T); +void cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T); + +void forward(torch::Tensor &w, const torch::Tensor &k, torch::Tensor &x, double eps, int64_t B, int64_t C, int64_t T) { + cuda_forward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (float *)x.data_ptr(), eps, B, C, T); +} +void backward(torch::Tensor &w, const torch::Tensor &k, const torch::Tensor &gwk, torch::Tensor &gw, torch::Tensor &gk, int64_t B, int64_t C, int64_t T) { + cuda_backward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (const float *)gwk.data_ptr(), (float *)gw.data_ptr(), (float *)gk.data_ptr(), B, C, T); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &forward, "timex forward"); + m.def("backward", &backward, "timex backward"); +} + +TORCH_LIBRARY(timex, m) { + m.def("forward", forward); + m.def("backward", backward); +} diff --git a/docs/src/RWKV-v2-RNN/run.py b/docs/src/RWKV-v2-RNN/run.py new file mode 100644 index 00000000..a6ee6a2b --- /dev/null +++ b/docs/src/RWKV-v2-RNN/run.py @@ -0,0 +1,133 @@ +# -*- coding:utf-8 -*- 
+######################################################################################################## +# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import numpy as np +import math +import time +import types +import copy +import torch +from torch.nn import functional as F +from src.utils import TOKENIZER, Dataset +from src.model_run import RWKV_RNN +torch.backends.cudnn.benchmark = True +torch.backends.cudnn.allow_tf32 = True +torch.backends.cuda.matmul.allow_tf32 = True +np.set_printoptions(precision=4, suppress=True, linewidth=200) + +### Step 1: set model ################################################################################## + +ctx_len = 1024 +n_layer = 6 +n_embd = 512 +model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' + +# your trained model +MODEL_NAME = 'trained-31' +WORD_NAME = 'vocab' # the .json vocab (generated by train.py + +# ########## Uncomment these to test my 27M params enwik8 model ########## +# MODEL_NAME = 'enwik8-ppl1.65-6064-1024-RWKV-6-512-2022-03-25-21-05-13' +# WORD_NAME = 'enwik8-vocab' +# EVAL_DATA = 'enwik8' # uncomment this for EVAL MODE (no text generation) +# ######################################################################## + +# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <-- +# --> all unknown tokens in your context will be denoted by it <-- +UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity + +RUN_DEVICE = 'cpu' # 'cpu' (already very fast) or 'cuda' +DEBUG_DEBUG = False # True False - show softmax output + +### Step 2: set context ################################################################################ + +context = "\nIn the" # ==> this is your prompt + +NUM_TRIALS = 999 +LENGTH_PER_TRIAL = 500 + +TEMPERATURE = 1.0 +top_p = 0.7 +top_p_newline = 0.9 + 
+######################################################################################################## + +print(f'Loading {MODEL_NAME}...') +model = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len) +tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR) + +######################################################################################################## + +if 'EVAL_DATA' in vars() or 'EVAL_DATA' in globals(): + print('Evaluating on ' + EVAL_DATA + ' ...') + + data = open(EVAL_DATA, "r", encoding='utf-8').read() + + loss_table = np.zeros(ctx_len) + + N_SAMPLE = 1000 + + for iii in range(N_SAMPLE): + pos = np.random.randint(0, len(data) - ctx_len-1) + context = data[pos:pos+ctx_len+1] + ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context] + + model.clear() + for i in range(1, ctx_len+1): + x = ctx[:i] + out = model.run(x) + prob = F.softmax(torch.tensor(out), dim=-1) + loss_table[i-1] += -math.log(prob[ctx[i]]) + + print(f'Tested {iii+1} samples: avg_loss over ctx_len =', + np.mean(loss_table) / (iii+1)) + + exit(0) + +######################################################################################################## + +context = tokenizer.refine_context(context) +print('\nYour prompt has ' + str(len(context)) + ' tokens.') +print('\n--> Currently the first run takes a while if your prompt is long, as we are using RNN to process the prompt. This will be much faster in future versions. 
<--\n') + +for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS): + t_begin = time.time_ns() + + src_len = len(context) + ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context] + print(('-' * 30) + context, end='') + + model.clear() + if TRIAL == 0: + init_state = types.SimpleNamespace() + for i in range(src_len): + x = ctx[:i+1] + if i == src_len - 1: + init_state.out = model.run(x) + else: + model.run(x) + model.save(init_state) + else: + model.load(init_state) + + for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)): + x = ctx[:i+1] + x = x[-ctx_len:] + + if i == src_len: + out = copy.deepcopy(init_state.out) + else: + out = model.run(x) + if DEBUG_DEBUG: + print('model', np.array(x), '==>', np.array( + out), np.max(out), np.min(out)) + + char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE, + top_p_usual=top_p, top_p_newline=top_p_newline) + char = char.item() + print(tokenizer.itos[int(char)], end='', flush=True) + ctx += [char] + t_end = time.time_ns() + print("\n----------", round((t_end - t_begin) / (10 ** 9), 2), end='s ') diff --git a/docs/src/RWKV-v2-RNN/src/model.py b/docs/src/RWKV-v2-RNN/src/model.py new file mode 100644 index 00000000..afa67ed2 --- /dev/null +++ b/docs/src/RWKV-v2-RNN/src/model.py @@ -0,0 +1,349 @@ +######################################################################################################## +# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +from torch.utils.cpp_extension import load +import math +import numpy as np +import logging +import torch +import torch.nn as nn +from torch.nn import functional as F +logger = logging.getLogger(__name__) + +######################################################################################################## +# CUDA Kernel 
+######################################################################################################## + +T_MAX = 1024 # increase this if your ctx_len > 1024 +B_GROUP_FORWARD = 4 # set to 8 for best performance +B_GROUP_BACKWARD = 2 # set to 2 for best performance + +timex_cuda = load(name="timex", sources=["cuda/timex_op.cpp", "cuda/timex_cuda.cu"], + verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}', f'-DBF={B_GROUP_FORWARD}', f'-DBB={B_GROUP_BACKWARD}']) + + +class TimeX(torch.autograd.Function): + @staticmethod + def forward(ctx, w, k, B, C, T, eps): + ctx.B = B + ctx.C = C + ctx.T = T + assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0 + w = w.contiguous() + k = k.contiguous() + ctx.save_for_backward(w, k) + wk = torch.empty((B, C, T), device='cuda', + memory_format=torch.contiguous_format) + timex_cuda.forward(w, k, wk, eps, B, C, T) + return wk + + @staticmethod + def backward(ctx, gwk): + assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0 + w, k = ctx.saved_tensors + gw = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda', + memory_format=torch.contiguous_format) + gk = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda', + memory_format=torch.contiguous_format) + timex_cuda.backward(w, k, gwk.contiguous(), gw, + gk, ctx.B, ctx.C, ctx.T) + return (gw.sum(dim=0), gk, None, None, None, None) + +######################################################################################################## +# RWKV: RWKV Time-mix + RWKV Channel-mix +######################################################################################################## + + +RWKV_K_CLAMP = 60 # e^60 = 1e26 +RWKV_K_EPS = 1e-16 +RWKV_HEAD_QK_DIM = 256 + + +def RWKV_Init(module, config): # fancy initialization of all lin & emb layer in the module + for m in module.modules(): + if not isinstance(m, (nn.Linear, nn.Embedding)): + 
continue + with torch.no_grad(): + name = '[unknown weight]' + for name, parameter in module.named_parameters(): # find the name of the weight + if id(m.weight) == id(parameter): + break + + shape = m.weight.data.shape + gain = 1.0 + scale = 1.0 # extra scale for gain + + if isinstance(m, nn.Embedding): + gain = math.sqrt(max(shape[0], shape[1])) + if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb? + scale = 1e-4 + else: + scale = 0 + + if isinstance(m, nn.Linear): + if m.bias is not None: + m.bias.data.zero_() + if shape[0] > shape[1]: + gain = math.sqrt(shape[0] / shape[1]) + if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection? + scale = 0.5 + + if hasattr(m, 'scale_init'): + scale = m.scale_init + + # print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name) + + gain *= scale + if scale == -999: + nn.init.eye_(m.weight) + elif gain == 0: + # zero init is great for some RWKV matrices + nn.init.zeros_(m.weight) + elif gain > 0: + nn.init.orthogonal_(m.weight, gain=gain) + else: + nn.init.normal_(m.weight, mean=0.0, std=-scale) + + +class RWKV_TimeMix(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.layer_id = layer_id + self.ctx_len = config.ctx_len + self.n_embd = config.n_embd + + attn_sz = config.n_embd + + ############# fancy init of time_w curves ################################### + f1_begin = 3.0 + f1_end = 1.2 + f2_begin = 0.65 + f2_end = 0.4 + with torch.no_grad(): # initial time_w curves for better convergence + decay_speed = torch.ones(attn_sz, 1) + first_sa_layer_id = 1 + for h in range(attn_sz): + f1 = f1_begin + (layer_id-first_sa_layer_id) / \ + (config.n_layer-1-first_sa_layer_id) * (f1_end - f1_begin) + f2 = f2_begin + (layer_id-first_sa_layer_id) / \ + (config.n_layer-1-first_sa_layer_id) * (f2_end - f2_begin) + if layer_id == first_sa_layer_id: + f1 += 0.5 + if layer_id == config.n_layer-2: + f2 = 0.4 + if layer_id == 
config.n_layer-1: + f2 = 0.37 + decay_speed[h][0] = math.pow(f2, h / (attn_sz-1) * 7) * f1 + self.time_decay = nn.Parameter(torch.log(decay_speed)) # will use exp(self.time_decay) to ensure time_decay > 0 + self.time_curve = torch.tensor( + [-(config.ctx_len - 2 - i) for i in range(config.ctx_len-1)]).unsqueeze(0) + self.time_curve = self.time_curve.to('cuda') + self.time_first = nn.Parameter(torch.ones(attn_sz, 1) * math.log(0.3)) + ############################################################################# + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + with torch.no_grad(): # init to "shift half of the channels" + ww = torch.ones(1, 1, config.n_embd) + for i in range(config.n_embd // 2): + ww[0, 0, i] = 0 + self.time_mix = nn.Parameter(ww) + + self.key = nn.Linear(config.n_embd, attn_sz, bias=False) + self.value = nn.Linear(config.n_embd, attn_sz, bias=False) + self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False) + + self.output = nn.Linear(attn_sz, config.n_embd, bias=False) + + self.key.scale_init = 0 + self.receptance.scale_init = 0 + self.output.scale_init = 0 + + def forward(self, x): + B, T, C = x.size() + + x = x * self.time_mix + self.time_shift(x) * (1 - self.time_mix) + + k = self.key(x).transpose(-1, -2) + v = self.value(x).transpose(-1, -2) + r = self.receptance(x) + + # RWKV_K_CLAMP can be removed if the CUDA kernel substracts the correct k_max for each k (I will do this later) + k = torch.clamp(k, max=RWKV_K_CLAMP) + k = torch.exp(k) + kv = k * v + + self.time_w = torch.cat( + [torch.exp(self.time_decay) * self.time_curve, self.time_first], dim=-1) + w = torch.exp(self.time_w) + + wkv = TimeX.apply(w, kv, B, C, T, 0) + # RWKV_K_EPS can be removed if the CUDA kernel sets 0/0 = 0 (I will do this later) + wk = TimeX.apply(w, k, B, C, T, RWKV_K_EPS) + + rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2) + rwkv = self.output(rwkv) + return rwkv + + +class RWKV_ChannelMix(nn.Module): + def __init__(self, config, layer_id): + 
super().__init__() + self.layer_id = layer_id + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + + with torch.no_grad(): # init to "shift half of the channels" + x = torch.ones(1, 1, config.n_embd) + for i in range(config.n_embd // 2): + x[0, 0, i] = 0 + self.time_mix = nn.Parameter(x) + + hidden_sz = 4 * config.n_embd + self.key = nn.Linear(config.n_embd, hidden_sz, bias=False) + self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False) + self.value = nn.Linear(hidden_sz, config.n_embd, bias=False) + + self.value.scale_init = 0 + self.receptance.scale_init = 0 + + def forward(self, x): + x = x * self.time_mix + self.time_shift(x) * (1 - self.time_mix) + + k = self.key(x) + k = torch.square(torch.relu(k)) + kv = self.value(k) + + rkv = torch.sigmoid(self.receptance(x)) * kv + return rkv + +######################################################################################################## +# The GPT Model with our blocks +######################################################################################################## + + +class GPTConfig: + def __init__(self, vocab_size, ctx_len, **kwargs): + self.vocab_size = vocab_size + self.ctx_len = ctx_len + for k, v in kwargs.items(): + setattr(self, k, v) + + +class Block(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.config = config + self.layer_id = layer_id + + self.ln1 = nn.LayerNorm(config.n_embd) + self.ln2 = nn.LayerNorm(config.n_embd) + + if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre': + self.ffnPre = RWKV_ChannelMix(config, layer_id+1000) + else: + self.att = RWKV_TimeMix(config, layer_id) + + self.ffn = RWKV_ChannelMix(config, layer_id) + + def forward(self, x): + x = self.ln1(x) + if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre': + x = x + self.ffnPre(x) # better in some cases + else: + x = x + self.att(x) + x = self.ln2(x) + x = x + self.ffn(x) + return x + + +class GPT(nn.Module): + def __init__(self, config): + 
super().__init__() + self.step = 0 + self.config = config + + self.emb = nn.Embedding(config.vocab_size, config.n_embd) + + self.blocks = nn.Sequential(*[Block(config, i) + for i in range(config.n_layer)]) + + self.ln_out = nn.LayerNorm(config.n_embd) + self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False) + self.head_q.scale_init = 0 + self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False) + self.head_k.scale_init = 0.1 + self.register_buffer("copy_mask", torch.tril( + torch.ones(config.ctx_len, config.ctx_len))) + + self.ctx_len = config.ctx_len + + RWKV_Init(self, config) + + logger.info("number of parameters: %e", sum(p.numel() + for p in self.parameters())) + + def get_ctx_len(self): + return self.ctx_len + + def _init_weights(self, module): + if isinstance(module, (nn.Linear)): + module.weight.data.normal_(mean=0.0, std=0.01) + if isinstance(module, (nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=1e-5) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def configure_optimizers(self, train_config): + # separate out all parameters to those that will and won't experience regularizing weight decay + decay = set() + no_decay = set() + + for mn, m in self.named_modules(): # here we disable weight_decay + for pn, p in m.named_parameters(): + fpn = '%s.%s' % (mn, pn) if mn else pn # full param name + no_decay.add(fpn) + + param_dict = {pn: p for pn, p in self.named_parameters()} + inter_params = decay & no_decay + union_params = decay | no_decay + assert len( + inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) + assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" 
\ + % (str(param_dict.keys() - union_params), ) + + optim_groups = [ + {"params": [param_dict[pn] + for pn in sorted(list(no_decay))], "weight_decay": 0.0}, + ] + + optimizer = torch.optim.Adam( + optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps) + + return optimizer + + def forward(self, idx, targets=None): + self.step += 1 + B, T = idx.size() + assert T <= self.ctx_len, "Cannot forward, because len(input) > model ctx_len." + x = self.emb(idx) + + x = self.blocks(x) + + x = self.ln_out(x) + + q = self.head_q(x)[:, :T, :] + k = self.head_k(x)[:, :T, :] + c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM) + c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0) + + c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).float() + x = self.head(x) + c + + loss = None + if targets is not None: + loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1)) + + return x, loss diff --git a/docs/src/RWKV-v2-RNN/src/model_run.py b/docs/src/RWKV-v2-RNN/src/model_run.py new file mode 100644 index 00000000..ecb459e5 --- /dev/null +++ b/docs/src/RWKV-v2-RNN/src/model_run.py @@ -0,0 +1,143 @@ +import types +import copy +import torch +from torch.nn import functional as F + +RWKV_K_CLAMP = 60 +RWKV_K_EPS = 1e-16 +RWKV_HEAD_QK_DIM = 256 + +DEBUG_TIME = False # True False - show trained time-coeffs + + +class RWKV_RNN(): + def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len): + self.RUN_DEVICE = RUN_DEVICE + self.model_type = model_type + self.n_layer = n_layer + self.n_embd = n_embd + self.ctx_len = ctx_len + + self.w = types.SimpleNamespace() + + w = torch.load(MODEL_NAME + '.pth', + map_location=torch.device(RUN_DEVICE)) + for x in w.keys(): + if '.time_' in x: + w[x] = w[x].squeeze() + if '.time_decay' in x: + w[x] = torch.exp(-torch.exp(w[x])) + if '.time_first' in x: + w[x] = torch.exp(w[x]) + if DEBUG_TIME and '.time_' in x: + print(x, w[x].squeeze().cpu().numpy()) + + xx = x.split('.') + here = 
self.w + for i in range(len(xx)): + if xx[i].isdigit(): + ii = int(xx[i]) + if ii not in here: + here[ii] = types.SimpleNamespace() + here = here[ii] + else: + if i == len(xx) - 1: + setattr(here, xx[i], w[x]) + elif not hasattr(here, xx[i]): + if xx[i+1].isdigit(): + setattr(here, xx[i], {}) + else: + setattr(here, xx[i], types.SimpleNamespace()) + here = getattr(here, xx[i]) + + self.clear() + + def clear(self): + self.xx = {} + self.aa = {} + self.bb = {} + self.hk = None + + def save(self, target): + target.xx = copy.deepcopy(self.xx) + target.aa = copy.deepcopy(self.aa) + target.bb = copy.deepcopy(self.bb) + target.hk = copy.deepcopy(self.hk) + + def load(self, target): + self.xx = copy.deepcopy(target.xx) + self.aa = copy.deepcopy(target.aa) + self.bb = copy.deepcopy(target.bb) + self.hk = copy.deepcopy(target.hk) + + def LN(self, xx, w): + return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias) + + def FF(self, xx, w, name): + if name not in self.xx: + self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + x = xx * w.time_mix + self.xx[name] * (1 - w.time_mix) + self.xx[name] = xx + + r = torch.sigmoid(w.receptance.weight @ x) + k = torch.square(torch.relu(w.key.weight @ x)) + kv = w.value.weight @ k + + return r * kv + + def SA(self, xx, w, name): + if name not in self.xx: + self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + x = xx * w.time_mix + self.xx[name] * (1 - w.time_mix) + self.xx[name] = xx + + r = torch.sigmoid(w.receptance.weight @ x) + + k = torch.exp(torch.clamp(w.key.weight @ x, max=RWKV_K_CLAMP)) + v = w.value.weight @ x + kv = k * v + + a = self.aa[name] + w.time_first * kv + b = self.bb[name] + w.time_first * k + self.aa[name] = w.time_decay * self.aa[name] + kv + self.bb[name] = w.time_decay * self.bb[name] + k + + rwkv = r * a / (b + RWKV_K_EPS) + + return 
w.output.weight @ rwkv + + def run(self, ctx): + w = self.w + x = w.emb.weight[ctx[-1]] + + for i in range(self.n_layer): + x = self.LN(x, w.blocks[i].ln1) + if i == 0 and self.model_type == 'RWKV-ffnPre': + x = x + self.FF(x, w.blocks[i].ffnPre, f'ffnPre.{i}') + else: + x = x + self.SA(x, w.blocks[i].att, f'att.{i}') + x = self.LN(x, w.blocks[i].ln2) + x = x + self.FF(x, w.blocks[i].ffn, f'ffn.{i}') + + x = self.LN(x, w.ln_out) + + if self.hk == None: + self.hk = (w.head_k.weight @ x).unsqueeze(0) + else: + self.hk = torch.cat( + [self.hk, (w.head_k.weight @ x).unsqueeze(0)], dim=0) + if self.hk.shape[0] > self.ctx_len: + self.hk = self.hk[-self.ctx_len:, :] + + q = w.head_q.weight @ x + + x = w.head.weight @ x + x = x.cpu().numpy().tolist() + + c = (self.hk @ q) / RWKV_HEAD_QK_DIM + for i in range(len(c)): + x[ctx[i]] += c[i] + + return x diff --git a/docs/src/RWKV-v2-RNN/src/trainer.py b/docs/src/RWKV-v2-RNN/src/trainer.py new file mode 100644 index 00000000..19ea1d8e --- /dev/null +++ b/docs/src/RWKV-v2-RNN/src/trainer.py @@ -0,0 +1,170 @@ +######################################################################################################## +# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +from torch.utils.data.dataloader import DataLoader +from torch.optim.lr_scheduler import LambdaLR +from torch.nn import functional as F +import torch.nn as nn +import torch.optim as optim +import torch +from tqdm.auto import tqdm +import numpy as np +import logging +import os +import datetime +import sys +import math + +# import wandb # comment this if you don't have wandb +# print('logging to wandb... 
(comment it if you don\'t have wandb)') + +logger = logging.getLogger(__name__) +torch.backends.cudnn.benchmark = True +torch.backends.cudnn.allow_tf32 = True +torch.backends.cuda.matmul.allow_tf32 = True + +log_file = open("mylog.txt", "a") + + +class TrainerConfig: + max_epochs = 10 + batch_size = 64 + learning_rate = 4e-4 + betas = (0.9, 0.99) + eps = 1e-8 + grad_norm_clip = 1.0 + lr_decay = True # linear warmup followed by cosine decay + warmup_tokens = 0 + final_tokens = 0 + epoch_save_frequency = 0 + epoch_save_path = 'trained-' + num_workers = 0 # for DataLoader + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + +class Trainer: + + def __init__(self, model, train_dataset, test_dataset, config): + self.model = model + self.train_dataset = train_dataset + self.test_dataset = test_dataset + self.config = config + self.avg_loss = -1 + self.steps = 0 + + if 'wandb' in sys.modules: + cfg = model.config + for k in config.__dict__: + setattr(cfg, k, config.__dict__[k]) # combine cfg + wandb.init(project="RWKV-LM", name=self.get_run_name() + '-' + + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False) + + self.device = 'cpu' + if torch.cuda.is_available(): # take over whatever gpus are on the system + self.device = torch.cuda.current_device() + + def get_run_name(self): + raw_model = self.model.module if hasattr( + self.model, "module") else self.model + cfg = raw_model.config + run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \ + cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd) + return run_name + + def train(self): + model, config = self.model, self.config + raw_model = model.module if hasattr(self.model, "module") else model + optimizer = raw_model.configure_optimizers(config) + + def run_epoch(split): + is_train = split == 'train' + model.train(is_train) + data = self.train_dataset if is_train else self.test_dataset + + if config.num_workers > 0: + loader = 
DataLoader(data, shuffle=False, pin_memory=True, + batch_size=config.batch_size, + num_workers=config.num_workers) + else: + loader = DataLoader(data, shuffle=False, + batch_size=config.batch_size, + num_workers=config.num_workers) + + pbar = tqdm(enumerate(loader), total=len( + loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader) + + for it, (x, y) in pbar: + x = x.to(self.device) # place data on the correct device + y = y.to(self.device) + + with torch.set_grad_enabled(is_train): + _, loss = model(x, y) # forward the model + + if is_train: # backprop and update the parameters + model.zero_grad() + loss.backward() + + if config.grad_norm_clip > 0: + torch.nn.utils.clip_grad_norm_( + model.parameters(), config.grad_norm_clip) + + optimizer.step() + + if config.lr_decay: # decay the learning rate based on our progress + # number of tokens processed this step (i.e. label is not -100) + self.tokens += (y >= 0).sum() + lr_final_factor = config.lr_final / config.learning_rate + if self.tokens < config.warmup_tokens: + # linear warmup + lr_mult = lr_final_factor + \ + (1 - lr_final_factor) * float(self.tokens) / \ + float(config.warmup_tokens) + progress = 0 + else: + # cosine learning rate decay + progress = float(self.tokens - config.warmup_tokens) / float( + max(1, config.final_tokens - config.warmup_tokens)) + lr_mult = (0.5 + lr_final_factor / 2) + (0.5 - lr_final_factor / + 2) * math.cos(math.pi * progress) # better 1.0 ~ 0.1 + lr = config.learning_rate * lr_mult + for param_group in optimizer.param_groups: + param_group['lr'] = lr + else: + lr = config.learning_rate + + now_loss = loss.item() # report progress + self.lr = lr + + if 'wandb' in sys.modules: + wandb.log({"loss": now_loss}, + step=self.steps * self.config.batch_size) + self.steps += 1 + + if self.avg_loss < 0: + self.avg_loss = now_loss + else: + factor = 1 / (it + 1) + self.avg_loss = self.avg_loss * \ + (1.0 - factor) + now_loss * factor + pbar.set_description( 
+ f"mini-epoch {epoch+1} prog {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}") + + self.tokens = 0 # counter used for learning rate decay + for epoch in range(config.max_epochs): + + run_epoch('train') + + log_file.write( + f'{epoch+1} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} \n') + log_file.flush() + + if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1): + # DataParallel wrappers keep raw model object in .module + raw_model = self.model.module if hasattr( + self.model, "module") else self.model + torch.save(raw_model.state_dict(), + self.config.epoch_save_path + str(epoch+1) + '.pth') diff --git a/docs/src/RWKV-v2-RNN/src/utils.py b/docs/src/RWKV-v2-RNN/src/utils.py new file mode 100644 index 00000000..480518f0 --- /dev/null +++ b/docs/src/RWKV-v2-RNN/src/utils.py @@ -0,0 +1,122 @@ +######################################################################################################## +# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import json +import random +import time +import math +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.utils.data import Dataset + + +class Dataset(Dataset): + def __init__(self, data, ctx_len, epoch_length_fixed): + print('building token list...', end=' ') + unique = sorted(list(set(data))) + # print() + # for u in unique: + # print(u, end=' ') + # print('\n\n') + + xx = 0 + xxObj = {} + for u in unique: + xxObj[xx] = u + xx += 1 + with open('vocab.json', "w", encoding="utf-16") as vocab_file: + vocab_file.write(json.dumps(xxObj, ensure_ascii=False)) + + data_size, vocab_size = len(data), len(unique) + print('data has %d tokens, %d unique.' 
% (data_size, vocab_size)) + self.stoi = {ch: i for i, ch in enumerate(unique)} + self.itos = {i: ch for i, ch in enumerate(unique)} + self.ctx_len = ctx_len + self.epoch_length_fixed = epoch_length_fixed + self.vocab_size = vocab_size + self.data = data + + def __len__(self): + return self.epoch_length_fixed + + def __getitem__(self, idx): + # cheat: pick a random spot in dataset + i = np.random.randint(0, len(self.data) - (self.ctx_len + 1)) + chunk = self.data[i:i+self.ctx_len+1] + dix = [self.stoi[s] for s in chunk] + x = torch.tensor(dix[:-1], dtype=torch.long, + device=torch.device('cuda')) + y = torch.tensor(dix[1:], dtype=torch.long, + device=torch.device('cuda')) + return x, y + + +class TOKENIZER(): + def __init__(self, WORD_NAME, UNKNOWN_CHAR='\ue083'): + with open(WORD_NAME + '.json', "r", encoding="utf-16") as result_file: + self.word_table = json.load(result_file) + + self.vocab_size = len(self.word_table) + + self.stoi = {v: int(k) for k, v in self.word_table.items()} + self.itos = {int(k): v for k, v in self.word_table.items()} + + self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR] + + def refine_context(self, context): + context = context.strip().split('\n') + for c in range(len(context)): + context[c] = context[c].strip().strip('\u3000').strip('\r') + context = list(filter(lambda c: c != '', context)) + context = '\n' + ('\n'.join(context)).strip() + if context == '': + context = '\n' + + return context + + def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None): + # out[self.UNKNOWN_CHAR] = -float('Inf') + + lastChar = int(x[-1]) + + probs = F.softmax(torch.tensor(out), dim=-1) + + if self.itos[lastChar] == '\n': + top_p = top_p_newline + else: + top_p = top_p_usual + + sorted_probs, s_index = torch.sort(probs, descending=True) + + # for j in range(30): + # pp = sorted_probs[j].item() + # if pp < 0.005: + # break + # ss = self.itos[int(s_index[j])].replace('\n','_') + # print(f'{math.floor(pp*100):>3.0f}{ss}', 
end='') + # print('') + + cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy() + cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)]) + + probs[probs < cutoff] = 0 + # print("[" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + "]", end = "") + + if temperature != 1.0: + probs = probs.pow(1.0 / temperature) + + return torch.multinomial(probs, num_samples=1)[0] + + +def to_float(x): + return x.cpu().detach().numpy().flatten()[0].astype(float) + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) diff --git a/docs/src/RWKV-v2-RNN/train.py b/docs/src/RWKV-v2-RNN/train.py new file mode 100644 index 00000000..e46c0ac0 --- /dev/null +++ b/docs/src/RWKV-v2-RNN/train.py @@ -0,0 +1,98 @@ +######################################################################################################## +# The RWKV v2-RNN Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import logging +import datetime +import json +from src.model import GPT, GPTConfig +from src.trainer import Trainer, TrainerConfig +from src.utils import Dataset +import torch +import numpy as np +torch.backends.cudnn.benchmark = True +torch.backends.cudnn.allow_tf32 = True +torch.backends.cuda.matmul.allow_tf32 = True + +### Step 1: set training data ########################################################################## + +datafile = "enwik8" +datafile_encoding = 'utf-8' +# datafile_encoding = 'utf-16le' + +### Step 2: set model size ############################################################################# + +ctx_len = 1024 # ===> increase T_MAX in model.py if your ctx_len > 1024 +n_layer = 6 +n_embd = 512 + +# 'RWKV' (better for char-level English) or 'RWKV-ffnPre' (better in some cases) +model_type = 'RWKV' + +### Step 3: set batch size 
############################################################################# + +# ===> batch_size must be divisible by B_GROUP_FORWARD and B_GROUP_BACKWARD in model.py +# For example, if your batch_size = 20, you can set B_GROUP_FORWARD = 4, B_GROUP_BACKWARD = 2 +# If you see "CUDA out of memory", reduce it. Use GPU-Z to find the highest value for your VRAM. +batch_size = 12 + +### Step 4: set learning rate, training mini-epochs ####################################################### + +lr_init = 6e-4 +lr_final = 1e-5 +# the mini-epoch is very short and of fixed length (ctx_len * epoch_length_fixed tokens) +n_epoch = 500 +# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, etc. +epoch_save_frequency = 30 +epoch_save_path = 'trained-' + +epoch_length_fixed = 10000 + +######################################################################################################## + +# import src.utils +# src.utils.set_seed(42) # remember to change seed if you load a model + +np.set_printoptions(precision=4, suppress=True, linewidth=200) +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO,) + +grad_norm_clip = 1.0 +warmup_tokens = 0 + +betas = (0.9, 0.99) +eps = 4e-9 + +num_workers = 0 + +######################################################################################################## +# Load data +######################################################################################################## + +print('loading data... 
' + datafile) +train_dataset = Dataset(open( + datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed) + +######################################################################################################## +# Train model +######################################################################################################## +if __name__ == '__main__': + + model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type, + n_layer=n_layer, n_embd=n_embd)).cuda() + + # # # load a trained model. remember to change random seed + # m2 = torch.load('trained-61.pth') + # model.load_state_dict(m2) + + print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas', + betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, ) + tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size, + learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, grad_norm_clip=grad_norm_clip, + warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path) + trainer = Trainer(model, train_dataset, None, tconf) + + trainer.train() + + torch.save(model.state_dict(), 'trained-' + str(n_epoch) + '-' + trainer.get_run_name() + + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth') diff --git a/docs/src/RWKV-v3/cuda/timex_cuda.cu b/docs/src/RWKV-v3/cuda/timex_cuda.cu new file mode 100644 index 00000000..647eddcd --- /dev/null +++ b/docs/src/RWKV-v3/cuda/timex_cuda.cu @@ -0,0 +1,172 @@ +#include + +// require T <= Tmax, T % 4 == 0, B % BF == 0, B % BB === 0 (Tmax and BF and BB are passed by compiler) + +#define F4(A, B) ((float4 *)(A))[(B) >> 2] + +template +__global__ void kernel_forward(const F *__restrict__ const __w, const F *__restrict__ const __k, F *__restrict__ const x, + const F eps, const int B, const int C, const int T) { + 
const int i = blockIdx.y; + const int ij = (B * C) / BF; + const int t = threadIdx.x << 2; + + __shared__ F ww[Tmax]; + __shared__ F kk[Tmax * BF]; + F4(ww, t) = F4(__w, t + T * (i % C)); + + #pragma unroll + for (int j = 0; j < BF; j++) { + F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j)); + } + __syncthreads(); + + float4 s[BF]; + #pragma unroll + for (int j = 0; j < BF; j++) { + s[j] = {eps, eps, eps, eps}; + } + const F *__restrict__ const w = ww + T - t - 4; + for (int u = 0; u <= t; u++) { + #pragma unroll + for (int j = 0; j < BF; j++) { + const F x = kk[u + Tmax * j]; + s[j].x += w[u + 3] * x; + s[j].y += w[u + 2] * x; + s[j].z += w[u + 1] * x; + s[j].w += w[u + 0] * x; + } + } + #pragma unroll + for (int j = 0; j < BF; j++) { + const F *__restrict__ const k = kk + Tmax * j; + s[j].y += w[t + 3] * k[t + 1]; + s[j].z += w[t + 2] * k[t + 1]; + s[j].z += w[t + 3] * k[t + 2]; + s[j].w += w[t + 1] * k[t + 1]; + s[j].w += w[t + 2] * k[t + 2]; + s[j].w += w[t + 3] * k[t + 3]; + F4(x, t + T * (i + ij * j)) = s[j]; + } +} + +template +__global__ void kernel_backward_W(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk, + F *__restrict__ const gw, F *__restrict__ const gk, + const int B, const int C, const int T) { + const int i = blockIdx.y; + const int t = threadIdx.x << 2; + + __shared__ F k[Tmax]; + __shared__ F gg[Tmax]; + F4(k, t) = F4(__k, t + T * i); + F4(gg, t) = F4(__gwk, t + T * i); + __syncthreads(); + + float4 s = {0, 0, 0, 0}; + + const F *__restrict__ const g = gg + T - t - 4; + for (int u = 0; u <= t; u++) { + F x = k[u]; + s.x += g[u + 3] * x; + s.y += g[u + 2] * x; + s.z += g[u + 1] * x; + s.w += g[u + 0] * x; + } + s.y += g[t + 3] * k[t + 1]; + s.z += g[t + 2] * k[t + 1]; + s.z += g[t + 3] * k[t + 2]; + s.w += g[t + 1] * k[t + 1]; + s.w += g[t + 2] * k[t + 2]; + s.w += g[t + 3] * k[t + 3]; + F4(gw, t + T * i) = s; +} +void cuda_forward(const float *w, const float *k, float *x, float eps, int B, 
int C, int T) { + dim3 gridDim(1, B * C / BF); + dim3 blockDim(T >> 2); + kernel_forward<< >>(w, k, x, eps, B, C, T); +} + +template +__global__ void kernel_backward(const F *__restrict__ const __w, const F *__restrict__ const __k, const F *__restrict__ const __gwk, + F *__restrict__ const gw, F *__restrict__ const gk, + const int B, const int C, const int T) { + const int i = blockIdx.y; + const int ij = (B * C) / BB; + const int t = threadIdx.x << 2; + + __shared__ F w[Tmax]; + __shared__ F kk[Tmax * BB]; + __shared__ F gg[Tmax * BB]; + F4(w, t) = F4(__w, t + T * (i % C)); + + #pragma unroll + for (int j = 0; j < BB; j++) { + F4(kk, t + Tmax * j) = F4(__k, t + T * (i + ij * j)); + F4(gg, t + Tmax * j) = F4(__gwk, t + T * (i + ij * j)); + } + __syncthreads(); + + float4 s[BB]; + #pragma unroll + for (int j = 0; j < BB; j++) { + s[j] = {0, 0, 0, 0}; + } + + for (int u = 0; u <= t; u++) { + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const g = gg + Tmax * j + T - t - 4; + F x = kk[u + Tmax * j]; + s[j].x += g[u + 3] * x; + s[j].y += g[u + 2] * x; + s[j].z += g[u + 1] * x; + s[j].w += g[u + 0] * x; + } + } + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const k = kk + Tmax * j; + const F *__restrict__ const g = gg + Tmax * j + T - t - 4; + s[j].y += g[t + 3] * k[t + 1]; + s[j].z += g[t + 2] * k[t + 1]; + s[j].z += g[t + 3] * k[t + 2]; + s[j].w += g[t + 1] * k[t + 1]; + s[j].w += g[t + 2] * k[t + 2]; + s[j].w += g[t + 3] * k[t + 3]; + F4(gw, t + T * (i + ij * j)) = s[j]; + } + + #pragma unroll + for (int j = 0; j < BB; j++) { + s[j] = {0, 0, 0, 0}; + } + + for (int u = t + 3; u < T; u++) { + F x = w[u]; + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const g = gg + Tmax * j + T + t - 3; + s[j].x += g[2 - u] * x; + s[j].y += g[3 - u] * x; + s[j].z += g[4 - u] * x; + s[j].w += g[5 - u] * x; + } + } + #pragma unroll + for (int j = 0; j < BB; j++) { + const F *__restrict__ const g = gg + Tmax 
* j + T + t - 3; + s[j].x += g[2 - t] * w[t + 0]; + s[j].x += g[1 - t] * w[t + 1]; + s[j].x += g[0 - t] * w[t + 2]; + s[j].y += g[2 - t] * w[t + 1]; + s[j].y += g[1 - t] * w[t + 2]; + s[j].z += g[2 - t] * w[t + 2]; + F4(gk, t + T * (i + ij * j)) = s[j]; + } +} +void cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T) { + dim3 gridDim(1, B * C / BB); + dim3 blockDim(T >> 2); + kernel_backward<< >>(w, k, gwk, gw, gk, B, C, T); +} diff --git a/docs/src/RWKV-v3/cuda/timex_op.cpp b/docs/src/RWKV-v3/cuda/timex_op.cpp new file mode 100644 index 00000000..fd389071 --- /dev/null +++ b/docs/src/RWKV-v3/cuda/timex_op.cpp @@ -0,0 +1,21 @@ +#include + +void cuda_forward(const float *w, const float *k, float *x, float eps, int B, int C, int T); +void cuda_backward(const float *w, const float *k, const float *gwk, float *gw, float *gk, int B, int C, int T); + +void forward(torch::Tensor &w, const torch::Tensor &k, torch::Tensor &x, double eps, int64_t B, int64_t C, int64_t T) { + cuda_forward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (float *)x.data_ptr(), eps, B, C, T); +} +void backward(torch::Tensor &w, const torch::Tensor &k, const torch::Tensor &gwk, torch::Tensor &gw, torch::Tensor &gk, int64_t B, int64_t C, int64_t T) { + cuda_backward((const float *)w.data_ptr(), (const float *)k.data_ptr(), (const float *)gwk.data_ptr(), (float *)gw.data_ptr(), (float *)gk.data_ptr(), B, C, T); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &forward, "timex forward"); + m.def("backward", &backward, "timex backward"); +} + +TORCH_LIBRARY(timex, m) { + m.def("forward", forward); + m.def("backward", backward); +} diff --git a/docs/src/RWKV-v3/run.py b/docs/src/RWKV-v3/run.py new file mode 100644 index 00000000..c6862a54 --- /dev/null +++ b/docs/src/RWKV-v3/run.py @@ -0,0 +1,98 @@ +######################################################################################################## +# The RWKV 
Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import numpy as np +import math +import time +import types +import copy +import torch +from torch.nn import functional as F +from src.utils import TOKENIZER, Dataset +from src.model_run import RWKV_RNN +torch.backends.cudnn.benchmark = True +torch.backends.cudnn.allow_tf32 = True +torch.backends.cuda.matmul.allow_tf32 = True +np.set_printoptions(precision=4, suppress=True, linewidth=200) + +### Step 1: set model ################################################################################## + +ctx_len = 1024 +n_layer = 6 +n_embd = 512 +model_type = 'RWKV' # 'RWKV' or 'RWKV-ffnPre' + +# your trained model +MODEL_NAME = 'trained-1' +WORD_NAME = 'vocab' # the .json vocab (generated by train.py + +# --> set UNKNOWN_CHAR to the rarest token in your vocab.json <-- +# --> all unknown tokens in your context will be denoted by it <-- +UNKNOWN_CHAR = ' ' # here we just set it to [space] for simplicity + +RUN_DEVICE = 'cpu' # 'cpu' (already very fast) or 'cuda' +DEBUG_DEBUG = False # True False - show softmax output + +### Step 2: set context ################################################################################ + +context = "\nIn the" # ==> this is your prompt + +NUM_TRIALS = 999 +LENGTH_PER_TRIAL = 500 + +TEMPERATURE = 1.0 +top_p = 0.7 +top_p_newline = 0.9 + +######################################################################################################## + +print(f'Loading {MODEL_NAME}...') +model = RWKV_RNN(MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len) +tokenizer = TOKENIZER(WORD_NAME, UNKNOWN_CHAR=UNKNOWN_CHAR) + +######################################################################################################## + +context = tokenizer.refine_context(context) +print('\nYour prompt has ' + str(len(context)) + ' tokens.') +print('\n--> Currently the first run takes a while if 
your prompt is long, as we are using RNN to process the prompt. Use GPT to build the hidden state for better speed. <--\n') + +for TRIAL in range(1 if DEBUG_DEBUG else NUM_TRIALS): + t_begin = time.time_ns() + + src_len = len(context) + ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context] + print(('-' * 30) + context, end='') + + model.clear() + if TRIAL == 0: + init_state = types.SimpleNamespace() + for i in range(src_len): + x = ctx[:i+1] + if i == src_len - 1: + init_state.out = model.run(x) + else: + model.run(x) + model.save(init_state) + else: + model.load(init_state) + + for i in range(src_len, src_len + (1 if DEBUG_DEBUG else LENGTH_PER_TRIAL)): + x = ctx[:i+1] + x = x[-ctx_len:] + + if i == src_len: + out = copy.deepcopy(init_state.out) + else: + out = model.run(x) + if DEBUG_DEBUG: + print('model', np.array(x), '==>', np.array( + out), np.max(out), np.min(out)) + + char = tokenizer.sample_logits(out, x, ctx_len, temperature=TEMPERATURE, + top_p_usual=top_p, top_p_newline=top_p_newline) + char = char.item() + print(tokenizer.itos[int(char)], end='', flush=True) + ctx += [char] + t_end = time.time_ns() + print("\n----------", round((t_end - t_begin) / (10 ** 9), 2), end='s ') diff --git a/docs/src/RWKV-v3/src/model.py b/docs/src/RWKV-v3/src/model.py new file mode 100644 index 00000000..4275aa33 --- /dev/null +++ b/docs/src/RWKV-v3/src/model.py @@ -0,0 +1,363 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +from torch.utils.cpp_extension import load +import math +import numpy as np +import logging +import torch +import torch.nn as nn +from torch.nn import functional as F +logger = logging.getLogger(__name__) + +RWKV_K_CLAMP = 60 # e^60 = 1e26 +RWKV_K_EPS = 1e-8 +RWKV_HEAD_QK_DIM = 256 +print(f'\nRWKV_K_CLAMP 
{RWKV_K_CLAMP} RWKV_K_EPS {RWKV_K_EPS} RWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\n') + +######################################################################################################## +# CUDA Kernel +######################################################################################################## + +T_MAX = 1024 # increase this if your ctx_len > 1024 +B_GROUP_FORWARD = 4 # set to 8 for best performance +B_GROUP_BACKWARD = 2 # set to 2 for best performance (sometimes 8 is faster) + +timex_cuda = load(name="timex", sources=["cuda/timex_op.cpp", "cuda/timex_cuda.cu"], + verbose=True, extra_cuda_cflags=['--use_fast_math', '--extra-device-vectorization', f'-DTmax={T_MAX}', f'-DBF={B_GROUP_FORWARD}', f'-DBB={B_GROUP_BACKWARD}']) + + +class TimeX(torch.autograd.Function): + @staticmethod + def forward(ctx, w, k, B, C, T, eps): + ctx.B = B + ctx.C = C + ctx.T = T + assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0 + w = w.contiguous() + k = k.contiguous() + ctx.save_for_backward(w, k) + wk = torch.empty((B, C, T), device='cuda', + memory_format=torch.contiguous_format) + timex_cuda.forward(w, k, wk, eps, B, C, T) + return wk + + @staticmethod + def backward(ctx, gwk): + assert ctx.T % 4 == 0 and ctx.T <= T_MAX and ctx.B % B_GROUP_FORWARD == 0 and ctx.B % B_GROUP_BACKWARD == 0 + w, k = ctx.saved_tensors + gw = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda', + memory_format=torch.contiguous_format) + gk = torch.empty((ctx.B, ctx.C, ctx.T), device='cuda', + memory_format=torch.contiguous_format) + timex_cuda.backward(w, k, gwk.contiguous(), gw, + gk, ctx.B, ctx.C, ctx.T) + return (gw.sum(dim=0), gk, None, None, None, None) + +######################################################################################################## +# RWKV: RWKV Time-mix + RWKV Channel-mix +######################################################################################################## + +def RWKV_Init(module, config): 
# fancy initialization of all lin & emb layer in the module + for m in module.modules(): + if not isinstance(m, (nn.Linear, nn.Embedding)): + continue + with torch.no_grad(): + name = '[unknown weight]' + for name, parameter in module.named_parameters(): # find the name of the weight + if id(m.weight) == id(parameter): + break + + shape = m.weight.data.shape + gain = 1.0 + scale = 1.0 # extra scale for gain + + if isinstance(m, nn.Embedding): + gain = math.sqrt(max(shape[0], shape[1])) + if shape[0] == config.vocab_size and shape[1] == config.n_embd: # token emb? + scale = 1e-4 + else: + scale = 0 + + if isinstance(m, nn.Linear): + if m.bias is not None: + m.bias.data.zero_() + if shape[0] > shape[1]: + gain = math.sqrt(shape[0] / shape[1]) + if shape[0] == config.vocab_size and shape[1] == config.n_embd: # final projection? + scale = 0.5 + + if hasattr(m, 'scale_init'): + scale = m.scale_init + + # print(str(shape[0]).ljust(5), str(shape[1]).ljust(5), f'{round(scale,2):g}'.ljust(4), name) + + gain *= scale + if scale == -999: + nn.init.eye_(m.weight) + elif gain == 0: + # zero init is great for some RWKV matrices + nn.init.zeros_(m.weight) + elif gain > 0: + nn.init.orthogonal_(m.weight, gain=gain) + else: + nn.init.normal_(m.weight, mean=0.0, std=-scale) + + +class RWKV_TimeMix(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.layer_id = layer_id + self.ctx_len = config.ctx_len + self.n_embd = config.n_embd + + attn_sz = config.n_embd + + with torch.no_grad(): # fancy init + self.time_curve = torch.tensor([-(config.ctx_len - 2 - i) for i in range(config.ctx_len-1)]).unsqueeze(0) + self.time_curve = self.time_curve.to('cuda') + + ratio_0_to_1 = (layer_id / (config.n_layer - 1)) # 0 to 1 + ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0 + + # fancy time_decay + decay_speed = torch.ones(attn_sz, 1) + for h in range(attn_sz): + decay_speed[h][0] = -5 + 8 * (h / (attn_sz-1)) ** (0.7 + 1.3 * ratio_0_to_1) + 
self.time_decay = nn.Parameter(decay_speed) + # print(layer_id, self.time_decay.flatten()[:3].cpu().numpy(), '...', self.time_decay.flatten()[-3:].cpu().numpy()) + + # fancy time_first + zigzag = (torch.tensor([(i+1)%3 - 1 for i in range(attn_sz)]) * 0.5).unsqueeze(1) + self.time_first = nn.Parameter(torch.ones(attn_sz, 1) * math.log(0.3) + zigzag) + + # fancy time_mix + x = torch.ones(1, 1, config.n_embd) + for i in range(config.n_embd): + x[0, 0, i] = i / config.n_embd + self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0)) + self.time_mix_v = nn.Parameter(torch.pow(x, ratio_1_to_almost0) + 0.3 * ratio_0_to_1) + self.time_mix_r = nn.Parameter(torch.pow(x, 0.5 * ratio_1_to_almost0)) + + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + + self.key = nn.Linear(config.n_embd, attn_sz, bias=False) + self.value = nn.Linear(config.n_embd, attn_sz, bias=False) + self.receptance = nn.Linear(config.n_embd, attn_sz, bias=False) + + self.output = nn.Linear(attn_sz, config.n_embd, bias=False) + + self.key.scale_init = 0 + self.receptance.scale_init = 0 + self.output.scale_init = 0 + + def forward(self, x): + B, T, C = x.size() # x = (Batch,Time,Channel) + + # Mix x with the previous timestep to produce xk, xv, xr + xx = self.time_shift(x) # self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) + xv = x * self.time_mix_v + xx * (1 - self.time_mix_v) + xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) + + # Use xk, xv, xr to produce k, v, r + k = self.key(xk).transpose(-1, -2) + v = self.value(xv).transpose(-1, -2) + r = self.receptance(xr) + + # RWKV_K_CLAMP can be removed if the CUDA kernel substracts the correct k_max for each k (I will do this later) + k = torch.clamp(k, max=RWKV_K_CLAMP) # clamp k to avoid overflow + k = torch.exp(k) + kv = k * v + + # Compute the W-curve = [e^(-n * e^time_decay), e^(-(n-1) * e^time_decay), ..., 1, e^(time_first)] + self.time_w = torch.cat( + [torch.exp(self.time_decay) * 
self.time_curve, self.time_first], dim=-1) + w = torch.exp(self.time_w) + + # Use W to mix kv and k respectively. Add K_EPS to wk to avoid divide-by-zero + wkv = TimeX.apply(w, kv, B, C, T, 0) + # RWKV_K_EPS can be removed if the CUDA kernel sets 0/0 = 0 (I will do this later) + wk = TimeX.apply(w, k, B, C, T, RWKV_K_EPS) + + rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2) + rwkv = self.output(rwkv) + return rwkv + + +class RWKV_ChannelMix(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.layer_id = layer_id + + self.time_shift = nn.ZeroPad2d((0, 0, 1, -1)) + + with torch.no_grad(): # fancy init of time_mix + ratio_1_to_almost0 = (1.0 - (layer_id / config.n_layer)) # 1 to ~0 + + x = torch.ones(1, 1, config.n_embd) + for i in range(config.n_embd): + x[0, 0, i] = i / config.n_embd + + self.time_mix_k = nn.Parameter(torch.pow(x, ratio_1_to_almost0)) + self.time_mix_r = nn.Parameter(torch.pow(x, ratio_1_to_almost0)) + + hidden_sz = 4 * config.n_embd + self.key = nn.Linear(config.n_embd, hidden_sz, bias=False) + self.receptance = nn.Linear(config.n_embd, config.n_embd, bias=False) + self.value = nn.Linear(hidden_sz, config.n_embd, bias=False) + + self.value.scale_init = 0 + self.receptance.scale_init = 0 + + def forward(self, x): + xx = self.time_shift(x) + xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) + xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) + + k = self.key(xk) + k = torch.square(torch.relu(k)) + kv = self.value(k) + + rkv = torch.sigmoid(self.receptance(xr)) * kv + return rkv + +######################################################################################################## +# The GPT Model with our blocks +######################################################################################################## + + +class GPTConfig: + def __init__(self, vocab_size, ctx_len, **kwargs): + self.vocab_size = vocab_size + self.ctx_len = ctx_len + for k, v in kwargs.items(): + setattr(self, k, v) + + +class 
Block(nn.Module): + def __init__(self, config, layer_id): + super().__init__() + self.config = config + self.layer_id = layer_id + + self.ln1 = nn.LayerNorm(config.n_embd) + self.ln2 = nn.LayerNorm(config.n_embd) + + if self.layer_id == 0: + self.ln0 = nn.LayerNorm(config.n_embd) + + if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre': + self.ffnPre = RWKV_ChannelMix(config, layer_id+1000) + else: + self.att = RWKV_TimeMix(config, layer_id) + + self.ffn = RWKV_ChannelMix(config, layer_id) + + def forward(self, x): + if self.layer_id == 0: + x = self.ln0(x) + if self.layer_id == 0 and self.config.model_type == 'RWKV-ffnPre': + x = x + self.ffnPre(self.ln1(x)) # better in some cases + else: + x = x + self.att(self.ln1(x)) + x = x + self.ffn(self.ln2(x)) + return x + + +class GPT(nn.Module): + def __init__(self, config): + super().__init__() + self.step = 0 + self.config = config + + self.emb = nn.Embedding(config.vocab_size, config.n_embd) + + self.blocks = nn.Sequential(*[Block(config, i) + for i in range(config.n_layer)]) + + self.ln_out = nn.LayerNorm(config.n_embd) + self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + if RWKV_HEAD_QK_DIM > 0: + self.head_q = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False) + self.head_q.scale_init = 0 + self.head_k = nn.Linear(config.n_embd, RWKV_HEAD_QK_DIM, bias=False) + self.head_k.scale_init = 0.1 + self.register_buffer("copy_mask", torch.tril( + torch.ones(config.ctx_len, config.ctx_len))) + + self.ctx_len = config.ctx_len + + RWKV_Init(self, config) + + logger.info("number of parameters: %e", sum(p.numel() + for p in self.parameters())) + + def get_ctx_len(self): + return self.ctx_len + + def _init_weights(self, module): + if isinstance(module, (nn.Linear)): + module.weight.data.normal_(mean=0.0, std=0.01) + if isinstance(module, (nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=1e-5) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() 
+ + def configure_optimizers(self, train_config): + # separate out all parameters to those that will and won't experience regularizing weight decay + decay = set() + no_decay = set() + + for mn, m in self.named_modules(): # here we disable weight_decay + for pn, p in m.named_parameters(): + fpn = '%s.%s' % (mn, pn) if mn else pn # full param name + no_decay.add(fpn) + + param_dict = {pn: p for pn, p in self.named_parameters()} + inter_params = decay & no_decay + union_params = decay | no_decay + assert len( + inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), ) + assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \ + % (str(param_dict.keys() - union_params), ) + + optim_groups = [ + {"params": [param_dict[pn] + for pn in sorted(list(no_decay))], "weight_decay": 0.0}, + ] + + optimizer = torch.optim.Adam( + optim_groups, lr=train_config.learning_rate, betas=train_config.betas, eps=train_config.eps) + + return optimizer + + def forward(self, idx, targets=None): + self.step += 1 + B, T = idx.size() + assert T <= self.ctx_len, "Cannot forward, because len(input) > model ctx_len." 
+ x = self.emb(idx) + + x = self.blocks(x) + + x = self.ln_out(x) + + if RWKV_HEAD_QK_DIM > 0: + q = self.head_q(x)[:, :T, :] + k = self.head_k(x)[:, :T, :] + c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM) + c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0) + + c = c @ F.one_hot(idx, num_classes=self.config.vocab_size).float() + x = self.head(x) + c + else: + x = self.head(x) + + loss = None + if targets is not None: + loss = F.cross_entropy(x.view(-1, x.size(-1)), targets.view(-1)) + + return x, loss diff --git a/docs/src/RWKV-v3/src/model_run.py b/docs/src/RWKV-v3/src/model_run.py new file mode 100644 index 00000000..44ea131c --- /dev/null +++ b/docs/src/RWKV-v3/src/model_run.py @@ -0,0 +1,319 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import types +import copy +import torch +import math +from torch.nn import functional as F +import torch.nn as nn + +RWKV_K_CLAMP = 60 +RWKV_K_EPS = 1e-8 +RWKV_HEAD_QK_DIM = 256 +print(f'\nRWKV_K_CLAMP {RWKV_K_CLAMP} RWKV_K_EPS {RWKV_K_EPS} RWKV_HEAD_QK_DIM {RWKV_HEAD_QK_DIM}\n') + +DEBUG_TIME = False # True False - show trained time-coeffs + +############################################################################################################ + +RWKV_CFG = types.SimpleNamespace() + +class RWKV_ChannelMix(nn.Module): + def __init__(self, layer_id): + super().__init__() + self.layer_id = layer_id + + self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + self.time_mix_k = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd)) + self.time_mix_r = nn.Parameter(torch.ones(1, 1, RWKV_CFG.n_embd)) + + hidden_sz = 4 * RWKV_CFG.n_embd + self.key = nn.Linear(RWKV_CFG.n_embd, hidden_sz, bias=False) + self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False) + self.value = 
nn.Linear(hidden_sz, RWKV_CFG.n_embd, bias=False) + + def forward(self, x): + xx = self.time_shift(x) + xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) + xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) + + k = self.key(xk) + k = torch.square(torch.relu(k)) + kv = self.value(k) + + rkv = torch.sigmoid(self.receptance(xr)) * kv + return rkv + +class RWKV_TimeMix(nn.Module): + def __init__(self, layer_id): + super().__init__() + self.layer_id = layer_id + self.time_decay = nn.Parameter(torch.ones(RWKV_CFG.n_embd, 1)) + self.time_curve = torch.tensor([-(RWKV_CFG.ctx_len - 2 - i) for i in range(RWKV_CFG.ctx_len-1)]).unsqueeze(0) + self.time_first = nn.Parameter(torch.ones(RWKV_CFG.n_embd, 1) * math.log(0.3)) + + self.time_shift = nn.ZeroPad2d((0,0,1,-1)) + self.time_mix_k = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd)) + self.time_mix_v = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd)) + self.time_mix_r = nn.Parameter(torch.ones(1,1,RWKV_CFG.n_embd)) + + self.key = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False) + self.value = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False) + self.receptance = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False) + + self.output = nn.Linear(RWKV_CFG.n_embd, RWKV_CFG.n_embd, bias=False) + + def forward(self, x): + B, T, C = x.size() + + xx = self.time_shift(x) + xk = x * self.time_mix_k + xx * (1 - self.time_mix_k) + xv = x * self.time_mix_v + xx * (1 - self.time_mix_v) + xr = x * self.time_mix_r + xx * (1 - self.time_mix_r) + + k = self.key(xk).transpose(-1, -2) + v = self.value(xv).transpose(-1, -2) + r = self.receptance(xr) + + k = torch.clamp(k, max=RWKV_K_CLAMP) + k = torch.exp(k) + + kv = k * v + + self.time_w = torch.cat([torch.exp(self.time_decay) * self.time_curve.to(self.time_decay.device), self.time_first], dim=-1) + w = torch.exp(self.time_w) + + w = w[:,-T:].unsqueeze(1) + wkv = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(kv), w, groups=C) + wk = F.conv1d(nn.ZeroPad2d((T-1, 0, 0, 0))(k), w, groups=C) + 
RWKV_K_EPS + + rwkv = torch.sigmoid(r) * (wkv / wk).transpose(-1, -2) + + rwkv = self.output(rwkv) + return rwkv + +class Block(nn.Module): + def __init__(self, layer_id): + super().__init__() + self.layer_id = layer_id + + self.ln1 = nn.LayerNorm(RWKV_CFG.n_embd) + self.ln2 = nn.LayerNorm(RWKV_CFG.n_embd) + if self.layer_id == 0: + self.ln0 = nn.LayerNorm(RWKV_CFG.n_embd) + + if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre': + self.ffnPre = RWKV_ChannelMix(layer_id+1000) + else: + self.att = RWKV_TimeMix(layer_id) + + self.ffn = RWKV_ChannelMix(layer_id) + + def forward(self, x): + if self.layer_id == 0: + x = self.ln0(x) + if self.layer_id == 0 and RWKV_CFG.model_type == 'RWKV-ffnPre': + x = x + self.ffnPre(self.ln1(x)) + else: + x = x + self.att(self.ln1(x)) + x = x + self.ffn(self.ln2(x)) + return x + +class RWKV_GPT(nn.Module): + def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, vocab_size, n_layer, n_embd, ctx_len): + global RWKV_CFG + super().__init__() + + RWKV_CFG.RUN_DEVICE = RUN_DEVICE + RWKV_CFG.model_type = model_type + RWKV_CFG.vocab_size = vocab_size + RWKV_CFG.n_layer = n_layer + RWKV_CFG.n_embd = n_embd + RWKV_CFG.ctx_len = ctx_len + + print('\nloading RWKV-GPT', MODEL_NAME) + + self.emb = nn.Embedding(vocab_size, n_embd) + + self.blocks = nn.Sequential(*[Block(i) for i in range(n_layer)]) + + self.ln_out = nn.LayerNorm(n_embd) + self.head = nn.Linear(n_embd, vocab_size, bias=False) + + if RWKV_HEAD_QK_DIM > 0: + self.head_q = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False) + self.head_q.scale_init = 0 + self.head_k = nn.Linear(n_embd, RWKV_HEAD_QK_DIM, bias=False) + self.head_k.scale_init = 0.1 + self.register_buffer("copy_mask", torch.tril( + torch.ones(ctx_len, ctx_len))) + + self.ctx_len = ctx_len + self.eval() + self.load_state_dict(torch.load(MODEL_NAME + '.pth')) + self.eval() + + def forward(self, idx): + B, T = idx.size() + assert T <= self.ctx_len, "Cannot forward, because len(input) > model ctx_len." 
+ + x = self.emb(idx) + x = self.blocks(x) + x = self.ln_out(x) + + if RWKV_HEAD_QK_DIM > 0: + q = self.head_q(x)[:, :T, :] + k = self.head_k(x)[:, :T, :] + c = (q @ k.transpose(-2, -1)) * (1.0 / RWKV_HEAD_QK_DIM) + c = c.masked_fill(self.copy_mask[:T, :T] == 0, 0) + + c = c @ F.one_hot(idx, num_classes=RWKV_CFG.vocab_size).float() + x = self.head(x) + c + else: + x = self.head(x) + + return x + +############################################################################################################ + +class RWKV_RNN(): + def __init__(self, MODEL_NAME, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len): + self.RUN_DEVICE = RUN_DEVICE + self.model_type = model_type + self.n_layer = n_layer + self.n_embd = n_embd + self.ctx_len = ctx_len + + self.w = types.SimpleNamespace() + + w = torch.load(MODEL_NAME + '.pth', + map_location=torch.device(RUN_DEVICE)) + for x in w.keys(): + if '.time_' in x: + w[x] = w[x].squeeze() + if '.time_decay' in x: + w[x] = torch.exp(-torch.exp(w[x])) + if '.time_first' in x: + w[x] = torch.exp(w[x]) + if DEBUG_TIME and '.time_' in x: + print(x, w[x].squeeze().cpu().numpy()) + + xx = x.split('.') + here = self.w + for i in range(len(xx)): + if xx[i].isdigit(): + ii = int(xx[i]) + if ii not in here: + here[ii] = types.SimpleNamespace() + here = here[ii] + else: + if i == len(xx) - 1: + setattr(here, xx[i], w[x]) + elif not hasattr(here, xx[i]): + if xx[i+1].isdigit(): + setattr(here, xx[i], {}) + else: + setattr(here, xx[i], types.SimpleNamespace()) + here = getattr(here, xx[i]) + + self.clear() + + def clear(self): + self.xx = {} + self.aa = {} + self.bb = {} + self.hk = None + + def save(self, target): + target.xx = copy.deepcopy(self.xx) + target.aa = copy.deepcopy(self.aa) + target.bb = copy.deepcopy(self.bb) + target.hk = copy.deepcopy(self.hk) + + def load(self, target): + self.xx = copy.deepcopy(target.xx) + self.aa = copy.deepcopy(target.aa) + self.bb = copy.deepcopy(target.bb) + self.hk = copy.deepcopy(target.hk) + + def LN(self, 
xx, w): + return F.layer_norm(xx, (self.n_embd,), weight=w.weight, bias=w.bias) + + def FF(self, xx, w, name): + if name not in self.xx: + self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k) + xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r) + self.xx[name] = xx + + r = torch.sigmoid(w.receptance.weight @ xr) + k = torch.square(torch.relu(w.key.weight @ xk)) + kv = w.value.weight @ k + + return r * kv + + def SA(self, xx, w, name): + if name not in self.xx: + self.xx[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + self.aa[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + self.bb[name] = torch.zeros(self.n_embd, device=self.RUN_DEVICE) + + xk = xx * w.time_mix_k + self.xx[name] * (1 - w.time_mix_k) + xv = xx * w.time_mix_v + self.xx[name] * (1 - w.time_mix_v) + xr = xx * w.time_mix_r + self.xx[name] * (1 - w.time_mix_r) + self.xx[name] = xx + + r = torch.sigmoid(w.receptance.weight @ xr) + + k = torch.exp(torch.clamp(w.key.weight @ xk, max=RWKV_K_CLAMP)) + v = w.value.weight @ xv + kv = k * v + + a = self.aa[name] + w.time_first * kv + b = self.bb[name] + w.time_first * k + self.aa[name] = w.time_decay * self.aa[name] + kv + self.bb[name] = w.time_decay * self.bb[name] + k + + rwkv = r * a / (b + RWKV_K_EPS) + + return w.output.weight @ rwkv + + def run(self, ctx): + w = self.w + x = w.emb.weight[ctx[-1]] + + for i in range(self.n_layer): + if i == 0: + x = self.LN(x, w.blocks[i].ln0) + if i == 0 and self.model_type == 'RWKV-ffnPre': + x = x + self.FF(self.LN(x, w.blocks[i].ln1), w.blocks[i].ffnPre, f'ffnPre.{i}') + else: + x = x + self.SA(self.LN(x, w.blocks[i].ln1), w.blocks[i].att, f'att.{i}') + x = x + self.FF(self.LN(x, w.blocks[i].ln2), w.blocks[i].ffn, f'ffn.{i}') + + x = self.LN(x, w.ln_out) + + if RWKV_HEAD_QK_DIM > 0: + if self.hk == None: + self.hk = (w.head_k.weight @ x).unsqueeze(0) + else: + self.hk = torch.cat( + [self.hk, (w.head_k.weight @ 
x).unsqueeze(0)], dim=0) + if self.hk.shape[0] > self.ctx_len: + self.hk = self.hk[-self.ctx_len:, :] + + q = w.head_q.weight @ x + + x = w.head.weight @ x + x = x.cpu().numpy().tolist() + + c = (self.hk @ q) / RWKV_HEAD_QK_DIM + for i in range(len(c)): + x[ctx[i]] += c[i] + else: + x = w.head.weight @ x + x = x.cpu().numpy().tolist() + + return x diff --git a/docs/src/RWKV-v3/src/trainer.py b/docs/src/RWKV-v3/src/trainer.py new file mode 100644 index 00000000..418d72e0 --- /dev/null +++ b/docs/src/RWKV-v3/src/trainer.py @@ -0,0 +1,171 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +from torch.utils.data.dataloader import DataLoader +from torch.optim.lr_scheduler import LambdaLR +from torch.nn import functional as F +import torch.nn as nn +import torch.optim as optim +import torch +from tqdm.auto import tqdm +import numpy as np +import logging +import os +import datetime +import sys +import math + +# import wandb # comment this if you don't have wandb +# print('logging to wandb... 
(comment it if you don\'t have wandb)') + +logger = logging.getLogger(__name__) +torch.backends.cudnn.benchmark = True +torch.backends.cudnn.allow_tf32 = True +torch.backends.cuda.matmul.allow_tf32 = True + +log_file = open("mylog.txt", "a") + + +class TrainerConfig: + max_epochs = 10 + batch_size = 64 + learning_rate = 4e-4 + betas = (0.9, 0.99) + eps = 1e-8 + grad_norm_clip = 1.0 + lr_decay = True # linear warmup followed by cosine decay + warmup_tokens = 0 + final_tokens = 0 + epoch_save_frequency = 0 + epoch_save_path = 'trained-' + num_workers = 0 # for DataLoader + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + + +class Trainer: + + def __init__(self, model, train_dataset, test_dataset, config): + self.model = model + self.train_dataset = train_dataset + self.test_dataset = test_dataset + self.config = config + self.avg_loss = -1 + self.steps = 0 + + if 'wandb' in sys.modules: + cfg = model.config + for k in config.__dict__: + setattr(cfg, k, config.__dict__[k]) # combine cfg + wandb.init(project="RWKV-LM", name=self.get_run_name() + '-' + + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), config=cfg, save_code=False) + + self.device = 'cpu' + if torch.cuda.is_available(): # take over whatever gpus are on the system + self.device = torch.cuda.current_device() + + def get_run_name(self): + raw_model = self.model.module if hasattr( + self.model, "module") else self.model + cfg = raw_model.config + run_name = str(cfg.vocab_size) + '-' + str(cfg.ctx_len) + '-' + \ + cfg.model_type + '-' + str(cfg.n_layer) + '-' + str(cfg.n_embd) + return run_name + + def train(self): + model, config = self.model, self.config + raw_model = model.module if hasattr(self.model, "module") else model + optimizer = raw_model.configure_optimizers(config) + + def run_epoch(split): + is_train = split == 'train' + model.train(is_train) + data = self.train_dataset if is_train else self.test_dataset + + if config.num_workers > 0: + loader = 
DataLoader(data, shuffle=False, pin_memory=True, + batch_size=config.batch_size, + num_workers=config.num_workers) + else: + loader = DataLoader(data, shuffle=False, + batch_size=config.batch_size, + num_workers=config.num_workers) + + pbar = tqdm(enumerate(loader), total=len( + loader), bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') if is_train else enumerate(loader) + + for it, (x, y) in pbar: + x = x.to(self.device) # place data on the correct device + y = y.to(self.device) + + with torch.set_grad_enabled(is_train): + _, loss = model(x, y) # forward the model + + if is_train: # backprop and update the parameters + model.zero_grad() + loss.backward() + + if config.grad_norm_clip > 0: + torch.nn.utils.clip_grad_norm_( + model.parameters(), config.grad_norm_clip) + + optimizer.step() + + if config.lr_decay: # decay the learning rate based on our progress + # number of tokens processed this step (i.e. label is not -100) + self.tokens += (y >= 0).sum() + lr_final_factor = config.lr_final / config.learning_rate + if self.tokens < config.warmup_tokens: + # linear warmup + lr_mult = lr_final_factor + \ + (1 - lr_final_factor) * float(self.tokens) / \ + float(config.warmup_tokens) + progress = 0 + else: + # exponential learning rate decay + progress = float(self.tokens - config.warmup_tokens) / float(max(1, config.final_tokens - config.warmup_tokens)) + if progress >= 1: + lr_mult = lr_final_factor + else: + lr_mult = math.exp(math.log(lr_final_factor) * pow(progress, 1)) + lr = config.learning_rate * lr_mult + for param_group in optimizer.param_groups: + param_group['lr'] = lr + else: + lr = config.learning_rate + + now_loss = loss.item() # report progress + self.lr = lr + + if 'wandb' in sys.modules: + wandb.log({"loss": now_loss}, + step=self.steps * self.config.batch_size) + self.steps += 1 + + if self.avg_loss < 0: + self.avg_loss = now_loss + else: + factor = 1 / (it + 1) + self.avg_loss = self.avg_loss * \ + (1.0 - factor) + now_loss * factor + 
pbar.set_description( + f"mini-epoch {epoch+1} prog {progress*100.0:.2f}% iter {it}: ppl {math.exp(self.avg_loss):.2f} loss {self.avg_loss:.4f} lr {lr:e}") + + self.tokens = 0 # counter used for learning rate decay + for epoch in range(config.max_epochs): + + run_epoch('train') + + log_file.write( + f'{epoch+1} {self.avg_loss:.6f} {math.exp(self.avg_loss):.4f} {self.lr:.8f} {datetime.datetime.now()} \n') + log_file.flush() + + if (self.config.epoch_save_frequency > 0 and epoch % self.config.epoch_save_frequency == 0) or (epoch == config.max_epochs - 1): + # DataParallel wrappers keep raw model object in .module + raw_model = self.model.module if hasattr( + self.model, "module") else self.model + torch.save(raw_model.state_dict(), + self.config.epoch_save_path + str(epoch+1) + '.pth') diff --git a/docs/src/RWKV-v3/src/utils.py b/docs/src/RWKV-v3/src/utils.py new file mode 100644 index 00000000..42e9f47b --- /dev/null +++ b/docs/src/RWKV-v3/src/utils.py @@ -0,0 +1,122 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import json +import random +import time +import math +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F +from torch.utils.data import Dataset + + +class Dataset(Dataset): + def __init__(self, data, ctx_len, epoch_length_fixed): + print('building token list...', end=' ') + unique = sorted(list(set(data))) + # print() + # for u in unique: + # print(u, end=' ') + # print('\n\n') + + xx = 0 + xxObj = {} + for u in unique: + xxObj[xx] = u + xx += 1 + with open('vocab.json', "w", encoding="utf-16") as vocab_file: + vocab_file.write(json.dumps(xxObj, ensure_ascii=False)) + + data_size, vocab_size = len(data), len(unique) + print('data has %d tokens, %d unique.' 
% (data_size, vocab_size)) + self.stoi = {ch: i for i, ch in enumerate(unique)} + self.itos = {i: ch for i, ch in enumerate(unique)} + self.ctx_len = ctx_len + self.epoch_length_fixed = epoch_length_fixed + self.vocab_size = vocab_size + self.data = data + + def __len__(self): + return self.epoch_length_fixed + + def __getitem__(self, idx): + # cheat: pick a random spot in dataset + i = np.random.randint(0, len(self.data) - (self.ctx_len + 1)) + chunk = self.data[i:i+self.ctx_len+1] + dix = [self.stoi[s] for s in chunk] + x = torch.tensor(dix[:-1], dtype=torch.long, + device=torch.device('cuda')) + y = torch.tensor(dix[1:], dtype=torch.long, + device=torch.device('cuda')) + return x, y + + +class TOKENIZER(): + def __init__(self, WORD_NAME, UNKNOWN_CHAR='\ue083'): + with open(WORD_NAME + '.json', "r", encoding="utf-16") as result_file: + self.word_table = json.load(result_file) + + self.vocab_size = len(self.word_table) + + self.stoi = {v: int(k) for k, v in self.word_table.items()} + self.itos = {int(k): v for k, v in self.word_table.items()} + + self.UNKNOWN_CHAR = self.stoi[UNKNOWN_CHAR] + + def refine_context(self, context): + context = context.strip().split('\n') + for c in range(len(context)): + context[c] = context[c].strip().strip('\u3000').strip('\r') + context = list(filter(lambda c: c != '', context)) + context = '\n' + ('\n'.join(context)).strip() + if context == '': + context = '\n' + + return context + + def sample_logits(self, out, x, ctx_len, temperature=1.0, top_p_usual=None, top_p_newline=None): + # out[self.UNKNOWN_CHAR] = -float('Inf') + + lastChar = int(x[-1]) + + probs = F.softmax(torch.tensor(out), dim=-1) + + if self.itos[lastChar] == '\n': + top_p = top_p_newline + else: + top_p = top_p_usual + + sorted_probs, s_index = torch.sort(probs, descending=True) + + # for j in range(30): + # pp = sorted_probs[j].item() + # if pp < 0.005: + # break + # ss = self.itos[int(s_index[j])].replace('\n','_') + # print(f'{math.floor(pp*100):>3.0f}{ss}', 
end='') + # print('') + + cumulative_probs = torch.cumsum(sorted_probs, dim=-1).numpy() + cutoff = float(sorted_probs[np.argmax(cumulative_probs > top_p)]) + + probs[probs < cutoff] = 0 + # print("[" + str(round(cutoff,4)) + ' ' + str(round(to_float(sum(probs)),3)) + "]", end = "") + + if temperature != 1.0: + probs = probs.pow(1.0 / temperature) + + return torch.multinomial(probs, num_samples=1)[0] + + +def to_float(x): + return x.cpu().detach().numpy().flatten()[0].astype(float) + + +def set_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) diff --git a/docs/src/RWKV-v3/train.py b/docs/src/RWKV-v3/train.py new file mode 100644 index 00000000..1a07cad2 --- /dev/null +++ b/docs/src/RWKV-v3/train.py @@ -0,0 +1,118 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +import os + +# if False: # True False ---> Set to False if you don't understand it +# print("\n\n[[[ SPECIAL DEBUG MODE FOR MYSELF. DON'T ENABLE THIS IF YOU DON'T UNDERSTAND IT ]]]\n\n") +# os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# import src.utils +# src.utils.set_seed(42) # make training deterministic (including dataloader). 
if you are doing this, remember to change seed when you load a model (otherwise the dataloader loads old samples) + +import logging +import datetime +from src.model import GPT, GPTConfig +from src.trainer import Trainer, TrainerConfig +from src.utils import Dataset +import torch +import numpy as np + +np.set_printoptions(precision=4, suppress=True, linewidth=200) +logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO,) +torch.backends.cudnn.benchmark = True +torch.backends.cudnn.allow_tf32 = True +torch.backends.cuda.matmul.allow_tf32 = True + +### Step 1: set training data ########################################################################## + +datafile = "../data/enwik8" # your data +datafile_encoding = 'utf-8' +# datafile_encoding = 'utf-16le' + +### Step 2: set model size ############################################################################# +# ----> test deeper models (n_layer at least 12) to see the advantage of RWKV-3 over RWKV-2 + +ctx_len = 1024 # increase T_MAX in model.py if your ctx_len > 1024 +n_layer = 6 +n_embd = 512 + +# 'RWKV' (better for English) or 'RWKV-ffnPre' (better in some cases) +model_type = 'RWKV' + +# ---> there is a RWKV_HEAD_QK_DIM in model.py and model_run.py +# set it to 256, then it's using my headQK trick (similar to a tiny attention) to improve loss +# set it to 0, then it's a pure RNN (attention-free) + +### Step 3: set batch size ############################################################################# + +# ---> batch_size must be divisible by B_GROUP_FORWARD and B_GROUP_BACKWARD in model.py +# for example, if your batch_size = 20, you can set B_GROUP_FORWARD = 4, B_GROUP_BACKWARD = 2 +# if you see "CUDA out of memory", reduce batch_size. Use nvidia-smi to find the highest value for your GPU. 
+batch_size = 12 + +### Step 4: set learning rate, number of mini-epochs ####################################################### +# By default we are using exponential LR decay. +# +# Here are my suggestions for training a good model. +# Let's say you will train a L6-D512 model. +# 1) Set lr_init = lr_final = 8e-4. Let it run for some mini-epochs, until the improvement of loss become slow. +# 2) Check epoch_save_frequency and make sure the partially-trained model is saved. Ctrl+C to stop the run. +# 3) Set lr_init = 8e-4, lr_final = 1e-5, warmup_tokens = ctx_len * batch_size * 50, betas = (0.9, 0.999). +# 4) Search for "torch.load" here and modify it to load the partially-trained model. Continue the training. +# +# For L12-D768, set lr_init = 6e-4. For L24-D1024, set lr_init = 4e-4. For L24-D2048, set lr_init = 3e-4. + +lr_init = 8e-4 # we can use larger lr because of preLN +lr_final = 1e-5 + +# the mini-epoch is very short and of fixed length (length = ctx_len * epoch_length_fixed tokens) +n_epoch = 500 +epoch_length_fixed = 10000 + +# 0 = never, 1 = every mini-epoch, 2 = every two mini-epochs, ... +epoch_save_frequency = 10 +epoch_save_path = 'trained-' + +######################################################################################################## + +grad_norm_clip = 1.0 +warmup_tokens = ctx_len * batch_size * 0 + +betas = (0.9, 0.99) +eps = 4e-9 + +num_workers = 0 + +######################################################################################################## +# Load data +######################################################################################################## + +print('loading data... 
' + datafile) +train_dataset = Dataset(open( + datafile, "r", encoding=datafile_encoding).read(), ctx_len, epoch_length_fixed) + +######################################################################################################## +# Train model +######################################################################################################## +if __name__ == '__main__': + + model = GPT(GPTConfig(train_dataset.vocab_size, train_dataset.ctx_len, model_type=model_type, + n_layer=n_layer, n_embd=n_embd)).cuda() + + ### ---> load a trained model <--- + # m2 = torch.load('trained-61.pth') + # model.load_state_dict(m2) + + print('model', model_type, 'epoch', n_epoch, 'batchsz', batch_size, 'betas', + betas, 'eps', eps, 'ctx', ctx_len, 'layer', n_layer, 'embd', n_embd, ) + tconf = TrainerConfig(model_type=model_type, max_epochs=n_epoch, batch_size=batch_size, + learning_rate=lr_init, lr_decay=True, lr_final=lr_final, betas=betas, eps=eps, grad_norm_clip=grad_norm_clip, + warmup_tokens=warmup_tokens, final_tokens=n_epoch*len(train_dataset)*ctx_len, num_workers=num_workers, epoch_save_frequency=epoch_save_frequency, epoch_save_path=epoch_save_path) + trainer = Trainer(model, train_dataset, None, tconf) + + trainer.train() + + torch.save(model.state_dict(), 'trained-' + str(n_epoch) + '-' + trainer.get_run_name() + + '-' + datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S') + '.pth') diff --git a/docs/src/RWKV-v3/verify.py b/docs/src/RWKV-v3/verify.py new file mode 100644 index 00000000..fd911510 --- /dev/null +++ b/docs/src/RWKV-v3/verify.py @@ -0,0 +1,65 @@ +######################################################################################################## +# The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM +######################################################################################################## + +# this is for verifying the results of different models and make sure they agree with each other + +import numpy as np 
+np.set_printoptions(precision=4, suppress=True, linewidth=200) + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "0" +RUN_DEVICE = 'cuda' + +import torch +from src.model_run import RWKV_RNN, RWKV_GPT +from src.model import GPT, GPTConfig + +ctx_len = 1024 +n_layer = 6 +n_embd = 512 +model_type = 'RWKV' + +model_name = 'trained-1' + +from src.utils import TOKENIZER +tokenizer = TOKENIZER('vocab', UNKNOWN_CHAR=' ') + +######################################################################################################## + +model_train = GPT(GPTConfig(tokenizer.vocab_size, ctx_len, model_type=model_type, n_layer=n_layer, n_embd=n_embd)).cuda() +print('loading ' + model_name) +m2 = torch.load(model_name + '.pth', map_location=RUN_DEVICE) +model_train.load_state_dict(m2) + +model_rnn = RWKV_RNN(model_name, RUN_DEVICE, model_type, n_layer, n_embd, ctx_len) +model_gpt = RWKV_GPT(model_name, RUN_DEVICE, model_type, tokenizer.vocab_size, n_layer, n_embd, ctx_len).cuda() + +######################################################################################################## + +context = '\nIn a' +ctx = [tokenizer.stoi.get(s, tokenizer.UNKNOWN_CHAR) for s in context] +print(f'input len {len(ctx)} data {ctx}') + +######################################################################################################## + +print('\nRWKV-GPT output') +out = model_gpt.forward(torch.tensor(ctx).unsqueeze(0).cuda())[0].detach().cpu().numpy() +print(out) + +print('\nRWKV-RNN output') +model_rnn.clear() +src_len = len(ctx) +for i in range(src_len): + x = ctx[:i+1] + out = model_rnn.run(x) + if i < 3 or i >= src_len - 3: + print(torch.tensor(out).detach().cpu().numpy()) + if i == 2: + print('...') + +print('\nRWKV-train output') +ctx += [0] * (ctx_len - src_len) # pad to ctx_len +ctx = [ctx] * 4 # increase batch size (to make it work with B_GROUP_FORWARD & B_GROUP_BACKWARD) +out = model_train.forward(torch.tensor(ctx).cuda())[0][0][:src_len].detach().cpu().numpy() +print(out, '\n') 
diff --git a/docs/src/RWKV-v4/cuda/wkv_cuda.cu b/docs/src/RWKV-v4/cuda/wkv_cuda.cu new file mode 100644 index 00000000..6acd0f36 --- /dev/null +++ b/docs/src/RWKV-v4/cuda/wkv_cuda.cu @@ -0,0 +1,125 @@ +#include +#include + +#define MIN_VALUE (-1e38) + +template +__global__ void kernel_forward(const int B, const int T, const int C, + const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v, + F *__restrict__ const _y) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + F *__restrict__ const y = _y + _offset; + + F p = 0, q = 0, o = MIN_VALUE; + // p and q are running sums divided by exp(o) (to avoid overflows) + for (int i = 0; i < T; i++) { + const int ii = i * C; + + F no = max(o, u + k[ii]); + F A = exp(o - no); + F B = exp(u + k[ii] - no); + y[ii] = (A * p + B * v[ii]) / (A * q + B); + + no = max(w + o, k[ii]); + A = exp(w + o - no); + B = exp(k[ii] - no); + p = A * p + B * v[ii]; + q = A * q + B; + o = no; + } +} + +template +__global__ void kernel_backward(const int B, const int T, const int C, + const F *__restrict__ const _w, const F *__restrict__ const _u, const F *__restrict__ const _k, const F *__restrict__ const _v, const F *__restrict__ const _gy, + F *__restrict__ const _gw, F *__restrict__ const _gu, F *__restrict__ const _gk, F *__restrict__ const _gv) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + const int _b = idx / C; + const int _c = idx % C; + const int _offset = _b * T * C + _c; + + F u = _u[_c]; + F w = _w[_c]; + const F *__restrict__ const k = _k + _offset; + const F *__restrict__ const v = _v + _offset; + const F *__restrict__ const gy = _gy + _offset; + + F *__restrict__ const gk = _gk + _offset; + F *__restrict__ const gv = _gv + 
_offset; + + F y[Tmax], z[Tmax], zexp[Tmax]; + + F gw = 0, gu = 0; + F p = 0, q = 0; + F dpdw = 0, dqdw = 0; + F o = MIN_VALUE; + for (int i = 0; i < T; i++) { + const int ii = i * C; + F no = max(o, k[ii] + u); + F A = exp(o - no); + F B = exp(k[ii] + u - no); + + F num = A * p + B * v[ii]; + F iden = 1 / (A * q + B); + + y[i] = num * iden; + z[i] = iden; + zexp[i] = k[ii] + u - no; + + gw += gy[ii] * (dpdw - dqdw * y[i]) * iden * A; + gu += gy[ii] * (v[ii] - y[i]) * B * iden; + + no = max(w + o, k[ii]); + A = exp(w + o - no); + B = exp(k[ii] - no); + dpdw = A * (p + dpdw); + dqdw = A * (q + dqdw); + p = A * p + B * v[ii]; + q = A * q + B; + o = no; + } + + F gp = 0, gq = 0; + o = MIN_VALUE; + for (int i = T - 1; i >= 0; i--) { + const int ii = i * C; + F A = gy[ii] * z[i] * exp(zexp[i]); + F B = exp(k[ii] + o); + gk[ii] = A * (v[ii] - y[i]) + B * (gp * v[ii] + gq); + gv[ii] = A + B * gp; + + F no = max(w + o, zexp[i] - k[ii] - u); + A = exp(w + o - no); + B = gy[ii] * z[i] * exp(zexp[i] - k[ii] - u - no); + gp = A * gp + B; + gq = A * gq - B * y[i]; + o = no; + } + + // Multiply by w because the w -> -exp(w) preprocessing is halfway in the backwards pass, even though it's not in the forward pass + const int _offsetBC = _b * C + _c; + _gw[_offsetBC] += gw * _w[_c]; + _gu[_offsetBC] += gu; +} + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_forward<< >>(B, T, C, w, u, k, v, y); +} + +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *gy, float *gw, float *gu, float *gk, float *gv) { + dim3 threadsPerBlock( min(C, 32) ); // requires --maxrregcount 60 for optimal performance + assert(B * C % threadsPerBlock.x == 0); + dim3 numBlocks(B * C / threadsPerBlock.x); + kernel_backward<< >>(B, T, C, w, u, k, v, 
gy, gw, gu, gk, gv); +} diff --git a/docs/src/RWKV-v4/cuda/wkv_op.cpp b/docs/src/RWKV-v4/cuda/wkv_op.cpp new file mode 100644 index 00000000..efe56d8d --- /dev/null +++ b/docs/src/RWKV-v4/cuda/wkv_op.cpp @@ -0,0 +1,21 @@ +#include + +void cuda_forward(int B, int T, int C, float *w, float *u, float *k, float *v, float *y); +void cuda_backward(int B, int T, int C, float *w, float *u, float *k, float *v, float *gy, float *gw, float *gu, float *gk, float *gv); + +void forward(int64_t B, int64_t T, int64_t C, torch::Tensor &w, torch::Tensor &u, torch::Tensor &k, torch::Tensor &v, torch::Tensor &y) { + cuda_forward(B, T, C, w.data_ptr (), u.data_ptr (), k.data_ptr (), v.data_ptr