From 223252cda373a6abfce0c4df07c96dc672c8f00b Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 16 Jun 2023 17:08:39 +0800 Subject: [PATCH 01/37] check dir exist for storage --- qlib/data/storage/file_storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 288500c555..3cfc6b3a5f 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -80,6 +80,7 @@ def __init__(self, freq: str, future: bool, provider_uri: dict = None, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.enable_read_cache = True # TODO: make it configurable self.region = C["region"] + (Path(self.uri.parent) / self.file_name).mkdir(parents=True, exist_ok=True) @property def file_name(self) -> str: @@ -200,6 +201,7 @@ def __init__(self, market: str, freq: str, provider_uri: dict = None, **kwargs): super(FileInstrumentStorage, self).__init__(market, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{market.lower()}.txt" + (Path(self.uri.parent) / self.file_name).mkdir(parents=True, exist_ok=True) def _read_instrument(self) -> Dict[InstKT, InstVT]: if not self.uri.exists(): @@ -289,6 +291,7 @@ def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" + (Path(self.uri.parent) / self.file_name).mkdir(parents=True, exist_ok=True) def clear(self): with self.uri.open("wb") as _: From 60edbc411629df233454a031aef452fbe96732c3 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 16 Jun 2023 17:18:38 +0800 Subject: [PATCH 02/37] index should be int --- qlib/data/storage/file_storage.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 3cfc6b3a5f..f8b9f02f6f 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -80,7 +80,7 @@ def __init__(self, freq: str, future: bool, provider_uri: dict = None, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.enable_read_cache = True # TODO: make it configurable self.region = C["region"] - (Path(self.uri.parent) / self.file_name).mkdir(parents=True, exist_ok=True) + self.uri.parent.mkdir(parents=True, exist_ok=True) @property def file_name(self) -> str: @@ -201,7 +201,7 @@ def __init__(self, market: str, freq: str, provider_uri: dict = None, **kwargs): super(FileInstrumentStorage, self).__init__(market, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) self.file_name = f"{market.lower()}.txt" - (Path(self.uri.parent) / self.file_name).mkdir(parents=True, exist_ok=True) + self.uri.parent.mkdir(parents=True, exist_ok=True) def _read_instrument(self) -> Dict[InstKT, InstVT]: if not self.uri.exists(): @@ -291,7 +291,7 @@ def __init__(self, instrument: str, field: str, freq: str, provider_uri: dict = super(FileFeatureStorage, self).__init__(instrument, field, freq, **kwargs) self._provider_uri = None if provider_uri is None else C.DataPathManager.format_provider_uri(provider_uri) 
self.file_name = f"{instrument.lower()}/{field.lower()}.{freq.lower()}.bin" - (Path(self.uri.parent) / self.file_name).mkdir(parents=True, exist_ok=True) + self.uri.parent.mkdir(parents=True, exist_ok=True) def clear(self): with self.uri.open("wb") as _: From d607d81951661361fcb4a234af790d0d4992b3da Mon Sep 17 00:00:00 2001 From: John Lyu Date: Sun, 18 Jun 2023 11:27:02 +0800 Subject: [PATCH 03/37] remove to_csv twice --- qlib/data/storage/file_storage.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index f8b9f02f6f..e84ec169ce 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -236,7 +236,6 @@ def _write_instrument(self, data: Dict[InstKT, InstVT] = None) -> None: df.loc[:, [self.SYMBOL_FIELD_NAME, self.INSTRUMENT_START_FIELD, self.INSTRUMENT_END_FIELD]].to_csv( self.uri, header=False, sep=self.INSTRUMENT_SEP, index=False ) - df.to_csv(self.uri, sep="\t", encoding="utf-8", header=False, index=False) def clear(self) -> None: self._write_instrument(data={}) From 5579b8a1e8d97b328b830999e5a30a7378655ae8 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Sun, 18 Jun 2023 11:45:37 +0800 Subject: [PATCH 04/37] index should be int --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index e84ec169ce..9bc972bf9f 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -322,7 +322,7 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: # rewrite with self.uri.open("rb+") as fp: _old_data = np.fromfile(fp, dtype=" Date: Sun, 18 Jun 2023 11:54:15 +0800 Subject: [PATCH 05/37] should not get_recent_freq if folder is empty --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 9bc972bf9f..080384cf1d 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -91,7 +91,7 @@ def _freq_file(self) -> str: """the freq to read from file""" if not hasattr(self, "_freq_file_cache"): freq = Freq(self.freq) - if freq not in self.support_freq: + if self.support_freq and freq not in self.support_freq: # NOTE: uri # 1. 
If `uri` does not exist # - Get the `min_uri` of the closest `freq` under the same "directory" as the `uri` From 07083ae0a88a5817782b7af50a486192b2a9231b Mon Sep 17 00:00:00 2001 From: John Lyu Date: Mon, 19 Jun 2023 08:32:56 +0800 Subject: [PATCH 06/37] fix index missing bug --- qlib/data/storage/file_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index 080384cf1d..ea72f53e4e 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -330,7 +330,7 @@ def write(self, data_array: Union[List, np.ndarray], index: int = None) -> None: _new_df = pd.DataFrame(data_array, index=range(index, index + len(data_array)), columns=["new"]) _df = pd.concat([_old_df, _new_df], sort=False, axis=1) _df = _df.reindex(range(_df.index.min(), _df.index.max() + 1)) - _df["new"].fillna(_df["old"]).values.astype(" Union[int, None]: From 655e666243822fada1eed607036613f20a46bf10 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Tue, 27 Jun 2023 08:41:55 +0800 Subject: [PATCH 07/37] improve logging --- qlib/data/pit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 093b98cab3..2294cfc0f4 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -40,7 +40,7 @@ def _load_internal(self, instrument, start_index, end_index, freq): s = self._load_feature(instrument, -start_ws, 0, cur_time) resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan except FileNotFoundError: - get_module_logger("base").warning(f"WARN: period data not found for {str(self)}") + get_module_logger("base").warning(f"WARN: period data not found for {instrument} {str(self)} ({freq})") return pd.Series(dtype="float32", name=str(self)) resample_series = pd.Series( From 0a0d7dc6910fb9f1864ec6d5c6d020d2a409a027 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 7 Jul 2023 13:17:58 +0800 Subject: [PATCH 08/37] allow None model and dataset in SoftTopkStrategy --- qlib/contrib/strategy/cost_control.py | 18 ++++++++---------- qlib/contrib/strategy/signal_strategy.py | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/qlib/contrib/strategy/cost_control.py b/qlib/contrib/strategy/cost_control.py index ff51f484f5..8e04adc26e 100644 --- a/qlib/contrib/strategy/cost_control.py +++ b/qlib/contrib/strategy/cost_control.py @@ -13,16 +13,11 @@ class SoftTopkStrategy(WeightStrategyBase): def __init__( self, - model, - dataset, topk, order_generator_cls_or_obj=OrderGenWInteract, max_sold_weight=1.0, risk_degree=0.95, buy_method="first_fill", - trade_exchange=None, - level_infra=None, - common_infra=None, **kwargs, ): """ @@ -37,7 +32,8 @@ def __init__( average_fill: assign the weight to the stocks rank high averagely. 
""" super(SoftTopkStrategy, self).__init__( - model, dataset, order_generator_cls_or_obj, trade_exchange, level_infra, common_infra, **kwargs + order_generator_cls_or_obj=order_generator_cls_or_obj, + **kwargs, ) self.topk = topk self.max_sold_weight = max_sold_weight @@ -89,13 +85,15 @@ def generate_target_weight_position(self, score, current, trade_start_time, trad max(1 / self.topk - final_stock_weight.get(stock_id, 0), 0.0), sold_stock_weight, ) - final_stock_weight[stock_id] = final_stock_weight.get(stock_id, 0.0) + add_weight + final_stock_weight[stock_id] = ( + final_stock_weight.get(stock_id, 0.0) + add_weight + ) sold_stock_weight -= add_weight elif self.buy_method == "average_fill": for stock_id in buy_signal_stocks: - final_stock_weight[stock_id] = final_stock_weight.get(stock_id, 0.0) + sold_stock_weight / len( - buy_signal_stocks - ) + final_stock_weight[stock_id] = final_stock_weight.get( + stock_id, 0.0 + ) + sold_stock_weight / len(buy_signal_stocks) else: raise ValueError("Buy method not found") return final_stock_weight diff --git a/qlib/contrib/strategy/signal_strategy.py b/qlib/contrib/strategy/signal_strategy.py index cb94017cd4..16ffff82d7 100644 --- a/qlib/contrib/strategy/signal_strategy.py +++ b/qlib/contrib/strategy/signal_strategy.py @@ -333,7 +333,7 @@ def generate_target_weight_position(self, score, current, trade_start_time, trad Parameters ----------- - score : pd.Series + score : pd.DataFrame pred score for this trade date, index is stock_id, contain 'score' column. current : Position() current position. From a0d1450462f834465871cc66a3b864ead229ea53 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 17 Aug 2023 18:04:30 +0800 Subject: [PATCH 09/37] use line width 120 --- qlib/contrib/strategy/cost_control.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/qlib/contrib/strategy/cost_control.py b/qlib/contrib/strategy/cost_control.py index 8e04adc26e..326e29652b 100644 --- a/qlib/contrib/strategy/cost_control.py +++ b/qlib/contrib/strategy/cost_control.py @@ -85,15 +85,13 @@ def generate_target_weight_position(self, score, current, trade_start_time, trad max(1 / self.topk - final_stock_weight.get(stock_id, 0), 0.0), sold_stock_weight, ) - final_stock_weight[stock_id] = ( - final_stock_weight.get(stock_id, 0.0) + add_weight - ) + final_stock_weight[stock_id] = final_stock_weight.get(stock_id, 0.0) + add_weight sold_stock_weight -= add_weight elif self.buy_method == "average_fill": for stock_id in buy_signal_stocks: - final_stock_weight[stock_id] = final_stock_weight.get( - stock_id, 0.0 - ) + sold_stock_weight / len(buy_signal_stocks) + final_stock_weight[stock_id] = final_stock_weight.get(stock_id, 0.0) + sold_stock_weight / len( + buy_signal_stocks + ) else: raise ValueError("Buy method not found") return final_stock_weight From 70b5c9f3ee43837844f16f70d41847f1cbbd0ac4 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Sun, 25 Jun 2023 23:39:11 +0800 Subject: [PATCH 10/37] change get_data url (#1558) * change_url * fix_CI * fix_CI_2 * fix_CI_3 * fix_CI_4 * fix_CI_5 * fix_CI_6 * fix_CI_7 * fix_CI_8 * fix_CI_9 * fix_CI_10 * fix_CI_11 * fix_CI_12 * fix_CI_13 * fix_CI_13 * fix_CI_14 * fix_CI_15 * fix_CI_16 * fix_CI_17 * fix_CI_18 * fix_CI_19 * fix_CI_20 * fix_CI_21 * fix_CI_22 * fix_CI_23 * fix_CI_24 * fix_CI_25 * fix_CI_26 * fix_CI_27 * fix_get_data_error * fix_get_data_error2 * modify_get_data * modify_get_data2 * modify_get_data3 * modify_get_data4 * fix_CI_28 * fix_CI_29 * fix_CI_30 
--------- Co-authored-by: Linlang --- .github/workflows/test_qlib_from_source.yml | 21 +++-- .../workflows/test_qlib_from_source_slow.yml | 18 +++- docs/component/data.rst | 2 +- qlib/tests/data.py | 90 ++++++++++--------- setup.py | 1 + tests/test_dump_data.py | 2 +- tests/test_get_data.py | 2 +- 7 files changed, 82 insertions(+), 54 deletions(-) diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 68dfe5b3fd..0bd3517d55 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source - uses: actions/checkout@v2 + uses: actions/checkout@v3 + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. + # refs: https://github.com/actions/setup-python/issues/682 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" + + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Update pip to the latest version # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. - # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 - name: Installing pytorch for macos if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} @@ -129,8 +139,7 @@ jobs: - name: Test data downloads run: | python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn - azcopy copy https://qlibpublic.blob.core.windows.net/data/rl /tmp/qlibpublic/data --recursive - mv /tmp/qlibpublic/data tests/.data + python scripts/get_data.py download_data --file_name rl_data.zip --target_dir tests/.data/rl - name: Install Lightgbm for MacOS if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index f8e43fa179..1dfcc0179c 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -20,18 +20,28 @@ jobs: steps: - name: Test qlib from source slow - uses: actions/checkout@v2 + uses: actions/checkout@v3 + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. 
+ # refs: https://github.com/actions/setup-python/issues/682 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" + + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Set up Python tools # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. - # The pip version has been temporarily fixed to 23.0.1 + # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0.1 + python -m pip install pip==23.0 pip install --upgrade cython numpy pip install -e .[dev] diff --git a/docs/component/data.rst b/docs/component/data.rst index 60e8d4fa1b..5a2d458f68 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -119,7 +119,7 @@ Here are some example: for daily data: .. code-block:: bash - python scripts/get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data + python scripts/get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data for 1min data: .. code-block:: bash diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 2163b4bf7e..8de32f3f6c 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -1,6 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +import os import re import sys import qlib @@ -11,13 +12,15 @@ from tqdm import tqdm from pathlib import Path from loguru import logger +from cryptography.fernet import Fernet from qlib.utils import exists_qlib_data class GetData: - DATASET_VERSION = "v2" REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" - QLIB_DATA_NAME = "{dataset_name}_{region}_{interval}_{qlib_version}.zip" + # "?" is not included in the token. + TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" + KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" def __init__(self, delete_zip_file=False): """ @@ -29,24 +32,44 @@ def __init__(self, delete_zip_file=False): """ self.delete_zip_file = delete_zip_file - def normalize_dataset_version(self, dataset_version: str = None): - if dataset_version is None: - dataset_version = self.DATASET_VERSION - return dataset_version + def merge_remote_url(self, file_name: str): + fernet = Fernet(self.KEY) + token = fernet.decrypt(self.TOKEN).decode() + return f"{self.REMOTE_URL}/{file_name}?{token}" - def merge_remote_url(self, file_name: str, dataset_version: str = None): - return f"{self.REMOTE_URL}/{self.normalize_dataset_version(dataset_version)}/{file_name}" + def download_data(self, file_name: str, target_dir: [Path, str], delete_old: bool = True): + """ + Download the specified file to the target folder. 
- def _download_data( - self, file_name: str, target_dir: [Path, str], delete_old: bool = True, dataset_version: str = None - ): + Parameters + ---------- + target_dir: str + data save directory + file_name: str + dataset name, needs to endwith .zip, value from [rl_data.zip, csv_data_cn.zip, ...] + may contain folder names, for example: v2/qlib_data_simple_cn_1d_latest.zip + delete_old: bool + delete an existing directory, by default True + + Examples + --------- + # get rl data + python get_data.py download_data --file_name rl_data.zip --target_dir ~/.qlib/qlib_data/rl_data + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/rl_data.zip?{token} + + # get cn csv data + python get_data.py download_data --file_name csv_data_cn.zip --target_dir ~/.qlib/csv_data/cn_data + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/csv_data_cn.zip?{token} + ------- + + """ target_dir = Path(target_dir).expanduser() target_dir.mkdir(exist_ok=True, parents=True) # saved file name - _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + file_name + _target_file_name = datetime.datetime.now().strftime("%Y%m%d%H%M%S") + "_" + os.path.basename(file_name) target_path = target_dir.joinpath(_target_file_name) - url = self.merge_remote_url(file_name, dataset_version) + url = self.merge_remote_url(file_name) resp = requests.get(url, stream=True, timeout=60) resp.raise_for_status() if resp.status_code != 200: @@ -56,7 +79,7 @@ def _download_data( logger.warning( f"The data for the example is collected from Yahoo Finance. Please be aware that the quality of the data might not be perfect. 
(You can refer to the original data source: https://finance.yahoo.com/lookup.)" ) - logger.info(f"{file_name} downloading......") + logger.info(f"{os.path.basename(file_name)} downloading......") with tqdm(total=int(resp.headers.get("Content-Length", 0))) as p_bar: with target_path.open("wb") as fp: for chunk in resp.iter_content(chunk_size=chunk_size): @@ -67,8 +90,8 @@ def _download_data( if self.delete_zip_file: target_path.unlink() - def check_dataset(self, file_name: str, dataset_version: str = None): - url = self.merge_remote_url(file_name, dataset_version) + def check_dataset(self, file_name: str): + url = self.merge_remote_url(file_name) resp = requests.get(url, stream=True, timeout=60) status = True if resp.status_code == 404: @@ -140,9 +163,11 @@ def qlib_data( --------- # get 1d data python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1d_latest.zip?{token} # get 1min data python get_data.py qlib_data --name qlib_data --target_dir ~/.qlib/qlib_data/cn_data_1min --interval 1min --region cn + When this command is run, the data will be downloaded from this link: https://qlibpublic.blob.core.windows.net/data/default/stock_data/v2/qlib_data_cn_1min_latest.zip?{token} ------- """ @@ -155,29 +180,12 @@ def qlib_data( qlib_version = ".".join(re.findall(r"(\d+)\.+", qlib.__version__)) - def _get_file_name(v): - return self.QLIB_DATA_NAME.format( - dataset_name=name, region=region.lower(), interval=interval.lower(), qlib_version=v - ) - - file_name = _get_file_name(qlib_version) - if not self.check_dataset(file_name, version): - file_name = _get_file_name("latest") - self._download_data(file_name.lower(), target_dir, delete_old, dataset_version=version) - - def csv_data_cn(self, target_dir="~/.qlib/csv_data/cn_data"): - """download cn csv data from remote - - Parameters - ---------- - target_dir: str - data save directory - - Examples - --------- - python get_data.py csv_data_cn --target_dir ~/.qlib/csv_data/cn_data - ------- + def _get_file_name_with_version(qlib_version, dataset_version): + dataset_version = "v2" if dataset_version is None else dataset_version + file_name_with_version = f"{dataset_version}/{name}_{region.lower()}_{interval.lower()}_{qlib_version}.zip" + return file_name_with_version - """ - file_name = "csv_data_cn.zip" - self._download_data(file_name, target_dir) + file_name = _get_file_name_with_version(qlib_version, dataset_version=version) + if not self.check_dataset(file_name): + file_name = _get_file_name_with_version("latest", dataset_version=version) + self.download_data(file_name.lower(), target_dir, delete_old) diff --git a/setup.py b/setup.py index 109fed2135..9d7c185ab9 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ def get_version(rel_path: str) -> str: "gym", # Installing the latest version of protobuf for python versions below 3.8 will cause unit tests to fail. 
"protobuf<=3.20.1;python_version<='3.8'", + "cryptography", ] # Numpy include diff --git a/tests/test_dump_data.py b/tests/test_dump_data.py index dfa7f8556d..33cae4e808 100644 --- a/tests/test_dump_data.py +++ b/tests/test_dump_data.py @@ -35,7 +35,7 @@ class TestDumpData(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - GetData().csv_data_cn(SOURCE_DIR) + GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR) TestDumpData.DUMP_DATA = DumpDataAll(csv_path=SOURCE_DIR, qlib_dir=QLIB_DIR, include_fields=cls.FIELDS) TestDumpData.STOCK_NAMES = list(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) provider_uri = str(QLIB_DIR.resolve()) diff --git a/tests/test_get_data.py b/tests/test_get_data.py index 93a852f554..94e685e1fb 100644 --- a/tests/test_get_data.py +++ b/tests/test_get_data.py @@ -42,7 +42,7 @@ def test_0_qlib_data(self): self.assertFalse(df.dropna().empty, "get qlib data failed") def test_1_csv_data(self): - GetData().csv_data_cn(SOURCE_DIR) + GetData().download_data(file_name="csv_data_cn.zip", target_dir=SOURCE_DIR) stock_name = set(map(lambda x: x.name[:-4].upper(), SOURCE_DIR.glob("*.csv"))) self.assertEqual(len(stock_name), 85, "get csv data failed") From 58f73de454ba6c1a7ba8cd9df81238af24de6902 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Sun, 25 Jun 2023 23:48:37 +0800 Subject: [PATCH 11/37] Update release-drafter.yml (#1569) * Update release-drafter.yml * Update release-drafter.yml --- .github/release-drafter.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml index ec8ea5d69a..488419d527 100644 --- a/.github/release-drafter.yml +++ b/.github/release-drafter.yml @@ -14,6 +14,9 @@ categories: label: - 'doc' - 'documentation' + - title: '🧹 Maintenance' + label: + - 'maintenance' change-template: '- $TITLE @$AUTHOR (#$NUMBER)' change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. version-resolver: @@ -30,4 +33,4 @@ version-resolver: template: | ## Changes - $CHANGES \ No newline at end of file + $CHANGES From ba2df87b7a338db082aaae8538db90e92a3f122f Mon Sep 17 00:00:00 2001 From: you-n-g Date: Mon, 26 Jun 2023 00:00:46 +0800 Subject: [PATCH 12/37] Update __init__.py --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 11d22cc230..96daaad1a5 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. from pathlib import Path -__version__ = "0.9.1.99" +__version__ = "0.9.2" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 1e9140dbaa2391343c9650c61835b019a12f0597 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Tue, 27 Jun 2023 11:55:40 +0800 Subject: [PATCH 13/37] Update __init__.py --- qlib/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/__init__.py b/qlib/__init__.py index 96daaad1a5..a963a8c285 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. 
from pathlib import Path -__version__ = "0.9.2" +__version__ = "0.9.2.99" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 194ac598f11f6b1530a5525f24cb677f370713b1 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 28 Jun 2023 10:53:58 +0800 Subject: [PATCH 14/37] Update README.md for RL (#1573) * Update README.md * Update README.md --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index c09e1276e2..539700a910 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ For more details, please refer to our paper ["Qlib: An AI-oriented Quantitative
  • Adapting to Market Dynamics
+  • Reinforcement Learning: modeling continuous decisions
  • @@ -392,6 +393,17 @@ Here is a list of solutions built on `Qlib`. - [Rolling Retraining](examples/benchmarks_dynamic/baseline/) - [DDG-DA on pytorch (Wendi, et al. AAAI 2022)](examples/benchmarks_dynamic/DDG-DA/) +## Reinforcement Learning: modeling continuous decisions +Qlib now supports reinforcement learning, a feature designed to model continuous investment decisions. This functionality assists investors in optimizing their trading strategies by learning from interactions with the environment to maximize some notion of cumulative reward. + +Here is a list of solutions built on `Qlib` categorized by scenarios. + +### [RL for order execution](examples/rl_order_execution) +[Here](https://qlib.readthedocs.io/en/latest/component/rl/overall.html#order-execution) is the introduction of this scenario. All the methods below are compared [here](examples/rl_order_execution). +- [TWAP](examples/rl_order_execution/exp_configs/backtest_twap.yml) +- [PPO: "An End-to-End Optimal Trade Execution Framework based on Proximal Policy Optimization", IJCAL 2020](examples/rl_order_execution/exp_configs/backtest_ppo.yml) +- [OPDS: "Universal Trading for Order Execution with Oracle Policy Distillation", AAAI 2021](examples/rl_order_execution/exp_configs/backtest_opds.yml) + # Quant Dataset Zoo Dataset plays a very important role in Quant. Here is a list of the datasets built on `Qlib`: From a656648ec9432651ad186006ed82ccf9a3d7db51 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 5 Jul 2023 21:23:15 +0800 Subject: [PATCH 15/37] fix_pip_ci (#1584) * fix_pip_ci * fix_ci_get_data_error --------- Co-authored-by: Linlang --- .github/workflows/test_qlib_from_pip.yml | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index e6202e57ed..346dd49606 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -19,10 +19,20 @@ jobs: steps: - name: Test qlib from pip - uses: actions/checkout@v2 + uses: actions/checkout@v3 + # Since version 3.7 of python for MacOS is installed in CI, version 3.7.17, this version causes "_bz not found error". + # So we make the version number of python 3.7 for MacOS more specific. + # refs: https://github.com/actions/setup-python/issues/682 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + if: (matrix.os == 'macos-latest' && matrix.python-version == '3.7') || (matrix.os == 'macos-11' && matrix.python-version == '3.7') + uses: actions/setup-python@v4 + with: + python-version: "3.7.16" + + - name: Set up Python ${{ matrix.python-version }} + if: (matrix.os != 'macos-latest' || matrix.python-version != '3.7') && (matrix.os != 'macos-11' || matrix.python-version != '3.7') + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -50,7 +60,9 @@ jobs: - name: Downloads dependencies data run: | - python scripts/get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn + cd .. 
+ python -m qlib.run.get_data qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn + cd qlib - name: Test workflow by config run: | From 706138c801a57a9d6899c42cd764434f7de66026 Mon Sep 17 00:00:00 2001 From: Yang <3349368+m3ngyang@users.noreply.github.com> Date: Thu, 6 Jul 2023 12:38:52 +0800 Subject: [PATCH 16/37] fix download token (#1577) --- qlib/tests/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qlib/tests/data.py b/qlib/tests/data.py index 8de32f3f6c..f6bd780905 100644 --- a/qlib/tests/data.py +++ b/qlib/tests/data.py @@ -19,7 +19,7 @@ class GetData: REMOTE_URL = "https://qlibpublic.blob.core.windows.net/data/default/stock_data" # "?" is not included in the token. - TOKEN = "gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" + TOKEN = b"gAAAAABkmDhojHc0VSCDdNK1MqmRzNLeDFXe5hy8obHpa6SDQh4de6nW5gtzuD-fa6O_WZb0yyqYOL7ndOfJX_751W3xN5YB4-n-P22jK-t6ucoZqhT70KPD0Lf0_P328QPJVZ1gDnjIdjhi2YLOcP4BFTHLNYO0mvzszR8TKm9iT5AKRvuysWnpi8bbYwGU9zAcJK3x9EPL43hOGtxliFHcPNGMBoJW4g_ercdhi0-Qgv5_JLsV-29_MV-_AhuaYvJuN2dEywBy" KEY = "EYcA8cgorA8X9OhyMwVfuFxn_1W3jGk6jCbs3L2oPoA=" def __init__(self, delete_zip_file=False): From fab4e0a8be72b2cbf36a64a5bcefc498e8eadf6f Mon Sep 17 00:00:00 2001 From: Lewen Wang <49936435+lwwang1995@users.noreply.github.com> Date: Fri, 7 Jul 2023 15:40:03 +0800 Subject: [PATCH 17/37] Update qlibrl docs. (#1588) * Update qlibrl docs. * Update docs/component/rl/guidance.rst * Update docs/component/rl/guidance.rst * Update docs/component/rl/guidance.rst --------- Co-authored-by: Litzy Co-authored-by: you-n-g --- docs/component/rl/guidance.rst | 32 +++++++++++++++++++++ docs/component/rl/overall.rst | 52 +++++++++++++++++++++++----------- docs/component/rl/toctree.rst | 1 + 3 files changed, 69 insertions(+), 16 deletions(-) create mode 100644 docs/component/rl/guidance.rst diff --git a/docs/component/rl/guidance.rst b/docs/component/rl/guidance.rst new file mode 100644 index 0000000000..7f917d5594 --- /dev/null +++ b/docs/component/rl/guidance.rst @@ -0,0 +1,32 @@ + +======== +Guidance +======== +.. currentmodule:: qlib + +QlibRL can help users quickly get started and conveniently implement quantitative strategies based on reinforcement learning(RL) algorithms. For different user groups, we recommend the following guidance to use QlibRL. + +Beginners to Reinforcement Learning Algorithms +============================================== +Whether you are a quantitative researcher who wants to understand what RL can do in trading or a learner who wants to get started with RL algorithms in trading scenarios, if you have limited knowledge of RL and want to shield various detailed settings to quickly get started with RL algorithms, we recommend the following sequence to learn qlibrl: + - Learn the fundamentals of RL in `part1 `_. + - Understand the trading scenarios where RL methods can be applied in `part2 `_. + - Run the examples in `part3 `_ to solve trading problems using RL. + - If you want to further explore QlibRL and make some customizations, you need to first understand the framework of QlibRL in `part4 `_ and rewrite specific components according to your needs. 
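To make the loop these materials describe concrete, below is a deliberately toy sketch of the agent-environment interaction covered in part1. It uses no QlibRL API, and every name in it is invented for the example.

.. code-block:: python

    import random

    class ToyMarketEnv:
        """A fake environment with two actions: hold (0) or trade (1)."""

        def reset(self):
            self.t = 0
            return 0.0  # initial observation

        def step(self, action):
            self.t += 1
            # trading earns a noisy positive drift in this toy world
            reward = random.gauss(0.01 if action == 1 else 0.0, 0.1)
            done = self.t >= 10
            return random.random(), reward, done

    def policy(obs):  # a placeholder policy; an RL algorithm would learn this
        return random.choice([0, 1])

    env = ToyMarketEnv()
    obs, done, total_reward = env.reset(), False, 0.0
    while not done:
        obs, reward, done = env.step(policy(obs))
        total_reward += reward  # the cumulative signal RL tries to maximize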
+ +Reinforcement Learning Algorithm Researcher +============================================== +If you are already familiar with existing RL algorithms and dedicated to researching RL algorithms but lack domain knowledge in the financial field, and you want to validate the effectiveness of your algorithms in financial trading scenarios, we recommend the following steps to get started with QlibRL: + - Understand the trading scenarios where RL methods can be applied in `part2 `_. + - Choose an RL application scenario (currently, QlibRL has implemented two scenario examples: order execution and algorithmic trading). Run the example in `part3 `_ to get it working. + - Modify the `policy `_ part to incorporate your own RL algorithm. + +Quantitative Researcher +======================= +If you have a certain level of financial domain knowledge and coding skills, and you want to explore the application of RL algorithms in the investment field, we recommend the following steps to explore QlibRL: + - Learn the fundamentals of RL in `part1 `_. + - Understand the trading scenarios where RL methods can be applied in `part2 `_. + - Run the examples in `part3 `_ to solve trading problems using RL. + - Understand the framework of QlibRL in `part4 `_. + - Choose a suitable RL algorithm based on the characteristics of the problem you want to solve (currently, QlibRL supports PPO and DQN algorithms based on tianshou). + - Design the MDP (Markov Decision Process) process based on market trading rules and the problem you want to solve. Refer to the example in order execution and make corresponding modifications to the following modules: `State `_, `Metrics `_, `ActionInterpreter `_, `StateInterpreter `_, `Reward `_, `Observation `_, `Simulator `_. \ No newline at end of file diff --git a/docs/component/rl/overall.rst b/docs/component/rl/overall.rst index 4f59dd17a7..f586a07e22 100644 --- a/docs/component/rl/overall.rst +++ b/docs/component/rl/overall.rst @@ -4,7 +4,7 @@ Reinforcement Learning in Quantitative Trading Reinforcement Learning ====================== -Different from supervised learning tasks such as classification tasks and regression tasks. Another important paradigm in machine learning is Reinforcement Learning, +Different from supervised learning tasks such as classification tasks and regression tasks. Another important paradigm in machine learning is Reinforcement Learning(RL), which attempts to optimize an accumulative numerical reward signal by directly interacting with the environment under a few assumptions such as Markov Decision Process(MDP). As demonstrated in the following figure, an RL system consists of four elements, 1)the agent 2) the environment the agent interacts with 3) the policy that the agent follows to take actions on the environment and 4)the reward signal from the environment to the agent. @@ -25,26 +25,46 @@ The Qlib Reinforcement Learning toolkit (QlibRL) is an RL platform for quantitat Potential Application Scenarios in Quantitative Trading ======================================================= -RL methods have already achieved outstanding achievement in many applications, such as game playing, resource allocating, recommendation, marketing and advertising, etc. -Investment is always a continuous process, taking the stock market as an example, investors need to control their positions and stock holdings by one or more buying and selling behaviors, to maximize the investment returns. 
-Besides, each buy and sell decision is made by investors after fully considering the overall market information and stock information. -From the view of an investor, the process could be described as a continuous decision-making process generated according to interaction with the market, such problems could be solved by the RL algorithms. -Following are some scenarios where RL can potentially be used in quantitative investment. +RL methods have demonstrated remarkable achievements in various applications, including game playing, resource allocation, recommendation systems, marketing, and advertising. +In the context of investment, which involves continuous decision-making, let's consider the example of the stock market. Investors strive to optimize their investment returns by effectively managing their positions and stock holdings through various buying and selling behaviors. +Furthermore, investors carefully evaluate market conditions and stock-specific information before making each buying or selling decision. From an investor's perspective, this process can be viewed as a continuous decision-making process driven by interactions with the market. RL algorithms offer a promising approach to tackle such challenges. +Here are several scenarios where RL holds potential for application in quantitative investment. + +Order Execution +--------------- +The order execution task is to execute orders efficiently while considering multiple factors, including optimal prices, minimizing trading costs, reducing market impact, maximizing order fullfill rates, and achieving execution within a specified time frame. RL can be applied to such tasks by incorporating these objectives into the reward function and action selection process. Specifically, the RL agent interacts with the market environment, observes the state from market information, and makes decisions on next step execution. The RL algorithm learns an optimal execution strategy through trial and error, aiming to maximize the expected cumulative reward, which incorporates the desired objectives. + + - General Setting + - Environment: The environment represents the financial market where order execution takes place. It encompasses variables such as the order book dynamics, liquidity, price movements, and market conditions. + + - State: The state refers to the information available to the RL agent at a given time step. It typically includes features such as the current order book state (bid-ask spread, order depth), historical price data, historical trading volume, market volatility, and any other relevant information that can aid in decision-making. + + - Action: The action is the decision made by the RL agent based on the observed state. In order execution, actions can include selecting the order size, price, and timing of execution. + + - Reward: The reward is a scalar signal that indicates the performance of the RL agent's action in the environment. The reward function is designed to encourage actions that lead to efficient and cost-effective order execution. It typically considers multiple objectives, such as maximizing price advantages, minimizing trading costs (including transaction fees and slippage), reducing market impact (the effect of the order on the market price) and maximizing order fullfill rates. + + - Scenarios + - Single-asset order execution: Single-asset order execution focuses on the task of executing a single order for a specific asset, such as a stock or a cryptocurrency. 
The primary objective is to execute the order efficiently while considering factors such as maximizing price advantages, minimizing trading costs, reducing market impact, and achieving a high fullfill rate. The RL agent interacts with the market environment and makes decisions on order size, price, and timing of execution for that particular asset. The goal is to learn an optimal execution strategy for the single asset, maximizing the expected cumulative reward while considering the specific dynamics and characteristics of that asset. + + - Multi-asset order execution: Multi-asset order execution expands the order execution task to involve multiple assets or securities. It typically involves executing a portfolio of orders across different assets simultaneously or sequentially. Unlike single-asset order execution, the focus is not only on the execution of individual orders but also on managing the interactions and dependencies between different assets within the portfolio. The RL agent needs to make decisions on the order sizes, prices, and timings for each asset in the portfolio, considering their interdependencies, cash constraints, market conditions, and transaction costs. The goal is to learn an optimal execution strategy that balances the execution efficiency for each asset while considering the overall performance and objectives of the portfolio as a whole. + +The choice of settings and RL algorithm depends on the specific requirements of the task, available data, and desired performance objectives. Portfolio Construction ---------------------- -Portfolio construction is a process of selecting securities optimally by taking a minimum risk to achieve maximum returns. With an RL-based solution, an agent allocates stocks at every time step by obtaining information for each stock and the market. The key is to develop of policy for building a portfolio and make the policy able to pick the optimal portfolio. +Portfolio construction is a process of selecting and allocating assets in an investment portfolio. RL provides a framework to optimize portfolio management decisions by learning from interactions with the market environment and maximizing long-term returns while considering risk management. + - General Setting + - State: The state represents the current information about the market and the portfolio. It typically includes historical prices and volumes, technical indicators, and other relevant data. -Order Execution ---------------- -As a fundamental problem in algorithmic trading, order execution aims at fulfilling a specific trading order, either liquidation or acquirement, for a given instrument. Essentially, the goal of order execution is twofold: it not only requires to fulfill the whole order but also targets a more economical execution with maximizing profit gain (or minimizing capital loss). The order execution with only one order of liquidation or acquirement is called single-asset order execution. + - Action: The action corresponds to the decision of allocating capital to different assets in the portfolio. It determines the weights or proportions of investments in each asset. + + - Reward: The reward is a metric that evaluates the performance of the portfolio. It can be defined in various ways, such as total return, risk-adjusted return, or other objectives like maximizing Sharpe ratio or minimizing drawdown. 
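To make the reward definitions above concrete, the sketch below computes the three quantities mentioned (total return, a Sharpe-style risk-adjusted return, and maximum drawdown) from a series of per-period portfolio returns; the annualization factor of 252 is an assumption for daily data.

.. code-block:: python

    import numpy as np

    def reward_metrics(returns: np.ndarray, ann_factor: int = 252):
        """Candidate reward signals from per-period portfolio returns."""
        total_return = float(np.prod(1 + returns) - 1)
        # risk-adjusted return (Sharpe-style); epsilon guards a zero std
        sharpe = float(np.mean(returns) / (np.std(returns) + 1e-12) * np.sqrt(ann_factor))
        wealth = np.cumprod(1 + returns)
        max_drawdown = float(np.max(1 - wealth / np.maximum.accumulate(wealth)))
        return total_return, sharpe, max_drawdown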
-Considering stock investment always aim to pursue long-term maximized profits, it usually manifests as a sequential process of continuously adjusting the asset portfolios, execution for multiple orders, including order of liquidation and acquirement, brings more constraints and makes the sequence of execution for different orders should be considered, e.g. before executing an order to buy some stocks, we have to sell at least one stock. The order execution with multiple assets is called multi-asset order execution. + - Scenarios + - Stock market: RL can be used to construct portfolios of stocks, where the agent learns to allocate capital among different stocks. -According to the order execution’s trait of sequential decision-making, an RL-based solution could be applied to solve the order execution. With an RL-based solution, an agent optimizes execution strategy by interacting with the market environment. + - Cryptocurrency market: RL can be applied to construct portfolios of cryptocurrencies, where the agent learns to make allocation decisions. -With QlibRL, the RL algorithm in the above scenarios can be easily implemented. + - Foreign exchange (Forex) market: RL can be used to construct portfolios of currency pairs, where the agent learns to allocate capital across different currencies based on exchange rate data, economic indicators, and other factors. -Nested Portfolio Construction and Order Executor ------------------------------------------------- -QlibRL makes it possible to jointly optimize different levels of strategies/models/agents. Take `Nested Decision Execution Framework `_ as an example, the optimization of order execution strategy and portfolio management strategies can interact with each other to maximize returns. +Similarly, the choice of basic setting and algorithm depends on the specific requirements of the problem and the characteristics of the market. \ No newline at end of file diff --git a/docs/component/rl/toctree.rst b/docs/component/rl/toctree.rst index d79d5e060d..4b88de06e3 100644 --- a/docs/component/rl/toctree.rst +++ b/docs/component/rl/toctree.rst @@ -5,6 +5,7 @@ Reinforcement Learning in Quantitative Trading ======================================================================== .. toctree:: + Guidance Overall Quick Start Framework From 9a0291fc2540058b9780f7971146cab25d967dfa Mon Sep 17 00:00:00 2001 From: you-n-g Date: Wed, 12 Jul 2023 09:59:09 +0800 Subject: [PATCH 18/37] Postpone PR stale. 
(#1591) --- .github/workflows/stale.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index b07bdf1e7d..6ce457dfdc 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -18,7 +18,8 @@ jobs: stale-issue-label: 'stale' stale-pr-label: 'stale' days-before-stale: 90 + days-before-pr-stale: 365 days-before-close: 5 operations-per-run: 100 exempt-issue-labels: 'bug,enhancement' - remove-stale-when-updated: true \ No newline at end of file + remove-stale-when-updated: true From d9936c456ae76da443a75a7fb1d489cc125abc4e Mon Sep 17 00:00:00 2001 From: you-n-g Date: Fri, 14 Jul 2023 12:16:12 +0800 Subject: [PATCH 19/37] Adjust rolling api (#1594) * Intermediate version * Fix yaml template & Successfully run rolling * Be compatible with benchmark * Get same results with previous linear model * Black formatting * Update black * Update the placeholder mechanism * Update CI * Update CI * Upgrade Black * Fix CI and simplify code * Fix CI * Move the data processing caching mechanism into utils. * Adjusting DDG-DA * Organize import --- .github/workflows/python-publish.yml | 2 +- .github/workflows/test_qlib_from_pip.yml | 1 + .github/workflows/test_qlib_from_source.yml | 3 + .pre-commit-config.yaml | 4 +- docs/component/workflow.rst | 8 +- .../workflow_config_adarnn_Alpha360.yaml | 3 +- .../ADD/workflow_config_add_Alpha360.yaml | 4 +- .../ALSTM/workflow_config_alstm_Alpha158.yaml | 4 +- .../ALSTM/workflow_config_alstm_Alpha360.yaml | 4 +- .../workflow_config_catboost_Alpha158.yaml | 4 +- ...kflow_config_catboost_Alpha158_csi500.yaml | 4 +- .../workflow_config_catboost_Alpha360.yaml | 4 +- ...kflow_config_catboost_Alpha360_csi500.yaml | 4 +- ...rkflow_config_doubleensemble_Alpha158.yaml | 4 +- ...config_doubleensemble_Alpha158_csi500.yaml | 4 +- ...rkflow_config_doubleensemble_Alpha360.yaml | 4 +- ...config_doubleensemble_Alpha360_csi500.yaml | 4 +- ...ig_doubleensemble_early_stop_Alpha158.yaml | 4 +- .../GATs/workflow_config_gats_Alpha158.yaml | 4 +- .../GATs/workflow_config_gats_Alpha360.yaml | 4 +- .../GRU/workflow_config_gru_Alpha158.yaml | 4 +- .../GRU/workflow_config_gru_Alpha360.yaml | 4 +- .../HIST/workflow_config_hist_Alpha360.yaml | 6 +- .../IGMTF/workflow_config_igmtf_Alpha360.yaml | 3 +- .../KRNN/workflow_config_krnn_Alpha360.yaml | 4 +- .../LSTM/workflow_config_lstm_Alpha158.yaml | 4 +- .../LSTM/workflow_config_lstm_Alpha360.yaml | 4 +- .../benchmarks/LightGBM/multi_freq_handler.py | 1 - .../workflow_config_lightgbm_Alpha158.yaml | 3 +- ...kflow_config_lightgbm_Alpha158_csi500.yaml | 3 +- ...w_config_lightgbm_Alpha158_multi_freq.yaml | 4 +- .../workflow_config_lightgbm_Alpha360.yaml | 4 +- ...kflow_config_lightgbm_Alpha360_csi500.yaml | 4 +- ..._config_lightgbm_configurable_dataset.yaml | 4 +- .../workflow_config_lightgbm_multi_freq.yaml | 4 +- .../workflow_config_linear_Alpha158.yaml | 4 +- ...orkflow_config_linear_Alpha158_csi500.yaml | 4 +- .../workflow_config_localformer_Alpha158.yaml | 4 +- .../workflow_config_localformer_Alpha360.yaml | 4 +- .../MLP/workflow_config_mlp_Alpha158.yaml | 4 +- .../workflow_config_mlp_Alpha158_csi500.yaml | 4 +- .../MLP/workflow_config_mlp_Alpha360.yaml | 4 +- .../workflow_config_mlp_Alpha360_csi500.yaml | 4 +- .../SFM/workflow_config_sfm_Alpha360.yaml | 4 +- .../workflow_config_sandwich_Alpha360.yaml | 4 +- .../TCN/workflow_config_tcn_Alpha158.yaml | 3 +- .../TCN/workflow_config_tcn_Alpha360.yaml | 3 +- .../TCTS/workflow_config_tcts_Alpha360.yaml | 6 +- 
.../benchmarks/TFT/data_formatters/base.py | 1 - .../benchmarks/TFT/expt_settings/configs.py | 1 - .../benchmarks/TFT/libs/hyperparam_opt.py | 2 - examples/benchmarks/TFT/libs/tft_model.py | 3 - .../TFT/workflow_config_tft_Alpha158.yaml | 4 +- examples/benchmarks/TRA/example.py | 2 - examples/benchmarks/TRA/src/dataset.py | 2 - examples/benchmarks/TRA/src/model.py | 7 - .../TRA/workflow_config_tra_Alpha158.yaml | 4 +- .../workflow_config_tra_Alpha158_full.yaml | 4 +- .../TRA/workflow_config_tra_Alpha360.yaml | 4 +- .../workflow_config_TabNet_Alpha158.yaml | 4 +- .../workflow_config_TabNet_Alpha360.yaml | 4 +- .../workflow_config_transformer_Alpha158.yaml | 4 +- .../workflow_config_transformer_Alpha360.yaml | 4 +- .../workflow_config_xgboost_Alpha158.yaml | 4 +- .../workflow_config_xgboost_Alpha360.yaml | 4 +- examples/benchmarks_dynamic/DDG-DA/README.md | 4 +- .../benchmarks_dynamic/DDG-DA/workflow.py | 307 ++-------------- .../benchmarks_dynamic/baseline/README.md | 7 +- .../baseline/rolling_benchmark.py | 162 +-------- .../workflow_config_lightgbm_Alpha158.yaml | 3 +- .../workflow_config_linear_Alpha158.yaml | 4 +- examples/highfreq/highfreq_handler.py | 1 - examples/highfreq/workflow.py | 1 - .../LightGBM/hyperparameter_158.py | 1 - .../LightGBM/hyperparameter_360.py | 1 - examples/model_interpreter/feature.py | 1 - examples/portfolio/prepare_riskdata.py | 3 - examples/rolling_process_data/workflow.py | 2 - examples/workflow_by_code.py | 1 - qlib/__init__.py | 1 - qlib/backtest/__init__.py | 1 - qlib/backtest/exchange.py | 1 - qlib/config.py | 1 - qlib/contrib/data/dataset.py | 5 - qlib/contrib/data/highfreq_handler.py | 2 - qlib/contrib/data/highfreq_processor.py | 1 - qlib/contrib/meta/data_selection/dataset.py | 2 + qlib/contrib/model/pytorch_adarnn.py | 2 - qlib/contrib/model/pytorch_alstm.py | 7 - qlib/contrib/model/pytorch_alstm_ts.py | 9 +- qlib/contrib/model/pytorch_gats.py | 4 - qlib/contrib/model/pytorch_gats_ts.py | 8 - qlib/contrib/model/pytorch_gru.py | 7 - qlib/contrib/model/pytorch_gru_ts.py | 9 +- qlib/contrib/model/pytorch_hist.py | 3 - qlib/contrib/model/pytorch_igmtf.py | 4 - qlib/contrib/model/pytorch_localformer.py | 8 - qlib/contrib/model/pytorch_localformer_ts.py | 6 - qlib/contrib/model/pytorch_lstm.py | 6 - qlib/contrib/model/pytorch_lstm_ts.py | 9 +- qlib/contrib/model/pytorch_sfm.py | 6 - qlib/contrib/model/pytorch_tabnet.py | 4 - qlib/contrib/model/pytorch_tcn.py | 6 - qlib/contrib/model/pytorch_tcn_ts.py | 4 - qlib/contrib/model/pytorch_tcts.py | 7 - qlib/contrib/model/pytorch_tra.py | 10 - qlib/contrib/model/pytorch_transformer.py | 8 - qlib/contrib/model/pytorch_transformer_ts.py | 6 - qlib/contrib/model/xgboost.py | 1 - qlib/contrib/report/data/ana.py | 1 - qlib/contrib/report/data/base.py | 1 - qlib/contrib/report/graph.py | 1 - qlib/contrib/rolling/__init__.py | 7 + qlib/contrib/rolling/__main__.py | 16 + qlib/contrib/rolling/base.py | 246 +++++++++++++ qlib/contrib/rolling/ddgda.py | 343 ++++++++++++++++++ qlib/contrib/strategy/optimizer/optimizer.py | 1 - qlib/contrib/strategy/rule_strategy.py | 1 - qlib/contrib/strategy/signal_strategy.py | 2 - qlib/contrib/tuner/config.py | 2 - qlib/contrib/tuner/pipeline.py | 4 - qlib/contrib/tuner/tuner.py | 7 - qlib/data/cache.py | 5 - qlib/data/data.py | 1 - qlib/data/dataset/processor.py | 1 - qlib/data/dataset/utils.py | 8 +- qlib/data/pit.py | 1 - qlib/data/storage/file_storage.py | 3 - qlib/log.py | 1 - qlib/model/riskmodel/poet.py | 1 - qlib/tests/__init__.py | 2 - qlib/utils/__init__.py | 230 ++---------- 
qlib/utils/index_data.py | 1 - qlib/utils/mod.py | 235 ++++++++++++ qlib/workflow/record_temp.py | 1 - qlib/workflow/task/gen.py | 1 - qlib/workflow/task/utils.py | 34 +- scripts/check_dump_bin.py | 1 - scripts/data_collector/base.py | 2 - scripts/data_collector/br_index/collector.py | 1 - scripts/data_collector/us_index/collector.py | 1 - scripts/dump_pit.py | 1 - tests/backtest/test_high_freq_trading.py | 1 - .../test_handler_storage.py | 4 - tests/misc/test_sepdf.py | 1 - tests/rolling_tests/test_update_pred.py | 1 - tests/storage_tests/test_storage.py | 1 - tests/test_get_data.py | 1 - 148 files changed, 1034 insertions(+), 1027 deletions(-) create mode 100644 qlib/contrib/rolling/__init__.py create mode 100644 qlib/contrib/rolling/__main__.py create mode 100644 qlib/contrib/rolling/base.py create mode 100644 qlib/contrib/rolling/ddgda.py create mode 100644 qlib/utils/mod.py diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index db14fbf3be..e95a9e88c8 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -38,7 +38,7 @@ jobs: TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | twine upload dist/* - + deploy_with_manylinux: runs-on: ubuntu-latest steps: diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 346dd49606..f5db06ccba 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -8,6 +8,7 @@ on: jobs: build: + if: ${{ false }} # FIXME: temporarily disable... Due to we are rushing a feature timeout-minutes: 120 runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 0bd3517d55..7271287dcb 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -64,7 +64,10 @@ jobs: python -m pip install -e .[dev] - name: Lint with Black + # Python 3.7 will use a black with low level. So we use python with higher version for black check + if: (matrix.python-version != '3.7') run: | + pip install -U black # follow the latest version of black, previous Qlib dependency will downgrade black black . -l 120 --check --diff - name: Make html with sphinx diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ea57aeb0ee..15f00414c9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,6 @@ repos: - repo: https://github.com/psf/black - rev: 22.6.0 + rev: 23.7.0 hooks: - id: black args: ["qlib", "-l 120"] @@ -9,4 +9,4 @@ repos: rev: 4.0.1 hooks: - id: flake8 - args: ["--ignore=E501,F541,E266,E402,W503,E731,E203"] \ No newline at end of file + args: ["--ignore=E501,F541,E266,E402,W503,E731,E203"] diff --git a/docs/component/workflow.rst b/docs/component/workflow.rst index 9b84ae4ca8..19ba980a1f 100644 --- a/docs/component/workflow.rst +++ b/docs/component/workflow.rst @@ -53,9 +53,7 @@ Below is a typical config file of ``qrun``. 
kwargs: topk: 50 n_drop: 5 - signal: - - - - + signal: backtest: limit_threshold: 0.095 account: 100000000 @@ -281,9 +279,7 @@ The following script is the configuration of `backtest` and the `strategy` used kwargs: topk: 50 n_drop: 5 - signal: - - - - + signal: backtest: limit_threshold: 0.095 account: 100000000 diff --git a/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml b/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml index ac49d01457..ae2bad5cc8 100644 --- a/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml +++ b/examples/benchmarks/ADARNN/workflow_config_adarnn_Alpha360.yaml @@ -28,8 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml b/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml index 033d4d22e4..b2168a1b8a 100644 --- a/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml +++ b/examples/benchmarks/ADD/workflow_config_add_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml index a8e89e3607..568505ee3b 100755 --- a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml +++ b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml index 3aa8147fcf..b345cacd98 100644 --- a/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml +++ b/examples/benchmarks/ALSTM/workflow_config_alstm_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml index 2eb642741b..635611ffa6 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml index bb7c42fd0a..c40f0f81ad 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha158_csi500.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml index 
982963eeae..136ab7e6fc 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml index da4962b54f..448140702d 100644 --- a/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml +++ b/examples/benchmarks/CatBoost/workflow_config_catboost_Alpha360_csi500.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml index 85cc0a2703..58a01d63ab 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml index b2358c6bfe..ea92fbc7c4 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha158_csi500.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml index 74db1f362f..edb5e960f2 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml index f10355f226..ec8afefb45 100644 --- a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_Alpha360_csi500.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml index b3c38870e6..3960aca158 100644 --- 
a/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml +++ b/examples/benchmarks/DoubleEnsemble/workflow_config_doubleensemble_early_stop_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml b/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml index e056bc845a..0710f31817 100644 --- a/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml +++ b/examples/benchmarks/GATs/workflow_config_gats_Alpha158.yaml @@ -35,9 +35,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml b/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml index 2effecd617..095e0bade9 100644 --- a/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml +++ b/examples/benchmarks/GATs/workflow_config_gats_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml b/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml index 7c525c12a2..a2f03a2302 100755 --- a/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml +++ b/examples/benchmarks/GRU/workflow_config_gru_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml b/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml index 2daaa01362..f5d837a061 100644 --- a/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml +++ b/examples/benchmarks/GRU/workflow_config_gru_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml b/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml index b3e96f4854..cd50b33879 100644 --- a/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml +++ b/examples/benchmarks/HIST/workflow_config_hist_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: @@ -89,4 +87,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config \ No newline at end of file + config: *port_analysis_config diff --git a/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml b/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml index 1fc908ea9a..838e660649 100644 --- a/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml +++ b/examples/benchmarks/IGMTF/workflow_config_igmtf_Alpha360.yaml @@ -28,8 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git 
a/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml b/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml index 691607ad14..b5a3e3bc00 100644 --- a/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml +++ b/examples/benchmarks/KRNN/workflow_config_krnn_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml index bf3738bc06..522f6443cb 100755 --- a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml +++ b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml index d550cacb29..e4f9b2fe94 100644 --- a/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml +++ b/examples/benchmarks/LSTM/workflow_config_lstm_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/multi_freq_handler.py b/examples/benchmarks/LightGBM/multi_freq_handler.py index b3e138192d..1d4ba2b82b 100644 --- a/examples/benchmarks/LightGBM/multi_freq_handler.py +++ b/examples/benchmarks/LightGBM/multi_freq_handler.py @@ -48,7 +48,6 @@ def __init__( ) def loader_config(self): - # Results for dataset: df: pd.DataFrame # len(df.columns) == 6 + 6 * 16, len(df.index.get_level_values(level="datetime").unique()) == T # df.columns: close0, close1, ..., close16, open0, ..., open16, ..., vwap16 diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml index 2d441dea92..5ae3168015 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158.yaml @@ -14,8 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml index 327e7fffaa..aa017bc9bf 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_csi500.yaml @@ -14,8 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml index 6b58ea4bd2..0e63b23f88 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha158_multi_freq.yaml @@ -33,9 +33,7 @@ port_analysis_config: &port_analysis_config kwargs: topk: 50 n_drop: 5 - 
signal: - - - - + signal: backtest: verbose: False limit_threshold: 0.095 diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml index 053c5bd29b..e43a390a26 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml index 767050919f..aa3ac8b5ea 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_Alpha360_csi500.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml index f1ffc45da2..7a784a5c86 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_configurable_dataset.yaml @@ -29,9 +29,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml b/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml index 11b277ce60..af867a24ea 100644 --- a/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml +++ b/examples/benchmarks/LightGBM/workflow_config_lightgbm_multi_freq.yaml @@ -31,9 +31,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml index 290a8bc42d..e65dae2505 100644 --- a/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml +++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158.yaml @@ -27,9 +27,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml index 53e12b9998..bff2e6a74e 100644 --- a/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml +++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158_csi500.yaml @@ -27,9 +27,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml index 7f5a78e744..e3200f129f 100644 --- 
a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml +++ b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml index 9de80a350e..39c0093acf 100644 --- a/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml +++ b/examples/benchmarks/Localformer/workflow_config_localformer_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml index b2012ba8cf..6c85546ca2 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158.yaml @@ -41,9 +41,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml index 8628898d33..745c9b017f 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha158_csi500.yaml @@ -41,9 +41,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml index 359e792024..b9cccd52e9 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360.yaml @@ -29,9 +29,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml index 3862295f66..2156334630 100644 --- a/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml +++ b/examples/benchmarks/MLP/workflow_config_mlp_Alpha360_csi500.yaml @@ -29,9 +29,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml b/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml index d750a9980b..d992af3427 100644 --- a/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml +++ b/examples/benchmarks/SFM/workflow_config_sfm_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml b/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml index 717a034710..29e67d67e4 
100644 --- a/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml +++ b/examples/benchmarks/Sandwich/workflow_config_sandwich_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml b/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml index c6f663f948..dcb7508a42 100755 --- a/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml +++ b/examples/benchmarks/TCN/workflow_config_tcn_Alpha158.yaml @@ -36,8 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml b/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml index e383662fc1..4756a93b23 100644 --- a/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml +++ b/examples/benchmarks/TCN/workflow_config_tcn_Alpha360.yaml @@ -28,8 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml b/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml index 460a470bb6..7adf97582a 100644 --- a/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml +++ b/examples/benchmarks/TCTS/workflow_config_tcts_Alpha360.yaml @@ -30,9 +30,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: @@ -95,4 +93,4 @@ task: - class: PortAnaRecord module_path: qlib.workflow.record_temp kwargs: - config: *port_analysis_config \ No newline at end of file + config: *port_analysis_config diff --git a/examples/benchmarks/TFT/data_formatters/base.py b/examples/benchmarks/TFT/data_formatters/base.py index 9df0448bab..9cdce6382d 100644 --- a/examples/benchmarks/TFT/data_formatters/base.py +++ b/examples/benchmarks/TFT/data_formatters/base.py @@ -139,7 +139,6 @@ def get_column_definition(self): # Sanity checks first. 
# Ensure only one ID and time column exist def _check_single_column(input_type): - length = len([tup for tup in column_definition if tup[2] == input_type]) if length != 1: diff --git a/examples/benchmarks/TFT/expt_settings/configs.py b/examples/benchmarks/TFT/expt_settings/configs.py index 62aa68c38a..55eb32a0b1 100644 --- a/examples/benchmarks/TFT/expt_settings/configs.py +++ b/examples/benchmarks/TFT/expt_settings/configs.py @@ -78,7 +78,6 @@ def data_csv_path(self): @property def hyperparam_iterations(self): - return 240 if self.experiment == "volatility" else 60 def make_data_formatter(self): diff --git a/examples/benchmarks/TFT/libs/hyperparam_opt.py b/examples/benchmarks/TFT/libs/hyperparam_opt.py index e18f5b7163..86f587d7db 100644 --- a/examples/benchmarks/TFT/libs/hyperparam_opt.py +++ b/examples/benchmarks/TFT/libs/hyperparam_opt.py @@ -88,7 +88,6 @@ def load_results(self): params_file = os.path.join(self.hyperparam_folder, "params.csv") if os.path.exists(results_file) and os.path.exists(params_file): - self.results = pd.read_csv(results_file, index_col=0) self.saved_params = pd.read_csv(params_file, index_col=0) @@ -178,7 +177,6 @@ def _get_next(): return parameters for _ in range(self._max_tries): - parameters = _get_next() name = self._get_name(parameters) diff --git a/examples/benchmarks/TFT/libs/tft_model.py b/examples/benchmarks/TFT/libs/tft_model.py index aa055e2947..2a1a2fa152 100644 --- a/examples/benchmarks/TFT/libs/tft_model.py +++ b/examples/benchmarks/TFT/libs/tft_model.py @@ -475,7 +475,6 @@ def get_tft_embeddings(self, all_inputs): embeddings = [] for i in range(num_categorical_variables): - embedding = tf.keras.Sequential( [ tf.keras.layers.InputLayer([time_steps]), @@ -680,7 +679,6 @@ def _batch_single_entity(input_data): data_map = {} for _, sliced in data.groupby(id_col): - col_mappings = {"identifier": [id_col], "time": [time_col], "outputs": [target_col], "inputs": input_cols} for k in col_mappings: @@ -954,7 +952,6 @@ def build_model(self): """ with tf.variable_scope(self.name): - transformer_layer, all_inputs, attention_components = self._build_base_graph() outputs = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(self.output_size * len(self.quantiles)))( diff --git a/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml b/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml index d83878e3ea..e925fb772d 100644 --- a/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml +++ b/examples/benchmarks/TFT/workflow_config_tft_Alpha158.yaml @@ -16,9 +16,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TRA/example.py b/examples/benchmarks/TRA/example.py index defacf412a..0d52c87750 100644 --- a/examples/benchmarks/TRA/example.py +++ b/examples/benchmarks/TRA/example.py @@ -6,7 +6,6 @@ def main(seed, config_file="configs/config_alstm.yaml"): - # set random seed with open(config_file) as f: config = yaml.safe_load(f) @@ -30,7 +29,6 @@ def main(seed, config_file="configs/config_alstm.yaml"): if __name__ == "__main__": - # set params from cmd parser = argparse.ArgumentParser(allow_abbrev=False) parser.add_argument("--seed", type=int, default=1000, help="random seed") diff --git a/examples/benchmarks/TRA/src/dataset.py b/examples/benchmarks/TRA/src/dataset.py index 6740b1cbdf..de4b2ad411 100644 --- a/examples/benchmarks/TRA/src/dataset.py +++ b/examples/benchmarks/TRA/src/dataset.py @@ -96,7 
+96,6 @@ def __init__( drop_last=False, **kwargs, ): - assert horizon > 0, "please specify `horizon` to avoid data leakage" self.seq_len = seq_len @@ -111,7 +110,6 @@ def __init__( super().__init__(handler, segments, **kwargs) def setup_data(self, handler_kwargs: dict = None, **kwargs): - super().setup_data() # change index to diff --git a/examples/benchmarks/TRA/src/model.py b/examples/benchmarks/TRA/src/model.py index cff94388ed..affb115a10 100644 --- a/examples/benchmarks/TRA/src/model.py +++ b/examples/benchmarks/TRA/src/model.py @@ -45,7 +45,6 @@ def __init__( avg_params=True, **kwargs, ): - np.random.seed(seed) torch.manual_seed(seed) @@ -93,7 +92,6 @@ def __init__( self.global_step = -1 def train_epoch(self, data_set): - self.model.train() self.tra.train() @@ -146,7 +144,6 @@ def train_epoch(self, data_set): return total_loss def test_epoch(self, data_set, return_pred=False): - self.model.eval() self.tra.eval() data_set.eval() @@ -204,7 +201,6 @@ def test_epoch(self, data_set, return_pred=False): return metrics, preds def fit(self, dataset, evals_result=dict()): - train_set, valid_set, test_set = dataset.prepare(["train", "valid", "test"]) best_score = -1 @@ -380,7 +376,6 @@ def __init__( self.output_size = hidden_size def forward(self, x): - x = self.input_drop(x) if self.training and self.noise_level > 0: @@ -464,7 +459,6 @@ def __init__( self.output_size = hidden_size def forward(self, x): - x = self.input_drop(x) if self.training and self.noise_level > 0: @@ -514,7 +508,6 @@ def __init__(self, input_size, num_states=1, hidden_size=8, tau=1.0, src_info="L self.predictors = nn.Linear(input_size, num_states) def forward(self, hidden, hist_loss): - preds = self.predictors(hidden) if self.num_states == 1: diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml index c86f87fc65..02c4ecac39 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158.yaml @@ -57,9 +57,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml index 75f18f3ee6..9ccf56e865 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha158_full.yaml @@ -51,9 +51,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml index 9ab5b904ba..29686d7dac 100644 --- a/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml +++ b/examples/benchmarks/TRA/workflow_config_tra_Alpha360.yaml @@ -51,9 +51,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml index d9b94e86c3..7549688b97 100644 --- a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml +++ b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha158.yaml @@ -28,9 +28,7 @@ 
port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml index 830943d6bd..7155d25b18 100644 --- a/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml +++ b/examples/benchmarks/TabNet/workflow_config_TabNet_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml index e36d44c431..ce5105108a 100644 --- a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml +++ b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha158.yaml @@ -36,9 +36,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml index cab46a4d45..35342de949 100644 --- a/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml +++ b/examples/benchmarks/Transformer/workflow_config_transformer_Alpha360.yaml @@ -28,9 +28,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml index 5ee38cf701..0c7f55d022 100644 --- a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml +++ b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha158.yaml @@ -14,9 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml index 7c98bd40cc..8e7b543722 100644 --- a/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml +++ b/examples/benchmarks/XGBoost/workflow_config_xgboost_Alpha360.yaml @@ -21,9 +21,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks_dynamic/DDG-DA/README.md b/examples/benchmarks_dynamic/DDG-DA/README.md index 4d49315bd3..ac4349d91e 100644 --- a/examples/benchmarks_dynamic/DDG-DA/README.md +++ b/examples/benchmarks_dynamic/DDG-DA/README.md @@ -16,12 +16,12 @@ Though the dataset is different, the conclusion remains the same. By applying `D # Run the Code Users can try `DDG-DA` by running the following command: ```bash - python workflow.py run_all + python workflow.py run ``` The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `forecast_model` parameter when `DDG-DA` initializes. 
For example, users can try `LightGBM` forecasting models by running the following command: ```bash - python workflow.py --forecast_model="gbdt" run_all + python workflow.py --conf_path=../workflow_config_lightgbm_Alpha158.yaml run ``` # Results diff --git a/examples/benchmarks_dynamic/DDG-DA/workflow.py b/examples/benchmarks_dynamic/DDG-DA/workflow.py index fef86726de..7593fe374f 100644 --- a/examples/benchmarks_dynamic/DDG-DA/workflow.py +++ b/examples/benchmarks_dynamic/DDG-DA/workflow.py @@ -1,305 +1,40 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. from pathlib import Path -from qlib.model.meta.task import MetaTask -from qlib.contrib.meta.data_selection.model import MetaModelDS -from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS -from qlib.data.dataset.handler import DataHandlerLP +from typing import Union -import pandas as pd import fire -import sys -import pickle -from typing import Optional + from qlib import auto_init -from qlib.model.trainer import TrainerR -from qlib.typehint import Literal -from qlib.utils import init_instance_by_config -from qlib.workflow import R +from qlib.contrib.rolling.ddgda import DDGDA from qlib.tests.data import GetData DIRNAME = Path(__file__).absolute().resolve().parent -sys.path.append(str(DIRNAME.parent / "baseline")) -from rolling_benchmark import RollingBenchmark # NOTE: sys.path is changed for import RollingBenchmark - - -class DDGDA: - """ - please run `python workflow.py run_all` to run the full workflow of the experiment - - **NOTE** - before running the example, please clean your previous results with following command - - `rm -r mlruns` - """ - - def __init__( - self, - sim_task_model: Literal["linear", "gbdt"] = "gbdt", - forecast_model: Literal["linear", "gbdt"] = "linear", - h_path: Optional[str] = None, - test_end: Optional[str] = None, - train_start: Optional[str] = None, - meta_1st_train_end: Optional[str] = None, - task_ext_conf: Optional[dict] = None, - alpha: float = 0.01, - proxy_hd: str = "handler_proxy.pkl", - ): - """ - - Parameters - ---------- - - train_start: Optional[str] - the start datetime for data. It is used in training start time (for both tasks & meta learing) - test_end: Optional[str] - the end datetime for data. It is used in test end time - meta_1st_train_end: Optional[str] - the datetime of training end of the first meta_task - alpha: float - Setting the L2 regularization for ridge - The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently..) - """ - self.step = 20 - # NOTE: - # the horizon must match the meaning in the base task template - self.horizon = 20 - self.meta_exp_name = "DDG-DA" - self.sim_task_model = sim_task_model # The model to capture the distribution of data. 
- self.forecast_model = forecast_model # downstream forecasting models' type - self.rb_kwargs = { - "h_path": h_path, - "test_end": test_end, - "train_start": train_start, - "task_ext_conf": task_ext_conf, - } - self.alpha = alpha - self.meta_1st_train_end = meta_1st_train_end - self.proxy_hd = proxy_hd - - def get_feature_importance(self): - # this must be lightGBM, because it needs to get the feature importance - rb = RollingBenchmark(model_type="gbdt", **self.rb_kwargs) - task = rb.basic_task() - - with R.start(experiment_name="feature_importance"): - model = init_instance_by_config(task["model"]) - dataset = init_instance_by_config(task["dataset"]) - model.fit(dataset) - - fi = model.get_feature_importance() - - # Because the model use numpy instead of dataframe for training lightgbm - # So the we must use following extra steps to get the right feature importance - df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R) - cols = df.columns - fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()} - - return pd.Series(fi_named) - - def dump_data_for_proxy_model(self): - """ - Dump data for training meta model. - The meta model will be trained upon the proxy forecasting model. - This dataset is for the proxy forecasting model. - """ - topk = 30 - fi = self.get_feature_importance() - col_selected = fi.nlargest(topk) - - rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) - task = rb.basic_task() - dataset = init_instance_by_config(task["dataset"]) - prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) - - feature_df = prep_ds["feature"] - label_df = prep_ds["label"] - - feature_selected = feature_df.loc[:, col_selected.index] - - feature_selected = feature_selected.groupby("datetime", group_keys=False).apply( - lambda df: (df - df.mean()).div(df.std()) - ) - feature_selected = feature_selected.fillna(0.0) - - df_all = { - "label": label_df.reindex(feature_selected.index), - "feature": feature_selected, - } - df_all = pd.concat(df_all, axis=1) - df_all.to_pickle(DIRNAME / "fea_label_df.pkl") - - # dump data in handler format for aligning the interface - handler = DataHandlerLP( - data_loader={ - "class": "qlib.data.dataset.loader.StaticDataLoader", - "kwargs": {"config": DIRNAME / "fea_label_df.pkl"}, - } - ) - handler.to_pickle(DIRNAME / self.proxy_hd, dump_all=True) - - @property - def _internal_data_path(self): - return DIRNAME / f"internal_data_s{self.step}.pkl" - - def dump_meta_ipt(self): - """ - Dump data for training meta model. - This function will dump the input data for meta model - """ - # According to the experiments, the choice of the model type is very important for achieving good results - rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) - sim_task = rb.basic_task() - - if self.sim_task_model == "gbdt": - sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150}) - - exp_name_sim = f"data_sim_s{self.step}" - - internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim) - internal_data.setup(trainer=TrainerR) - - with self._internal_data_path.open("wb") as f: - pickle.dump(internal_data, f) - - def train_meta_model(self, fill_method="max"): - """ - training a meta model based on a simplified linear proxy model; - """ - - # 1) leverage the simplified proxy forecasting model to train meta model. 
- # - Only the dataset part is important, in current version of meta model will integrate the - rb = RollingBenchmark(model_type=self.sim_task_model, **self.rb_kwargs) - sim_task = rb.basic_task() - # the train_start for training meta model does not necessarily align with final rolling - train_start = "2008-01-01" if self.rb_kwargs.get("train_start") is None else self.rb_kwargs.get("train_start") - train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end - test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d") - proxy_forecast_model_task = { - # "model": "qlib.contrib.model.linear.LinearModel", - "dataset": { - "class": "qlib.data.dataset.DatasetH", - "kwargs": { - "handler": f"file://{(DIRNAME / self.proxy_hd).absolute()}", - "segments": { - "train": (train_start, train_end), - "test": (test_start, sim_task["dataset"]["kwargs"]["segments"]["test"][1]), - }, - }, - }, - # "record": ["qlib.workflow.record_temp.SignalRecord"] - } - # the proxy_forecast_model_task will be used to create meta tasks. - # The test date of first task will be 2011-01-01. Each test segment will be about 20days - # The tasks include all training tasks and test tasks. - - # 2) preparing meta dataset - kwargs = dict( - task_tpl=proxy_forecast_model_task, - step=self.step, - segments=0.62, # keep test period consistent with the dataset yaml - trunc_days=1 + self.horizon, - hist_step_n=30, - fill_method=fill_method, - rolling_ext_days=0, - ) - # NOTE: - # the input of meta model (internal data) are shared between proxy model and final forecasting model - # but their task test segment are not aligned! It worked in my previous experiment. - # So the misalignment will not affect the effectiveness of the method. - with self._internal_data_path.open("rb") as f: - internal_data = pickle.load(f) - - md = MetaDatasetDS(exp_name=internal_data, **kwargs) - - # 3) train and logging meta model - with R.start(experiment_name=self.meta_exp_name): - R.log_params(**kwargs) - mm = MetaModelDS( - step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha - ) - mm.fit(md) - R.save_objects(model=mm) - - @property - def _task_path(self): - return DIRNAME / f"tasks_s{self.step}.pkl" - - def meta_inference(self): - """ - Leverage meta-model for inference: - - Given - - baseline tasks - - input for meta model(internal data) - - meta model (its learnt knowledge on proxy forecasting model is expected to transfer to normal forecasting model) - """ - # 1) get meta model - exp = R.get_exp(experiment_name=self.meta_exp_name) - rec = exp.list_recorders(rtype=exp.RT_L)[0] - meta_model: MetaModelDS = rec.load_object("model") - - # 2) - # we are transfer to knowledge of meta model to final forecasting tasks. 
- # Create MetaTaskDataset for the final forecasting tasks - # Aligning the setting of it to the MetaTaskDataset when training Meta model is necessary - - # 2.1) get previous config - param = rec.list_params() - trunc_days = int(param["trunc_days"]) - step = int(param["step"]) - hist_step_n = int(param["hist_step_n"]) - fill_method = param.get("fill_method", "max") - - rb = RollingBenchmark(model_type=self.forecast_model, **self.rb_kwargs) - task_l = rb.create_rolling_tasks() +BENCH_DIR = DIRNAME.parent / "baseline" - # 2.2) create meta dataset for final dataset - kwargs = dict( - task_tpl=task_l, - step=step, - segments=0.0, # all the tasks are for testing - trunc_days=trunc_days, - hist_step_n=hist_step_n, - fill_method=fill_method, - task_mode=MetaTask.PROC_MODE_TRANSFER, - ) - with self._internal_data_path.open("rb") as f: - internal_data = pickle.load(f) - mds = MetaDatasetDS(exp_name=internal_data, **kwargs) +class DDGDABench(DDGDA): + # The config in the README.md + CONF_LIST = [ + BENCH_DIR / "workflow_config_linear_Alpha158.yaml", + BENCH_DIR / "workflow_config_lightgbm_Alpha158.yaml", + ] - # 3) meta model make inference and get new qlib task - new_tasks = meta_model.inference(mds) - with self._task_path.open("wb") as f: - pickle.dump(new_tasks, f) + DEFAULT_CONF = CONF_LIST[0] # Linear by default due to efficiency - def train_and_eval_tasks(self): - """ - Training the tasks generated by meta model - Then evaluate it - """ - with self._task_path.open("rb") as f: - tasks = pickle.load(f) - rb = RollingBenchmark(rolling_exp="rolling_ds", model_type=self.forecast_model, **self.rb_kwargs) - rb.train_rolling_tasks(tasks) - rb.ens_rolling() - rb.update_rolling_rec() + def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwargs) -> None: + # This code is for being compatible with the previous old code + conf_path = Path(conf_path) + super().__init__(conf_path=conf_path, horizon=horizon, working_dir=DIRNAME, **kwargs) - def run_all(self): - # 1) file: handler_proxy.pkl (self.proxy_hd) - self.dump_data_for_proxy_model() - # 2) - # file: internal_data_s20.pkl - # mlflow: data_sim_s20, models for calculating meta_ipt - self.dump_meta_ipt() - # 3) meta model will be stored in `DDG-DA` - self.train_meta_model() - # 4) new_tasks are saved in "tasks_s20.pkl" (reweighter is added) - self.meta_inference() - # 5) load the saved tasks and train model - self.train_and_eval_tasks() + for f in self.CONF_LIST: + if conf_path.samefile(f): + break + else: + self.logger.warning("Model type is not in the benchmark!") if __name__ == "__main__": GetData().qlib_data(exists_skip=True) auto_init() - fire.Fire(DDGDA) + fire.Fire(DDGDABench) diff --git a/examples/benchmarks_dynamic/baseline/README.md b/examples/benchmarks_dynamic/baseline/README.md index 17e10482db..f176514127 100644 --- a/examples/benchmarks_dynamic/baseline/README.md +++ b/examples/benchmarks_dynamic/baseline/README.md @@ -5,11 +5,12 @@ This is the framework of periodically Rolling Retrain (RR) forecasting models. R ## Run the Code Users can try RR by running the following command: ```bash - python rolling_benchmark.py run_all + python rolling_benchmark.py run ``` The default forecasting models are `Linear`. Users can choose other forecasting models by changing the `model_type` parameter. 
For example, users can try `LightGBM` forecasting models by running the following command: ```bash - python rolling_benchmark.py --model_type="gbdt" run_all -``` \ No newline at end of file + python rolling_benchmark.py --conf_path=workflow_config_lightgbm_Alpha158.yaml run + +``` diff --git a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py index b0c7aea4fa..1ce30ef8a7 100644
--- a/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
+++ b/examples/benchmarks_dynamic/baseline/rolling_benchmark.py
@@ -1,161 +1,33 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-from typing import Optional
-from qlib.model.ens.ensemble import RollingEnsemble
-from qlib.utils import init_instance_by_config
+from pathlib import Path
+from typing import Union
+
 import fire
-import yaml
-import pandas as pd
+
 from qlib import auto_init
-from pathlib import Path
-from tqdm.auto import tqdm
-from qlib.model.trainer import TrainerR
-from qlib.log import get_module_logger
-from qlib.utils.data import update_config
-from qlib.workflow import R
+from qlib.contrib.rolling.base import Rolling
 from qlib.tests.data import GetData

 DIRNAME = Path(__file__).absolute().resolve().parent
-from qlib.workflow.task.gen import task_generator, RollingGen
-from qlib.workflow.task.collect import RecorderCollector
-from qlib.workflow.record_temp import PortAnaRecord, SigAnaRecord


-class RollingBenchmark:
-    """
-    **NOTE**
-    before running the example, please clean your previous results with following command
-    - `rm -r mlruns`
+class RollingBenchmark(Rolling):
+    # The configs documented in the README.md
+    CONF_LIST = [DIRNAME / "workflow_config_linear_Alpha158.yaml", DIRNAME / "workflow_config_lightgbm_Alpha158.yaml"]

-    """
+    DEFAULT_CONF = CONF_LIST[0]

-    def __init__(
-        self,
-        rolling_exp: str = "rolling_models",
-        model_type: str = "linear",
-        h_path: Optional[str] = None,
-        train_start: Optional[str] = None,
-        test_end: Optional[str] = None,
-        task_ext_conf: Optional[dict] = None,
-    ) -> None:
-        """
-        Parameters
-        ----------
-        rolling_exp : str
-            The name for the experiments for rolling
-        model_type : str
-            The model to be boosted.
-        h_path : Optional[str]
-            the dumped data handler;
-        test_end : Optional[str]
-            the test end for the data. It is typically used together with the handler
-        train_start : Optional[str]
-            the train start for the data. It is typically used together with the handler.
- task_ext_conf : Optional[dict] - some option to update the - """ - self.step = 20 - self.horizon = 20 - self.rolling_exp = rolling_exp - self.model_type = model_type - self.h_path = h_path - self.train_start = train_start - self.test_end = test_end - self.logger = get_module_logger("RollingBenchmark") - self.task_ext_conf = task_ext_conf + def __init__(self, conf_path: Union[str, Path] = DEFAULT_CONF, horizon=20, **kwargs) -> None: + # This code is for being compatible with the previous old code + conf_path = Path(conf_path) + super().__init__(conf_path=conf_path, horizon=horizon, **kwargs) - def basic_task(self): - """For fast training rolling""" - if self.model_type == "gbdt": - conf_path = DIRNAME / "workflow_config_lightgbm_Alpha158.yaml" - # dump the processed data on to disk for later loading to speed up the processing - h_path = DIRNAME / "lightgbm_alpha158_handler_horizon{}.pkl".format(self.horizon) - elif self.model_type == "linear": - # We use ridge regression to stabilize the performance - conf_path = DIRNAME / "workflow_config_linear_Alpha158.yaml" - h_path = DIRNAME / "linear_alpha158_handler_horizon{}.pkl".format(self.horizon) + for f in self.CONF_LIST: + if conf_path.samefile(f): + break else: - raise AssertionError("Model type is not supported!") - - if self.h_path is not None: - h_path = Path(self.h_path) - - with conf_path.open("r") as f: - conf = yaml.safe_load(f) - - # modify dataset horizon - conf["task"]["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [ - "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1) - ] - - task = conf["task"] - - if self.task_ext_conf is not None: - task = update_config(task, self.task_ext_conf) - - if not h_path.exists(): - h_conf = task["dataset"]["kwargs"]["handler"] - h = init_instance_by_config(h_conf) - h.to_pickle(h_path, dump_all=True) - - task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" - task["record"] = ["qlib.workflow.record_temp.SignalRecord"] - - if self.train_start is not None: - seg = task["dataset"]["kwargs"]["segments"]["train"] - task["dataset"]["kwargs"]["segments"]["train"] = pd.Timestamp(self.train_start), seg[1] - - if self.test_end is not None: - seg = task["dataset"]["kwargs"]["segments"]["test"] - task["dataset"]["kwargs"]["segments"]["test"] = seg[0], pd.Timestamp(self.test_end) - self.logger.info(task) - return task - - def create_rolling_tasks(self): - task = self.basic_task() - task_l = task_generator( - task, RollingGen(step=self.step, trunc_days=self.horizon + 1) - ) # the last two days should be truncated to avoid information leakage - return task_l - - def train_rolling_tasks(self, task_l=None): - if task_l is None: - task_l = self.create_rolling_tasks() - trainer = TrainerR(experiment_name=self.rolling_exp) - trainer(task_l) - - COMB_EXP = "rolling" - - def ens_rolling(self): - rc = RecorderCollector( - experiment=self.rolling_exp, - artifacts_key=["pred", "label"], - process_list=[RollingEnsemble()], - # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]), - artifacts_path={"pred": "pred.pkl", "label": "label.pkl"}, - ) - res = rc() - with R.start(experiment_name=self.COMB_EXP): - R.log_params(exp_name=self.rolling_exp) - R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]}) - - def update_rolling_rec(self): - """ - Evaluate the combined rolling results - """ - for _, rec in R.list_recorders(experiment_name=self.COMB_EXP).items(): - for rt_cls in SigAnaRecord, PortAnaRecord: - rt = rt_cls(recorder=rec, skip_existing=True) - rt.generate() - print(f"Your 
evaluation results can be found in the experiment named `{self.COMB_EXP}`.") - - def run_all(self): - # the results will be save in mlruns. - # 1) each rolling task is saved in rolling_models - self.train_rolling_tasks() - # 2) combined rolling tasks and evaluation results are saved in rolling - self.ens_rolling() - self.update_rolling_rec() + self.logger.warning("Model type is not in the benchmark!") if __name__ == "__main__": diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml index 2d441dea92..5ae3168015 100644 --- a/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml +++ b/examples/benchmarks_dynamic/baseline/workflow_config_lightgbm_Alpha158.yaml @@ -14,8 +14,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - model: - dataset: + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml index 78ec4e6129..a5c272f288 100644 --- a/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml +++ b/examples/benchmarks_dynamic/baseline/workflow_config_linear_Alpha158.yaml @@ -27,9 +27,7 @@ port_analysis_config: &port_analysis_config class: TopkDropoutStrategy module_path: qlib.contrib.strategy kwargs: - signal: - - - - + signal: topk: 50 n_drop: 5 backtest: diff --git a/examples/highfreq/highfreq_handler.py b/examples/highfreq/highfreq_handler.py index c15c3ec41e..7df564b7b9 100644 --- a/examples/highfreq/highfreq_handler.py +++ b/examples/highfreq/highfreq_handler.py @@ -14,7 +14,6 @@ def __init__( fit_end_time=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) diff --git a/examples/highfreq/workflow.py b/examples/highfreq/workflow.py index c631d72e7d..02948c5a12 100644 --- a/examples/highfreq/workflow.py +++ b/examples/highfreq/workflow.py @@ -18,7 +18,6 @@ class HighfreqWorkflow: - SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull, Cut], "expression_cache": None} MARKET = "all" diff --git a/examples/hyperparameter/LightGBM/hyperparameter_158.py b/examples/hyperparameter/LightGBM/hyperparameter_158.py index 8c3e9f3e8d..7520390a68 100644 --- a/examples/hyperparameter/LightGBM/hyperparameter_158.py +++ b/examples/hyperparameter/LightGBM/hyperparameter_158.py @@ -35,7 +35,6 @@ def objective(trial): if __name__ == "__main__": - provider_uri = "~/.qlib/qlib_data/cn_data" GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region="cn") diff --git a/examples/hyperparameter/LightGBM/hyperparameter_360.py b/examples/hyperparameter/LightGBM/hyperparameter_360.py index 322c0fa42b..7ba28c78fe 100644 --- a/examples/hyperparameter/LightGBM/hyperparameter_360.py +++ b/examples/hyperparameter/LightGBM/hyperparameter_360.py @@ -38,7 +38,6 @@ def objective(trial): if __name__ == "__main__": - provider_uri = "~/.qlib/qlib_data/cn_data" GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) qlib.init(provider_uri=provider_uri, region=REG_CN) diff --git a/examples/model_interpreter/feature.py b/examples/model_interpreter/feature.py index bfc58fc845..8ad673d0e2 100644 --- 
a/examples/model_interpreter/feature.py +++ b/examples/model_interpreter/feature.py @@ -11,7 +11,6 @@ if __name__ == "__main__": - # use default data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) diff --git a/examples/portfolio/prepare_riskdata.py b/examples/portfolio/prepare_riskdata.py index 3168e2f379..e502a1ff78 100644 --- a/examples/portfolio/prepare_riskdata.py +++ b/examples/portfolio/prepare_riskdata.py @@ -9,7 +9,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"): - universe = D.features(D.instruments("csi300"), ["$close"], start_time=start_time).swaplevel().sort_index() price_all = ( @@ -20,7 +19,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"): riskmodel = StructuredCovEstimator() for i in range(T - 1, len(price_all)): - date = price_all.index[i] ref_date = price_all.index[i - T + 1] @@ -47,7 +45,6 @@ def prepare_data(riskdata_root="./riskdata", T=240, start_time="2016-01-01"): if __name__ == "__main__": - import qlib qlib.init(provider_uri="~/.qlib/qlib_data/cn_data") diff --git a/examples/rolling_process_data/workflow.py b/examples/rolling_process_data/workflow.py index 434d365e52..d1c03866a4 100644 --- a/examples/rolling_process_data/workflow.py +++ b/examples/rolling_process_data/workflow.py @@ -13,7 +13,6 @@ class RollingDataWorkflow: - MARKET = "csi300" start_time = "2010-01-01" end_time = "2019-12-31" @@ -93,7 +92,6 @@ def rolling_process(self): dataset = init_instance_by_config(dataset_config) for rolling_offset in range(self.rolling_cnt): - print(f"===========rolling{rolling_offset} start===========") if rolling_offset: dataset.config( diff --git a/examples/workflow_by_code.py b/examples/workflow_by_code.py index 0c4d73a510..94de5c082f 100644 --- a/examples/workflow_by_code.py +++ b/examples/workflow_by_code.py @@ -17,7 +17,6 @@ if __name__ == "__main__": - # use default data provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir GetData().qlib_data(target_dir=provider_uri, region=REG_CN, exists_skip=True) diff --git a/qlib/__init__.py b/qlib/__init__.py index a963a8c285..3355ac04f8 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -77,7 +77,6 @@ def init(default_conf="client", **kwargs): def _mount_nfs_uri(provider_uri, mount_path, auto_mount: bool = False): - LOG = get_module_logger("mount nfs", level=logging.INFO) if mount_path is None: raise ValueError(f"Invalid mount path: {mount_path}!") diff --git a/qlib/backtest/__init__.py b/qlib/backtest/__init__.py index bb8ca731bc..d784aed57e 100644 --- a/qlib/backtest/__init__.py +++ b/qlib/backtest/__init__.py @@ -182,7 +182,6 @@ def get_strategy_executor( exchange_kwargs: dict = {}, pos_type: str = "Position", ) -> Tuple[BaseStrategy, BaseExecutor]: - # NOTE: # - for avoiding recursive import # - typing annotations is not reliable diff --git a/qlib/backtest/exchange.py b/qlib/backtest/exchange.py index a752a9f8cb..1ab0d07a75 100644 --- a/qlib/backtest/exchange.py +++ b/qlib/backtest/exchange.py @@ -638,7 +638,6 @@ def generate_order_for_target_amount_position( random.seed(0) random.shuffle(sorted_ids) for stock_id in sorted_ids: - # Do not generate order for the non-tradable stocks if not self.is_stock_tradable(stock_id=stock_id, start_time=start_time, end_time=end_time): continue diff --git a/qlib/config.py b/qlib/config.py index 7b726c6581..7910dab736 100644 --- a/qlib/config.py +++ b/qlib/config.py @@ -293,7 +293,6 @@ class DataPathManager: """ def __init__(self, 
provider_uri: Union[str, Path, dict], mount_path: Union[str, Path, dict]): - """ The relation of `provider_uri` and `mount_path` - `mount_path` is used only if provider_uri is an NFS path diff --git a/qlib/contrib/data/dataset.py b/qlib/contrib/data/dataset.py index 9ce522cc06..8b40dba1fc 100644 --- a/qlib/contrib/data/dataset.py +++ b/qlib/contrib/data/dataset.py @@ -130,7 +130,6 @@ def __init__( input_size=None, **kwargs, ): - assert num_states == 0 or horizon > 0, "please specify `horizon` to avoid data leakage" assert memory_mode in ["sample", "daily"], "unsupported memory mode" assert memory_mode == "sample" or batch_size < 0, "daily memory requires daily sampling (`batch_size < 0`)" @@ -153,7 +152,6 @@ def __init__( super().__init__(handler, segments, **kwargs) def setup_data(self, handler_kwargs: dict = None, **kwargs): - super().setup_data(**kwargs) if handler_kwargs is not None: @@ -288,7 +286,6 @@ def __iter__(self): daily_count = [] # store number of samples for each day for j in indices[i : i + batch_size]: - # normal sampling: self.batch_size > 0 => slices is a list => slices_subset is a slice # daily sampling: self.batch_size < 0 => slices is a nested list => slices_subset is a list slices_subset = slices[j] @@ -297,7 +294,6 @@ def __iter__(self): # each slices_subset contains a list of slices for multiple stocks # NOTE: daily sampling is used in 1) eval mode, 2) train mode with self.batch_size < 0 if self.batch_size < 0: - # store daily index idx = self._daily_index.index[j] # daily_index.index is the index of the original data daily_index.append(idx) @@ -320,7 +316,6 @@ def __iter__(self): slices_subset = [slices_subset] for slc in slices_subset: - # legacy support for Alpha360 data by `input_size` if self.input_size: data.append(self._data[slc.stop - 1].reshape(self.input_size, -1).T) diff --git a/qlib/contrib/data/highfreq_handler.py b/qlib/contrib/data/highfreq_handler.py index 638fbf0e80..8eed4814f2 100644 --- a/qlib/contrib/data/highfreq_handler.py +++ b/qlib/contrib/data/highfreq_handler.py @@ -17,7 +17,6 @@ def __init__( fit_end_time=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) @@ -318,7 +317,6 @@ def __init__( inst_processors=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) diff --git a/qlib/contrib/data/highfreq_processor.py b/qlib/contrib/data/highfreq_processor.py index f7041e9f4c..db2a6e39b4 100644 --- a/qlib/contrib/data/highfreq_processor.py +++ b/qlib/contrib/data/highfreq_processor.py @@ -29,7 +29,6 @@ def __init__( feature_save_dir: str, norm_groups: Dict[str, int], ): - self.fit_start_time = fit_start_time self.fit_end_time = fit_end_time self.feature_save_dir = feature_save_dir diff --git a/qlib/contrib/meta/data_selection/dataset.py b/qlib/contrib/meta/data_selection/dataset.py index e3689d964f..9349a12fe5 100644 --- a/qlib/contrib/meta/data_selection/dataset.py +++ b/qlib/contrib/meta/data_selection/dataset.py @@ -49,6 +49,8 @@ def setup(self, trainer=TrainerR, trainer_kwargs={}): # 1) prepare the prediction of proxy models perf_task_tpl = deepcopy(self.task_tpl) # this task is supposed to contains no complicated objects + # The only thing we want to save is the prediction + perf_task_tpl["record"] = ["qlib.workflow.record_temp.SignalRecord"] 
trainer = auto_filter_kwargs(trainer)(experiment_name=self.exp_name, **trainer_kwargs) # NOTE: diff --git a/qlib/contrib/model/pytorch_adarnn.py b/qlib/contrib/model/pytorch_adarnn.py index 4b0db7f4b7..ca5e8ba865 100644 --- a/qlib/contrib/model/pytorch_adarnn.py +++ b/qlib/contrib/model/pytorch_adarnn.py @@ -246,7 +246,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], @@ -318,7 +317,6 @@ def infer(self, x_test): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_alstm.py b/qlib/contrib/model/pytorch_alstm.py index b0770e2bdd..2fe7cce3b0 100644 --- a/qlib/contrib/model/pytorch_alstm.py +++ b/qlib/contrib/model/pytorch_alstm.py @@ -146,7 +146,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -155,7 +154,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -165,7 +163,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -181,7 +178,6 @@ def train_epoch(self, x_train, y_train): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -194,7 +190,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -217,7 +212,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -282,7 +276,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_alstm_ts.py b/qlib/contrib/model/pytorch_alstm_ts.py index 3ab8ed8ab5..008d789402 100644 --- a/qlib/contrib/model/pytorch_alstm_ts.py +++ b/qlib/contrib/model/pytorch_alstm_ts.py @@ -156,7 +156,6 @@ def loss_fn(self, pred, label, weight=None): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -165,10 +164,9 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.ALSTM_model.train() - for (data, weight) in data_loader: + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -181,14 +179,12 @@ def train_epoch(self, data_loader): self.train_optimizer.step() def test_epoch(self, data_loader): - self.ALSTM_model.eval() scores = [] losses = [] - for (data, weight) in data_loader: - + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) @@ -295,7 +291,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for data in test_loader: - feature = data[:, :, 
0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_gats.py b/qlib/contrib/model/pytorch_gats.py index 1274088773..63ebd480a4 100644 --- a/qlib/contrib/model/pytorch_gats.py +++ b/qlib/contrib/model/pytorch_gats.py @@ -154,7 +154,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -175,7 +174,6 @@ def get_daily_inter(self, df, shuffle=False): return daily_index, daily_count def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) self.GAT_model.train() @@ -197,7 +195,6 @@ def train_epoch(self, x_train, y_train): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -230,7 +227,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], diff --git a/qlib/contrib/model/pytorch_gats_ts.py b/qlib/contrib/model/pytorch_gats_ts.py index 1b75efe890..b1239f78e1 100644 --- a/qlib/contrib/model/pytorch_gats_ts.py +++ b/qlib/contrib/model/pytorch_gats_ts.py @@ -32,7 +32,6 @@ def __init__(self, data_source): self.daily_index[0] = 0 def __iter__(self): - for idx, count in zip(self.daily_index, self.daily_count): yield np.arange(idx, idx + count) @@ -173,7 +172,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -194,11 +192,9 @@ def get_daily_inter(self, df, shuffle=False): return daily_index, daily_count def train_epoch(self, data_loader): - self.GAT_model.train() for data in data_loader: - data = data.squeeze() feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -212,14 +208,12 @@ def train_epoch(self, data_loader): self.train_optimizer.step() def test_epoch(self, data_loader): - self.GAT_model.eval() scores = [] losses = [] for data in data_loader: - data = data.squeeze() feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 @@ -240,7 +234,6 @@ def fit( evals_result=dict(), save_path=None, ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) if dl_train.empty or dl_valid.empty: @@ -329,7 +322,6 @@ def predict(self, dataset): preds = [] for data in test_loader: - data = data.squeeze() feature = data[:, :, 0:-1].to(self.device) diff --git a/qlib/contrib/model/pytorch_gru.py b/qlib/contrib/model/pytorch_gru.py index 10998236bb..2a476a657d 100755 --- a/qlib/contrib/model/pytorch_gru.py +++ b/qlib/contrib/model/pytorch_gru.py @@ -146,7 +146,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -155,7 +154,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -165,7 +163,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -181,7 +178,6 @@ def 
train_epoch(self, x_train, y_train): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -194,7 +190,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -217,7 +212,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -282,7 +276,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_gru_ts.py b/qlib/contrib/model/pytorch_gru_ts.py index b588392a21..2e5076ea67 100755 --- a/qlib/contrib/model/pytorch_gru_ts.py +++ b/qlib/contrib/model/pytorch_gru_ts.py @@ -154,7 +154,6 @@ def loss_fn(self, pred, label, weight=None): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -163,10 +162,9 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.GRU_model.train() - for (data, weight) in data_loader: + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -179,14 +177,12 @@ def train_epoch(self, data_loader): self.train_optimizer.step() def test_epoch(self, data_loader): - self.GRU_model.eval() scores = [] losses = [] - for (data, weight) in data_loader: - + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) @@ -293,7 +289,6 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_hist.py b/qlib/contrib/model/pytorch_hist.py index f7b565dc54..5c3cd66a31 100644 --- a/qlib/contrib/model/pytorch_hist.py +++ b/qlib/contrib/model/pytorch_hist.py @@ -160,7 +160,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric == "ic": @@ -189,7 +188,6 @@ def get_daily_inter(self, df, shuffle=False): return daily_index, daily_count def train_epoch(self, x_train, y_train, stock_index): - stock2concept_matrix = np.load(self.stock2concept) x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -214,7 +212,6 @@ def train_epoch(self, x_train, y_train, stock_index): self.train_optimizer.step() def test_epoch(self, data_x, data_y, stock_index): - # prepare training data stock2concept_matrix = np.load(self.stock2concept) x_values = data_x.values diff --git a/qlib/contrib/model/pytorch_igmtf.py b/qlib/contrib/model/pytorch_igmtf.py index d38ef9ad48..46a25c00f4 100644 --- a/qlib/contrib/model/pytorch_igmtf.py +++ b/qlib/contrib/model/pytorch_igmtf.py @@ -153,7 +153,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric == "ic": @@ -201,7 +200,6 @@ def get_train_hidden(self, x_train): return train_hidden, train_hidden_day def train_epoch(self, x_train, y_train, train_hidden, train_hidden_day): - 
x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -222,7 +220,6 @@ def train_epoch(self, x_train, y_train, train_hidden, train_hidden_day): self.train_optimizer.step() def test_epoch(self, data_x, data_y, train_hidden, train_hidden_day): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -254,7 +251,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], diff --git a/qlib/contrib/model/pytorch_localformer.py b/qlib/contrib/model/pytorch_localformer.py index 6e7d911803..830bc59f03 100644 --- a/qlib/contrib/model/pytorch_localformer.py +++ b/qlib/contrib/model/pytorch_localformer.py @@ -46,7 +46,6 @@ def __init__( seed=None, **kwargs ): - # set hyper-parameters. self.d_model = d_model self.dropout = dropout @@ -96,7 +95,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -105,7 +103,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -115,7 +112,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -131,7 +127,6 @@ def train_epoch(self, x_train, y_train): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -144,7 +139,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -167,7 +161,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -232,7 +225,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_localformer_ts.py b/qlib/contrib/model/pytorch_localformer_ts.py index 18ef7f1122..b05c2d311a 100644 --- a/qlib/contrib/model/pytorch_localformer_ts.py +++ b/qlib/contrib/model/pytorch_localformer_ts.py @@ -44,7 +44,6 @@ def __init__( seed=None, **kwargs ): - # set hyper-parameters. 
self.d_model = d_model self.dropout = dropout @@ -96,7 +95,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -105,7 +103,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.model.train() for data in data_loader: @@ -121,14 +118,12 @@ def train_epoch(self, data_loader): self.train_optimizer.step() def test_epoch(self, data_loader): - self.model.eval() scores = [] losses = [] for data in data_loader: - feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -148,7 +143,6 @@ def fit( evals_result=dict(), save_path=None, ): - dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L) if dl_train.empty or dl_valid.empty: diff --git a/qlib/contrib/model/pytorch_lstm.py b/qlib/contrib/model/pytorch_lstm.py index a68cf5eacb..168be6ca56 100755 --- a/qlib/contrib/model/pytorch_lstm.py +++ b/qlib/contrib/model/pytorch_lstm.py @@ -142,7 +142,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -151,7 +150,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -161,7 +159,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -177,7 +174,6 @@ def train_epoch(self, x_train, y_train): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -190,7 +186,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -212,7 +207,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], diff --git a/qlib/contrib/model/pytorch_lstm_ts.py b/qlib/contrib/model/pytorch_lstm_ts.py index f1a3c55e87..8ecafc2d5d 100755 --- a/qlib/contrib/model/pytorch_lstm_ts.py +++ b/qlib/contrib/model/pytorch_lstm_ts.py @@ -150,7 +150,6 @@ def loss_fn(self, pred, label, weight): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -159,10 +158,9 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.LSTM_model.train() - for (data, weight) in data_loader: + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) label = data[:, -1, -1].to(self.device) @@ -175,14 +173,12 @@ def train_epoch(self, data_loader): self.train_optimizer.step() def test_epoch(self, data_loader): - self.LSTM_model.eval() scores = [] losses = [] - for (data, weight) in data_loader: - + for data, weight in data_loader: feature = data[:, :, 0:-1].to(self.device) # feature[torch.isnan(feature)] = 0 label = data[:, -1, -1].to(self.device) @@ -288,7 +284,6 @@ def predict(self, 
dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_sfm.py b/qlib/contrib/model/pytorch_sfm.py index 29bae94a38..e79f475d69 100644 --- a/qlib/contrib/model/pytorch_sfm.py +++ b/qlib/contrib/model/pytorch_sfm.py @@ -306,7 +306,6 @@ def use_gpu(self): return self.device != torch.device("cpu") def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -319,7 +318,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -336,7 +334,6 @@ def test_epoch(self, data_x, data_y): return np.mean(losses), np.mean(scores) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -346,7 +343,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -367,7 +363,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid = dataset.prepare( ["train", "valid"], col_set=["feature", "label"], @@ -431,7 +426,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): diff --git a/qlib/contrib/model/pytorch_tabnet.py b/qlib/contrib/model/pytorch_tabnet.py index adc7354fe0..3c698edade 100644 --- a/qlib/contrib/model/pytorch_tabnet.py +++ b/qlib/contrib/model/pytorch_tabnet.py @@ -256,7 +256,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break feature = x_values[indices[i : i + self.batch_size]].float().to(self.device) @@ -283,7 +282,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -308,7 +306,6 @@ def pretrain_epoch(self, x_train): self.tabnet_decoder.train() for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -339,7 +336,6 @@ def pretrain_test_epoch(self, x_train): losses = [] for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break diff --git a/qlib/contrib/model/pytorch_tcn.py b/qlib/contrib/model/pytorch_tcn.py index 2af7a04ea0..38e289342d 100755 --- a/qlib/contrib/model/pytorch_tcn.py +++ b/qlib/contrib/model/pytorch_tcn.py @@ -154,7 +154,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -163,7 +162,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -173,7 +171,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -200,7 +197,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -223,7 +219,6 @@ def fit( evals_result=dict(), save_path=None, ): - 
df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -286,7 +281,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_tcn_ts.py b/qlib/contrib/model/pytorch_tcn_ts.py index bb2e5ea5bd..605da62c49 100755 --- a/qlib/contrib/model/pytorch_tcn_ts.py +++ b/qlib/contrib/model/pytorch_tcn_ts.py @@ -155,7 +155,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -164,7 +163,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, data_loader): - self.TCN_model.train() for data in data_loader: @@ -181,7 +179,6 @@ def train_epoch(self, data_loader): self.train_optimizer.step() def test_epoch(self, data_loader): - self.TCN_model.eval() scores = [] @@ -277,7 +274,6 @@ def predict(self, dataset): preds = [] for data in test_loader: - feature = data[:, :, 0:-1].to(self.device) with torch.no_grad(): diff --git a/qlib/contrib/model/pytorch_tcts.py b/qlib/contrib/model/pytorch_tcts.py index b46835cb65..651bd03d23 100644 --- a/qlib/contrib/model/pytorch_tcts.py +++ b/qlib/contrib/model/pytorch_tcts.py @@ -119,7 +119,6 @@ def __init__( ) def loss_fn(self, pred, label, weight): - if self.mode == "hard": loc = torch.argmax(weight, 1) loss = (pred - label[np.arange(weight.shape[0]), loc]) ** 2 @@ -157,7 +156,6 @@ def train_epoch(self, x_train, y_train, x_valid, y_valid): for i in range(self.steps): for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -191,7 +189,6 @@ def train_epoch(self, x_train, y_train, x_valid, y_valid): # fix forecasting model and valid weight model for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -212,7 +209,6 @@ def train_epoch(self, x_train, y_train, x_valid, y_valid): self.weight_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -224,7 +220,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -282,7 +277,6 @@ def training( verbose=True, save_path=None, ): - self.fore_model = GRUModel( d_feat=self.d_feat, hidden_size=self.hidden_size, @@ -366,7 +360,6 @@ def predict(self, dataset): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_tra.py b/qlib/contrib/model/pytorch_tra.py index 46d362c689..964febf11c 100644 --- a/qlib/contrib/model/pytorch_tra.py +++ b/qlib/contrib/model/pytorch_tra.py @@ -84,7 +84,6 @@ def __init__( transport_method="none", memory_mode="sample", ): - self.logger = get_module_logger("TRA") assert memory_mode in ["sample", "daily"], "invalid memory mode" @@ -136,7 +135,6 @@ def __init__( self._init_model() def _init_model(self): - self.logger.info("init TRAModel...") self.model = eval(self.model_type)(**self.model_config).to(device) @@ -176,7 +174,6 @@ def _init_model(self): self.global_step = -1 def train_epoch(self, epoch, data_set, is_pretrain=False): - self.model.train() self.tra.train() 
data_set.train() @@ -274,7 +271,6 @@ def train_epoch(self, epoch, data_set, is_pretrain=False): return total_loss def test_epoch(self, epoch, data_set, return_pred=False, prefix="test", is_pretrain=False): - self.model.eval() self.tra.eval() data_set.eval() @@ -360,7 +356,6 @@ def test_epoch(self, epoch, data_set, return_pred=False, prefix="test", is_pretr return metrics, preds, probs, P_all def _fit(self, train_set, valid_set, test_set, evals_result, is_pretrain=True): - best_score = -1 best_epoch = 0 stop_rounds = 0 @@ -419,7 +414,6 @@ def _fit(self, train_set, valid_set, test_set, evals_result, is_pretrain=True): return best_score def fit(self, dataset, evals_result=dict()): - assert isinstance(dataset, MTSDatasetH), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`" train_set, valid_set, test_set = dataset.prepare(["train", "valid", "test"]) @@ -503,7 +497,6 @@ def fit(self, dataset, evals_result=dict()): json.dump(info, f) def predict(self, dataset, segment="test"): - assert isinstance(dataset, MTSDatasetH), "TRAModel only supports `qlib.contrib.data.dataset.MTSDatasetH`" if not self.fitted: @@ -571,7 +564,6 @@ def __init__( self.output_size = hidden_size def forward(self, x): - if self.input_proj is not None: x = self.input_proj(x) @@ -647,7 +639,6 @@ def __init__( self.output_size = hidden_size def forward(self, x): - x = x.permute(1, 0, 2).contiguous() # the first dim need to be time x = self.pe(x) @@ -713,7 +704,6 @@ def reset_parameters(self): child.reset_parameters() def forward(self, hidden, hist_loss): - preds = self.predictors(hidden) if self.num_states == 1: # no need for router when having only one prediction diff --git a/qlib/contrib/model/pytorch_transformer.py b/qlib/contrib/model/pytorch_transformer.py index 66e5b2c4e9..f4b7a06eb6 100644 --- a/qlib/contrib/model/pytorch_transformer.py +++ b/qlib/contrib/model/pytorch_transformer.py @@ -45,7 +45,6 @@ def __init__( seed=None, **kwargs ): - # set hyper-parameters. 
self.d_model = d_model self.dropout = dropout @@ -95,7 +94,6 @@ def loss_fn(self, pred, label): raise ValueError("unknown loss `%s`" % self.loss) def metric_fn(self, pred, label): - mask = torch.isfinite(label) if self.metric in ("", "loss"): @@ -104,7 +102,6 @@ def metric_fn(self, pred, label): raise ValueError("unknown metric `%s`" % self.metric) def train_epoch(self, x_train, y_train): - x_train_values = x_train.values y_train_values = np.squeeze(y_train.values) @@ -114,7 +111,6 @@ def train_epoch(self, x_train, y_train): np.random.shuffle(indices) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -130,7 +126,6 @@ def train_epoch(self, x_train, y_train): self.train_optimizer.step() def test_epoch(self, data_x, data_y): - # prepare training data x_values = data_x.values y_values = np.squeeze(data_y.values) @@ -143,7 +138,6 @@ def test_epoch(self, data_x, data_y): indices = np.arange(len(x_values)) for i in range(len(indices))[:: self.batch_size]: - if len(indices) - i < self.batch_size: break @@ -166,7 +160,6 @@ def fit( evals_result=dict(), save_path=None, ): - df_train, df_valid, df_test = dataset.prepare( ["train", "valid", "test"], col_set=["feature", "label"], @@ -231,7 +224,6 @@ def predict(self, dataset: DatasetH, segment: Union[Text, slice] = "test"): preds = [] for begin in range(sample_num)[:: self.batch_size]: - if sample_num - begin < self.batch_size: end = sample_num else: diff --git a/qlib/contrib/model/pytorch_transformer_ts.py b/qlib/contrib/model/pytorch_transformer_ts.py index 6cffded9c9..84b093805c 100644 --- a/qlib/contrib/model/pytorch_transformer_ts.py +++ b/qlib/contrib/model/pytorch_transformer_ts.py @@ -43,7 +43,6 @@ def __init__( seed=None, **kwargs ): - # set hyper-parameters. 
         self.d_model = d_model
         self.dropout = dropout
@@ -93,7 +92,6 @@ def loss_fn(self, pred, label):
         raise ValueError("unknown loss `%s`" % self.loss)

     def metric_fn(self, pred, label):
-
         mask = torch.isfinite(label)

         if self.metric in ("", "loss"):
@@ -102,7 +100,6 @@ def metric_fn(self, pred, label):
         raise ValueError("unknown metric `%s`" % self.metric)

     def train_epoch(self, data_loader):
-
         self.model.train()

         for data in data_loader:
@@ -118,14 +115,12 @@ def train_epoch(self, data_loader):
             self.train_optimizer.step()

     def test_epoch(self, data_loader):
-
         self.model.eval()

         scores = []
         losses = []

         for data in data_loader:
-
             feature = data[:, :, 0:-1].to(self.device)
             label = data[:, -1, -1].to(self.device)
@@ -145,7 +140,6 @@ def fit(
         evals_result=dict(),
         save_path=None,
     ):
-
         dl_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
         dl_valid = dataset.prepare("valid", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
diff --git a/qlib/contrib/model/xgboost.py b/qlib/contrib/model/xgboost.py
index d38655ebdc..67bedafa87 100755
--- a/qlib/contrib/model/xgboost.py
+++ b/qlib/contrib/model/xgboost.py
@@ -30,7 +30,6 @@ def fit(
         reweighter=None,
         **kwargs
     ):
-
         df_train, df_valid = dataset.prepare(
             ["train", "valid"],
             col_set=["feature", "label"],
diff --git a/qlib/contrib/report/data/ana.py b/qlib/contrib/report/data/ana.py
index 782a92d5a5..567ef311d5 100644
--- a/qlib/contrib/report/data/ana.py
+++ b/qlib/contrib/report/data/ana.py
@@ -30,7 +30,6 @@ def calc_stat_values(self):
         """The statistics of features are finished in the underlying analysers"""

     def plot_all(self, *args, **kwargs):
-
         ax_gen = iter(sub_fig_generator(row_n=len(self._fea_ana_l), *args, **kwargs))

         for col in self._dataset:
diff --git a/qlib/contrib/report/data/base.py b/qlib/contrib/report/data/base.py
index 1e7e092afb..a91eda48e6 100644
--- a/qlib/contrib/report/data/base.py
+++ b/qlib/contrib/report/data/base.py
@@ -28,7 +28,6 @@ def skip(self, col):
         return False

     def plot_all(self, *args, **kwargs):
-
         ax_gen = iter(sub_fig_generator(*args, **kwargs))
         for col in self._dataset:
             if not self.skip(col):
diff --git a/qlib/contrib/report/graph.py b/qlib/contrib/report/graph.py
index c5f932978b..f9cf517ea7 100644
--- a/qlib/contrib/report/graph.py
+++ b/qlib/contrib/report/graph.py
@@ -15,7 +15,6 @@

 class BaseGraph:
-
     _name = None

     def __init__(
diff --git a/qlib/contrib/rolling/__init__.py b/qlib/contrib/rolling/__init__.py
new file mode 100644
index 0000000000..b940486fdf
--- /dev/null
+++ b/qlib/contrib/rolling/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+The difference between this module and the scripts in examples/benchmarks/benchmarks_dynamic:
+- This module focuses only on providing a general rolling implementation.
+  Anything benchmark-specific is placed in examples/benchmarks/benchmarks_dynamic.
+"""
diff --git a/qlib/contrib/rolling/__main__.py b/qlib/contrib/rolling/__main__.py
new file mode 100644
index 0000000000..461c0e777e
--- /dev/null
+++ b/qlib/contrib/rolling/__main__.py
@@ -0,0 +1,16 @@
+import fire
+from qlib import auto_init
+from qlib.contrib.rolling.base import Rolling
+from qlib.utils.mod import find_all_classes
+
+if __name__ == "__main__":
+    sub_commands = {}
+    for cls in find_all_classes("qlib.contrib.rolling", Rolling):
+        sub_commands[cls.__module__.split(".")[-1]] = cls
+    # The sub_commands will be like
+    # {'base': <class 'qlib.contrib.rolling.base.Rolling'>, ...}
+    # So you can run it with commands like the ones below
+    # - `python -m qlib.contrib.rolling base --conf_path <conf_path> run`
+    # - `base` can be replaced with the other module names
+    auto_init()
+    fire.Fire(sub_commands)
diff --git a/qlib/contrib/rolling/base.py b/qlib/contrib/rolling/base.py
new file mode 100644
index 0000000000..d179efb38b
--- /dev/null
+++ b/qlib/contrib/rolling/base.py
@@ -0,0 +1,246 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+from copy import deepcopy
+from pathlib import Path
+from typing import List, Optional, Union
+
+import fire
+import pandas as pd
+import yaml
+
+from qlib import auto_init
+from qlib.log import get_module_logger
+from qlib.model.ens.ensemble import RollingEnsemble
+from qlib.model.trainer import TrainerR
+from qlib.utils import get_cls_kwargs, init_instance_by_config
+from qlib.utils.data import update_config
+from qlib.workflow import R
+from qlib.workflow.record_temp import SignalRecord
+from qlib.workflow.task.collect import RecorderCollector
+from qlib.workflow.task.gen import RollingGen, task_generator
+from qlib.workflow.task.utils import replace_task_handler_with_cache
+
+
+class Rolling:
+    """
+    The motivation of the Rolling module:
+    - It focuses only on **offline** conversion of a specific task into rolling tasks.
+    - To make the implementation easier, the following factors are ignored:
+        - The tasks are dependent (e.g. time series).
+
+    Related modules, and how they differ from this one:
+    - MetaController: It is learning how to handle a task (e.g. learning to learn).
+        - Rolling, by contrast, is about how to split a single task into a time series of tasks and run them.
+    - OnlineStrategy: It focuses on serving a model, and the model can be updated over time.
+        - Rolling is much simpler and is only for testing rolling models offline. It is not intended to share an interface with OnlineStrategy.
+
+    The rolling-related code is shared at the `task_generator` & `RollingGen` level between this module and the ones above,
+    but it serves a different purpose, so the other parts are not shared.
+
+
+    .. code-block:: shell
+
+        # here is a typical use case of the module.
+        python -m qlib.contrib.rolling.base --conf_path <conf_path> run
+
+    **NOTE**
+    Before running the example, please clean your previous results with the following command:
+    - `rm -r mlruns`
+    - This is because it is very hard to permanently delete an experiment (it will be moved into .trash and raise an error when creating an experiment with the same name).
+
+    """
+
+    def __init__(
+        self,
+        conf_path: Union[str, Path],
+        exp_name: Optional[str] = None,
+        horizon: Optional[int] = 20,
+        step: int = 20,
+        h_path: Optional[str] = None,
+        train_start: Optional[str] = None,
+        test_end: Optional[str] = None,
+        task_ext_conf: Optional[dict] = None,
+        rolling_exp: Optional[str] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        conf_path : str
+            Path to the config for rolling.
+        exp_name : Optional[str]
+            The experiment name of the outputs (the output is a record which contains the concatenated predictions of the rolling records).
+        horizon: Optional[int] = 20,
+            The horizon of the prediction target.
+            This is used to override the prediction horizon of the config file.
+        h_path : Optional[str]
+            The path of a dumped data handler;
+            it may come from another data source. It will override the data handler in the config.
+        test_end : Optional[str]
+            The test end time for the data. It is typically used together with the handler.
+            You can do the same thing with task_ext_conf in a more complicated way.
+        train_start : Optional[str]
+            The train start time for the data. It is typically used together with the handler.
+            You can do the same thing with task_ext_conf in a more complicated way.
+        task_ext_conf : Optional[dict]
+            Options to update the task config.
+        rolling_exp : Optional[str]
+            The name of the experiment for the rolling models.
+            It will contain a lot of records in one experiment; each record corresponds to a specific rolling window.
+            Please note that it is different from the final experiment.
+        """
+        self.logger = get_module_logger("Rolling")
+        self.conf_path = Path(conf_path)
+        self.exp_name = exp_name
+        self._rid = None  # the final combined recorder id in `exp_name`
+
+        self.step = step
+        assert horizon is not None, "Current version does not support extracting horizon from the underlying dataset"
+        self.horizon = horizon
+        if rolling_exp is None:
+            datetime_suffix = pd.Timestamp.now().strftime("%Y%m%d%H%M%S")
+            self.rolling_exp = f"rolling_models_{datetime_suffix}"
+        else:
+            self.rolling_exp = rolling_exp
+            self.logger.warning(
+                "Using a user-specified name for the rolling models, so experiment names may be duplicated. "
+                "Please manually remove your previous rolling-model experiment with a command like `rm -r mlruns`."
+                " Otherwise it will prevent creating an experiment with the same name."
+            )
+        self.train_start = train_start
+        self.test_end = test_end
+        self.task_ext_conf = task_ext_conf
+        self.h_path = h_path
+
+        # FIXME:
+        # - the qlib_init section will be ignored by this module.
+        # - So we have to design a priority mechanism to solve this issue.
+
+    def _raw_conf(self) -> dict:
+        with self.conf_path.open("r") as f:
+            return yaml.safe_load(f)
+
+    def _replace_hanler_with_cache(self, task: dict):
+        """
+        Because the data processing part of the original rolling workflow is slow,
+        this method replaces the data handler in the task with a cached one to speed it up.
+        """
+        if self.h_path is not None:
+            h_path = Path(self.h_path)
+            task["dataset"]["kwargs"]["handler"] = f"file://{h_path}"
+        else:
+            task = replace_task_handler_with_cache(task, self.conf_path.parent)
+        return task
+
+    def _update_start_end_time(self, task: dict):
+        if self.train_start is not None:
+            seg = task["dataset"]["kwargs"]["segments"]["train"]
+            task["dataset"]["kwargs"]["segments"]["train"] = pd.Timestamp(self.train_start), seg[1]
+
+        if self.test_end is not None:
+            seg = task["dataset"]["kwargs"]["segments"]["test"]
+            task["dataset"]["kwargs"]["segments"]["test"] = seg[0], pd.Timestamp(self.test_end)
+        return task
+
+    def basic_task(self, enable_handler_cache: Optional[bool] = True):
+        """
+        The basic task may not be exactly the same as the config from `conf_path` in __init__, because
+        - some parameters could be overridden by parameters from __init__
+        - users could implement a subclass to change it for higher performance
+        """
+        task: dict = self._raw_conf()["task"]
+        task = deepcopy(task)
+
+        # modify dataset horizon
+        # NOTE:
+        # It assumes that the label can be modified in the handler's kwargs.
+        # But this is not always valid; it is only valid for the predefined datasets `Alpha158` & `Alpha360`.
+        if self.horizon is None:
+            # TODO:
+            # - get horizon automatically from the expression!!!!
+            raise NotImplementedError("This type of input is not supported")
+        else:
+            self.logger.info("The prediction horizon is overridden")
+            task["dataset"]["kwargs"]["handler"]["kwargs"]["label"] = [
+                "Ref($close, -{}) / Ref($close, -1) - 1".format(self.horizon + 1)
+            ]
+
+        if enable_handler_cache:
+            task = self._replace_hanler_with_cache(task)
+        task = self._update_start_end_time(task)
+
+        if self.task_ext_conf is not None:
+            task = update_config(task, self.task_ext_conf)
+        self.logger.info(task)
+        return task
+
+    def get_task_list(self) -> List[dict]:
+        """Return a batch of tasks for rolling."""
+        task = self.basic_task()
+        task_l = task_generator(
+            task, RollingGen(step=self.step, trunc_days=self.horizon + 1)
+        )  # the last `horizon + 1` days should be truncated to avoid information leakage
+        for t in task_l:
+            # When rolling the tasks, no further analysis is needed;
+            # analysis is postponed to the final ensemble.
+            t["record"] = ["qlib.workflow.record_temp.SignalRecord"]
+        return task_l
+
+    def _train_rolling_tasks(self):
+        task_l = self.get_task_list()
+        self.logger.info("Deleting previous Rolling results")
+        try:
+            # TODO: mlflow does not support permanently deleting an experiment;
+            # it will be moved to .trash, which prevents creating an experiment with the same name.
+            R.delete_exp(experiment_name=self.rolling_exp)  # We should remove the rolling experiments.
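+            # NOTE: R.delete_exp raises ValueError when no previous experiment with
+            # this name exists; that case is handled below as a first run.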
+        except ValueError:
+            self.logger.info("No previous rolling results")
+        trainer = TrainerR(experiment_name=self.rolling_exp)
+        trainer(task_l)
+
+    def _ens_rolling(self):
+        rc = RecorderCollector(
+            experiment=self.rolling_exp,
+            artifacts_key=["pred", "label"],
+            process_list=[RollingEnsemble()],
+            # rec_key_func=lambda rec: (self.COMB_EXP, rec.info["id"]),
+            artifacts_path={"pred": "pred.pkl", "label": "label.pkl"},
+        )
+        res = rc()
+        with R.start(experiment_name=self.exp_name):
+            R.log_params(exp_name=self.rolling_exp)
+            R.save_objects(**{"pred.pkl": res["pred"], "label.pkl": res["label"]})
+            self._rid = R.get_recorder().id
+
+    def _update_rolling_rec(self):
+        """
+        Evaluate the combined rolling results.
+        """
+        rec = R.get_recorder(experiment_name=self.exp_name, recorder_id=self._rid)
+        # Follow the original analyser
+        records = self._raw_conf()["task"].get("record", [])
+        if isinstance(records, dict):  # handle the case of a single dict
+            records = [records]
+        for record in records:
+            if issubclass(get_cls_kwargs(record)[0], SignalRecord):
+                # skip the signal record.
+                continue
+            r = init_instance_by_config(
+                record,
+                recorder=rec,
+                default_module="qlib.workflow.record_temp",
+            )
+            r.generate()
+        print(f"Your evaluation results can be found in the experiment named `{self.exp_name}`.")
+
+    def run(self):
+        # the results will be saved in mlruns.
+        # 1) each rolling task is saved in rolling_models
+        self._train_rolling_tasks()
+        # 2) combined rolling tasks and evaluation results are saved in rolling
+        self._ens_rolling()
+        self._update_rolling_rec()
+
+
+if __name__ == "__main__":
+    auto_init()
+    fire.Fire(Rolling)
diff --git a/qlib/contrib/rolling/ddgda.py b/qlib/contrib/rolling/ddgda.py
new file mode 100644
index 0000000000..25fb4c36e2
--- /dev/null
+++ b/qlib/contrib/rolling/ddgda.py
@@ -0,0 +1,343 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
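+# A minimal usage sketch (`<conf_path>` is a placeholder for your own workflow
+# config, not something shipped with this module). DDGDA is exposed through the
+# package CLI in __main__.py:
+#
+#     python -m qlib.contrib.rolling ddgda --conf_path <conf_path> run
+#
+# or, equivalently, programmatically:
+#
+#     from qlib import auto_init
+#     from qlib.contrib.rolling.ddgda import DDGDA
+#
+#     auto_init()
+#     DDGDA(conf_path="<conf_path>").run()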
+from pathlib import Path
+import pickle
+from typing import Optional, Union
+
+import pandas as pd
+import yaml
+
+from qlib.contrib.meta.data_selection.dataset import InternalData, MetaDatasetDS
+from qlib.contrib.meta.data_selection.model import MetaModelDS
+from qlib.data.dataset.handler import DataHandlerLP
+from qlib.model.meta.task import MetaTask
+from qlib.model.trainer import TrainerR
+from qlib.typehint import Literal
+from qlib.utils import init_instance_by_config
+from qlib.workflow import R
+from qlib.workflow.task.utils import replace_task_handler_with_cache
+
+from .base import Rolling
+
+# this LGBM model is designed for feature importance & similarity
+LGBM_MODEL = """
+class: LGBModel
+module_path: qlib.contrib.model.gbdt
+kwargs:
+    loss: mse
+    colsample_bytree: 0.8879
+    learning_rate: 0.2
+    subsample: 0.8789
+    lambda_l1: 205.6999
+    lambda_l2: 580.9768
+    max_depth: 8
+    num_leaves: 210
+    num_threads: 20
+"""
+# convert the yaml to a dict
+LGBM_MODEL = yaml.load(LGBM_MODEL, Loader=yaml.FullLoader)
+
+LINEAR_MODEL = """
+class: LinearModel
+module_path: qlib.contrib.model.linear
+kwargs:
+    estimator: ridge
+    alpha: 0.05
+"""
+LINEAR_MODEL = yaml.load(LINEAR_MODEL, Loader=yaml.FullLoader)
+
+PROC_ARGS = """
+infer_processors:
+    - class: RobustZScoreNorm
+      kwargs:
+          fields_group: feature
+          clip_outlier: true
+    - class: Fillna
+      kwargs:
+          fields_group: feature
+learn_processors:
+    - class: DropnaLabel
+    - class: CSRankNorm
+      kwargs:
+          fields_group: label
+"""
+PROC_ARGS = yaml.load(PROC_ARGS, Loader=yaml.FullLoader)
+
+UTIL_MODEL_TYPE = Literal["linear", "gbdt"]
+
+
+class DDGDA(Rolling):
+    """
+    A rolling workflow based on DDG-DA.
+
+    **NOTE**
+    Before running the example, please clean your previous results with the following command:
+    - `rm -r mlruns`
+    """
+
+    def __init__(
+        self,
+        sim_task_model: UTIL_MODEL_TYPE = "gbdt",
+        meta_1st_train_end: Optional[str] = None,
+        alpha: float = 0.01,
+        working_dir: Optional[Union[str, Path]] = None,
+        **kwargs,
+    ):
+        """
+
+        Parameters
+        ----------
+        sim_task_model: Literal["linear", "gbdt"] = "gbdt",
+            The model for calculating the similarity between data.
+        meta_1st_train_end: Optional[str]
+            the end datetime of the training data of the first meta task
+        alpha: float
+            The L2 regularization for ridge regression.
+            The `alpha` is only passed to MetaModelDS (it is not passed to sim_task_model currently).
+        """
+        # NOTE:
+        # the horizon must match the meaning in the base task template
+        self.meta_exp_name = "DDG-DA"
+        self.sim_task_model: UTIL_MODEL_TYPE = sim_task_model  # The model to capture the distribution of data.
+        self.alpha = alpha
+        self.meta_1st_train_end = meta_1st_train_end
+        super().__init__(**kwargs)
+        self.working_dir = self.conf_path.parent if working_dir is None else Path(working_dir)
+        self.proxy_hd = self.working_dir / "handler_proxy.pkl"
+
+    def _adjust_task(self, task: dict, astype: UTIL_MODEL_TYPE):
+        """
+        Some tasks are used for special purposes.
+        For example:
+        - GBDT is used for calculating feature importance
+        - Linear or GBDT is used for calculating similarity
+        - a well-processed dataset aligned with the linear model is used for meta learning
+        """
+        # NOTE: this is just for aligning with the previous implementation;
+        # it is not necessary for the current implementation
+        handler = task["dataset"].setdefault("kwargs", {}).setdefault("handler", {})
+        if astype == "gbdt":
+            task["model"] = LGBM_MODEL
+            if isinstance(handler, dict):
+                for k in ["infer_processors", "learn_processors"]:
+                    if k in handler.setdefault("kwargs", {}):
+                        handler["kwargs"].pop(k)
+        elif astype == "linear":
+            task["model"] = LINEAR_MODEL
+            handler["kwargs"].update(PROC_ARGS)
+        else:
+            raise ValueError(f"astype not supported: {astype}")
+        return task
+
+    def _get_feature_importance(self):
+        # this must be LightGBM, because we need to get the feature importance
+        task = self.basic_task(enable_handler_cache=False)
+        task = self._adjust_task(task, astype="gbdt")
+        task = replace_task_handler_with_cache(task, self.working_dir)
+
+        with R.start(experiment_name="feature_importance"):
+            model = init_instance_by_config(task["model"])
+            dataset = init_instance_by_config(task["dataset"])
+            model.fit(dataset)
+
+        fi = model.get_feature_importance()
+        # Because the model uses numpy arrays instead of dataframes when training LightGBM,
+        # we must take the following extra steps to map the importances back to the right feature names.
+        df = dataset.prepare(segments=slice(None), col_set="feature", data_key=DataHandlerLP.DK_R)
+        cols = df.columns
+        fi_named = {cols[int(k.split("_")[1])]: imp for k, imp in fi.to_dict().items()}
+
+        return pd.Series(fi_named)
+
+    def _dump_data_for_proxy_model(self):
+        """
+        Dump data for training the meta model.
+        The meta model will be trained on the proxy forecasting model.
+        This dataset is for the proxy forecasting model.
+        """
+        topk = 30
+        fi = self._get_feature_importance()
+        col_selected = fi.nlargest(topk)
+        # NOTE: adjusting to `self.sim_task_model` just for aligning with the previous implementation.
+        task = self._adjust_task(self.basic_task(enable_handler_cache=False), self.sim_task_model)
+        task = replace_task_handler_with_cache(task, self.working_dir)
+
+        dataset = init_instance_by_config(task["dataset"])
+        prep_ds = dataset.prepare(slice(None), col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
+
+        feature_df = prep_ds["feature"]
+        label_df = prep_ds["label"]
+
+        feature_selected = feature_df.loc[:, col_selected.index]
+
+        feature_selected = feature_selected.groupby("datetime", group_keys=False).apply(
+            lambda df: (df - df.mean()).div(df.std())
+        )
+        feature_selected = feature_selected.fillna(0.0)
+
+        df_all = {
+            "label": label_df.reindex(feature_selected.index),
+            "feature": feature_selected,
+        }
+        df_all = pd.concat(df_all, axis=1)
+        df_all.to_pickle(self.working_dir / "fea_label_df.pkl")
+
+        # dump data in handler format to align the interface
+        handler = DataHandlerLP(
+            data_loader={
+                "class": "qlib.data.dataset.loader.StaticDataLoader",
+                "kwargs": {"config": self.working_dir / "fea_label_df.pkl"},
+            }
+        )
+        handler.to_pickle(self.working_dir / self.proxy_hd, dump_all=True)
+
+    @property
+    def _internal_data_path(self):
+        return self.working_dir / f"internal_data_s{self.step}.pkl"
+
+    def _dump_meta_ipt(self):
+        """
+        Dump data for training the meta model.
+        This function dumps the input data for the meta model.
+        """
+        # According to the experiments, the choice of the model type is very important for achieving good results
+        sim_task = self._adjust_task(self.basic_task(enable_handler_cache=False), astype=self.sim_task_model)
+        sim_task = replace_task_handler_with_cache(sim_task, self.working_dir)
+
+        if self.sim_task_model == "gbdt":
+            sim_task["model"].setdefault("kwargs", {}).update({"early_stopping_rounds": None, "num_boost_round": 150})
+
+        exp_name_sim = f"data_sim_s{self.step}"
+
+        internal_data = InternalData(sim_task, self.step, exp_name=exp_name_sim)
+        internal_data.setup(trainer=TrainerR)
+
+        with self._internal_data_path.open("wb") as f:
+            pickle.dump(internal_data, f)
+
+    def _train_meta_model(self, fill_method="max"):
+        """
+        Train a meta model based on a simplified linear proxy model.
+        """
+
+        # 1) leverage the simplified proxy forecasting model to train the meta model.
+        # - Only the dataset part is important; the current version of the meta model handles the proxy model part internally.
+
+        # the train_start for training the meta model does not necessarily align with the final rolling
+        train_start = "2008-01-01" if self.train_start is None else self.train_start
+        train_end = "2010-12-31" if self.meta_1st_train_end is None else self.meta_1st_train_end
+        test_start = (pd.Timestamp(train_end) + pd.Timedelta(days=1)).strftime("%Y-%m-%d")
+        proxy_forecast_model_task = {
+            # "model": "qlib.contrib.model.linear.LinearModel",
+            "dataset": {
+                "class": "qlib.data.dataset.DatasetH",
+                "kwargs": {
+                    "handler": f"file://{(self.working_dir / self.proxy_hd).absolute()}",
+                    "segments": {
+                        "train": (train_start, train_end),
+                        "test": (test_start, self.basic_task()["dataset"]["kwargs"]["segments"]["test"][1]),
+                    },
+                },
+            },
+            # "record": ["qlib.workflow.record_temp.SignalRecord"]
+        }
+        # the proxy_forecast_model_task will be used to create meta tasks.
+        # The test date of the first task will be 2011-01-01. Each test segment will be about 20 days.
+        # The tasks include all training tasks and test tasks.
+
+        # 2) preparing the meta dataset
+        kwargs = dict(
+            task_tpl=proxy_forecast_model_task,
+            step=self.step,
+            segments=0.62,  # keep the test period consistent with the dataset yaml
+            trunc_days=1 + self.horizon,
+            hist_step_n=30,
+            fill_method=fill_method,
+            rolling_ext_days=0,
+        )
+        # NOTE:
+        # the input of the meta model (internal data) is shared between the proxy model and the final forecasting model,
+        # but their task test segments are not aligned! This worked in previous experiments,
+        # so the misalignment will not affect the effectiveness of the method.
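+        # load the internal data (the meta-model input dumped by `_dump_meta_ipt`)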
+        with self._internal_data_path.open("rb") as f:
+            internal_data = pickle.load(f)
+
+        md = MetaDatasetDS(exp_name=internal_data, **kwargs)
+
+        # 3) train and log the meta model
+        with R.start(experiment_name=self.meta_exp_name):
+            R.log_params(**kwargs)
+            mm = MetaModelDS(
+                step=self.step, hist_step_n=kwargs["hist_step_n"], lr=0.001, max_epoch=30, seed=43, alpha=self.alpha
+            )
+            mm.fit(md)
+            R.save_objects(model=mm)
+
+    @property
+    def _task_path(self):
+        return self.working_dir / f"tasks_s{self.step}.pkl"
+
+    def get_task_list(self):
+        """
+        Leverage the meta model for inference, given:
+        - baseline tasks
+        - the input for the meta model (internal data)
+        - the meta model (its knowledge learnt on the proxy forecasting model is expected to transfer to the normal forecasting model)
+        """
+        # 1) get the meta model
+        exp = R.get_exp(experiment_name=self.meta_exp_name)
+        rec = exp.list_recorders(rtype=exp.RT_L)[0]
+        meta_model: MetaModelDS = rec.load_object("model")
+
+        # 2)
+        # we transfer the knowledge of the meta model to the final forecasting tasks:
+        # create a MetaTaskDataset for the final forecasting tasks.
+        # Aligning its settings with the MetaTaskDataset used when training the meta model is necessary.
+
+        # 2.1) get the previous config
+        param = rec.list_params()
+        trunc_days = int(param["trunc_days"])
+        step = int(param["step"])
+        hist_step_n = int(param["hist_step_n"])
+        fill_method = param.get("fill_method", "max")
+
+        task_l = super().get_task_list()
+
+        # 2.2) create the meta dataset for the final tasks
+        kwargs = dict(
+            task_tpl=task_l,
+            step=step,
+            segments=0.0,  # all the tasks are for testing
+            trunc_days=trunc_days,
+            hist_step_n=hist_step_n,
+            fill_method=fill_method,
+            task_mode=MetaTask.PROC_MODE_TRANSFER,
+        )
+
+        with self._internal_data_path.open("rb") as f:
+            internal_data = pickle.load(f)
+        mds = MetaDatasetDS(exp_name=internal_data, **kwargs)
+
+        # 3) the meta model makes inference and produces new qlib tasks
+        new_tasks = meta_model.inference(mds)
+        with self._task_path.open("wb") as f:
+            pickle.dump(new_tasks, f)
+        return new_tasks
+
+    def run(self):
+        # prepare the meta model for rolling ---------
+        # 1) file: handler_proxy.pkl (self.proxy_hd)
+        self._dump_data_for_proxy_model()
+        # 2)
+        # file: internal_data_s20.pkl
+        # mlflow: data_sim_s20, models for calculating meta_ipt
+        self._dump_meta_ipt()
+        # 3) the meta model will be stored in `DDG-DA`
+        self._train_meta_model()
+
+        # Run rolling --------------------------------
+        # 4) new_tasks are saved in "tasks_s20.pkl" (a reweighter is added)
+        #    - the meta inference is done when calling `get_task_list`
+        # 5) load the saved tasks and train the models
+        super().run()
diff --git a/qlib/contrib/strategy/optimizer/optimizer.py b/qlib/contrib/strategy/optimizer/optimizer.py
index a70929e275..a5fb763127 100644
--- a/qlib/contrib/strategy/optimizer/optimizer.py
+++ b/qlib/contrib/strategy/optimizer/optimizer.py
@@ -112,7 +112,6 @@ def __call__(
         return w

     def _optimize(self, S: np.ndarray, r: Optional[np.ndarray] = None, w0: Optional[np.ndarray] = None) -> np.ndarray:
-
         # inverse volatility
         if self.method == self.OPT_INV:
             if r is not None:
diff --git a/qlib/contrib/strategy/rule_strategy.py b/qlib/contrib/strategy/rule_strategy.py
index 30facf3a37..f2b9197393 100644
--- a/qlib/contrib/strategy/rule_strategy.py
+++ b/qlib/contrib/strategy/rule_strategy.py
@@ -522,7 +522,6 @@ def generate_trade_decision(self, execute_result=None):
                 _order_amount = min(_order_amount, self.trade_amount[order.stock_id])

             if _order_amount > 1e-5:
-
                 _order = Order(
                     stock_id=order.stock_id,
                     amount=_order_amount,
diff
--git a/qlib/contrib/strategy/signal_strategy.py b/qlib/contrib/strategy/signal_strategy.py index 16ffff82d7..09ed1cd280 100644 --- a/qlib/contrib/strategy/signal_strategy.py +++ b/qlib/contrib/strategy/signal_strategy.py @@ -435,7 +435,6 @@ def __init__( self._riskdata_cache = {} def get_risk_data(self, date): - if date in self._riskdata_cache: return self._riskdata_cache[date] @@ -462,7 +461,6 @@ def get_risk_data(self, date): return self._riskdata_cache[date] def generate_target_weight_position(self, score, current, trade_start_time, trade_end_time): - trade_date = trade_start_time pre_date = get_pre_trading_date(trade_date, future=True) # previous trade date diff --git a/qlib/contrib/tuner/config.py b/qlib/contrib/tuner/config.py index 6e37f0097a..7a8534a20f 100644 --- a/qlib/contrib/tuner/config.py +++ b/qlib/contrib/tuner/config.py @@ -11,7 +11,6 @@ class TunerConfigManager: def __init__(self, config_path): - if not config_path: raise ValueError("Config path is invalid.") self.config_path = config_path @@ -58,7 +57,6 @@ def __init__(self, config, TUNER_CONFIG_MANAGER): class OptimizationConfig: def __init__(self, config, TUNER_CONFIG_MANAGER): - self.report_type = config.get("report_type", "pred_long") if self.report_type not in [ "pred_long", diff --git a/qlib/contrib/tuner/pipeline.py b/qlib/contrib/tuner/pipeline.py index db48c46cf6..34977fa55f 100644 --- a/qlib/contrib/tuner/pipeline.py +++ b/qlib/contrib/tuner/pipeline.py @@ -15,11 +15,9 @@ class Pipeline: - GLOBAL_BEST_PARAMS_NAME = "global_best_params.json" def __init__(self, tuner_config_manager): - self.logger = get_module_logger("Pipeline", sh_level=logging.INFO) self.tuner_config_manager = tuner_config_manager @@ -37,7 +35,6 @@ def __init__(self, tuner_config_manager): self.best_tuner_index = None def run(self): - TimeInspector.set_time_mark() for tuner_index, tuner_config in enumerate(self.pipeline_config): tuner = self.init_tuner(tuner_index, tuner_config) @@ -77,7 +74,6 @@ def init_tuner(self, tuner_index, tuner_config): return tuner_class(tuner_config, self.optim_config) def save_tuner_exp_info(self): - TimeInspector.set_time_mark() save_path = os.path.join(self.pipeline_ex_config.tuner_ex_dir, Pipeline.GLOBAL_BEST_PARAMS_NAME) with open(save_path, "w") as fp: diff --git a/qlib/contrib/tuner/tuner.py b/qlib/contrib/tuner/tuner.py index c183b28aed..7705ce8b73 100644 --- a/qlib/contrib/tuner/tuner.py +++ b/qlib/contrib/tuner/tuner.py @@ -24,7 +24,6 @@ class Tuner: def __init__(self, tuner_config, optim_config): - self.logger = get_module_logger("Tuner", sh_level=logging.INFO) self.tuner_config = tuner_config @@ -42,7 +41,6 @@ def __init__(self, tuner_config, optim_config): self.space = self.setup_space() def tune(self): - TimeInspector.set_time_mark() fmin( fn=self.objective, @@ -84,7 +82,6 @@ def save_local_best_params(self): class QLibTuner(Tuner): - ESTIMATOR_CONFIG_NAME = "estimator_config.yaml" EXP_INFO_NAME = "exp_info.json" EXP_RESULT_DIR = "sacred/{}" @@ -92,7 +89,6 @@ class QLibTuner(Tuner): LOCAL_BEST_PARAMS_NAME = "local_best_params.json" def objective(self, params): - # 1. Setup an config for a specific estimator process estimator_path = self.setup_estimator_config(params) self.logger.info("Searching params: {} ".format(params)) @@ -120,7 +116,6 @@ def objective(self, params): return {"loss": res, "status": status} def fetch_result(self): - # 1. 
Get experiment information exp_info_path = os.path.join(self.ex_dir, QLibTuner.EXP_INFO_NAME) with open(exp_info_path) as fp: @@ -155,7 +150,6 @@ def fetch_result(self): return np.abs(res.values[0] - 1) def setup_estimator_config(self, params): - estimator_config = copy.deepcopy(self.tuner_config) estimator_config["model"].update({"args": params["model_space"]}) estimator_config["strategy"].update({"args": params["strategy_space"]}) @@ -212,7 +206,6 @@ def setup_space(self): return space def save_local_best_params(self): - TimeInspector.set_time_mark() local_best_params_path = os.path.join(self.ex_dir, QLibTuner.LOCAL_BEST_PARAMS_NAME) with open(local_best_params_path, "w") as fp: diff --git a/qlib/data/cache.py b/qlib/data/cache.py index addd28871d..3264dcd020 100644 --- a/qlib/data/cache.py +++ b/qlib/data/cache.py @@ -583,7 +583,6 @@ def gen_expression_cache(self, expression_data, cache_path, instrument, field, f r.tofile(str(cache_path)) def update(self, sid, cache_uri, freq: str = "day"): - cp_cache_uri = self.get_cache_dir(freq).joinpath(sid).joinpath(cache_uri) meta_path = cp_cache_uri.with_suffix(".meta") if not self.check_cache_exists(cp_cache_uri, suffix_list=[".meta"]): @@ -696,7 +695,6 @@ def read_data_from_cache(cls, cache_path: Union[str, Path], start_time, end_time def _dataset( self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[] ): - if disk_cache == 0: # In this case, data_set cache is configured but will not be used. return self.provider.dataset( @@ -801,7 +799,6 @@ class IndexManager: KEY = "df" def __init__(self, cache_path: Union[str, Path]): - self.index_path = cache_path.with_suffix(".index") self._data = None self.logger = get_module_logger(self.__class__.__name__) @@ -1126,7 +1123,6 @@ def _uri(self, instruments, fields, start_time, end_time, freq, disk_cache=1, in def dataset( self, instruments, fields, start_time=None, end_time=None, freq="day", disk_cache=0, inst_processors=[] ): - if "local" in C.dataset_provider.lower(): # use LocalDatasetProvider return self.provider.dataset( @@ -1189,7 +1185,6 @@ def calendar(self, start_time=None, end_time=None, freq="day", future=False): uri = self._uri(start_time, end_time, freq, future) result, expire = MemCacheExpire.get_cache(H["c"], uri) if result is None or expire: - result = self.provider.calendar(start_time, end_time, freq, future) MemCacheExpire.set_cache(H["c"], uri, result) diff --git a/qlib/data/data.py b/qlib/data/data.py index 809b8d1c32..116827f232 100644 --- a/qlib/data/data.py +++ b/qlib/data/data.py @@ -1096,7 +1096,6 @@ def dataset( else: return data else: - """ Call the server to generate the data-set cache, get the uri of the cache file. Then load the data from the file on NFS directly. 
diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index f7204cf780..63acd937e6 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -132,7 +132,6 @@ def __init__(self, fields_group="feature", col_list=[]): self.col_list = col_list def __call__(self, df): - cols = get_group_columns(df, self.fields_group) all_cols = df.columns diff_cols = np.setdiff1d(all_cols.get_level_values(-1), cols.get_level_values(-1)) diff --git a/qlib/data/dataset/utils.py b/qlib/data/dataset/utils.py index 4761fb383c..f19dfe08fa 100644 --- a/qlib/data/dataset/utils.py +++ b/qlib/data/dataset/utils.py @@ -71,15 +71,11 @@ def fetch_df_by_index( if fetch_orig: for slc in idx_slc: if slc != slice(None, None): - return df.loc[ - pd.IndexSlice[idx_slc], - ] + return df.loc[pd.IndexSlice[idx_slc],] # noqa: E231 else: # pylint: disable=W0120 return df else: - return df.loc[ - pd.IndexSlice[idx_slc], - ] + return df.loc[pd.IndexSlice[idx_slc],] # noqa: E231 def fetch_df_by_col(df: pd.DataFrame, col_set: Union[str, List[str]]) -> pd.DataFrame: diff --git a/qlib/data/pit.py b/qlib/data/pit.py index 2294cfc0f4..673c3cb7b2 100644 --- a/qlib/data/pit.py +++ b/qlib/data/pit.py @@ -22,7 +22,6 @@ class P(ElemOperator): def _load_internal(self, instrument, start_index, end_index, freq): - _calendar = Cal.calendar(freq=freq) resample_data = np.empty(end_index - start_index + 1, dtype="float32") diff --git a/qlib/data/storage/file_storage.py b/qlib/data/storage/file_storage.py index ea72f53e4e..67554a7d24 100644 --- a/qlib/data/storage/file_storage.py +++ b/qlib/data/storage/file_storage.py @@ -191,7 +191,6 @@ def __len__(self) -> int: class FileInstrumentStorage(FileStorageMixin, InstrumentStorage): - INSTRUMENT_SEP = "\t" INSTRUMENT_START_FIELD = "start_datetime" INSTRUMENT_END_FIELD = "end_datetime" @@ -261,7 +260,6 @@ def __getitem__(self, k: InstKT) -> InstVT: return self._read_instrument()[k] def update(self, *args, **kwargs) -> None: - if len(args) > 1: raise TypeError(f"update expected at most 1 arguments, got {len(args)}") inst = self._read_instrument() @@ -360,7 +358,6 @@ def __getitem__(self, i: Union[int, slice]) -> Union[Tuple[int, float], pd.Serie storage_end_index = self.end_index with self.uri.open("rb") as fp: if isinstance(i, int): - if storage_start_index > i: raise IndexError(f"{i}: start index is {storage_start_index}") fp.seek(4 * (i - storage_start_index) + 4) diff --git a/qlib/log.py b/qlib/log.py index 115abc137f..f7683d5116 100644 --- a/qlib/log.py +++ b/qlib/log.py @@ -84,7 +84,6 @@ def __call__(self, module_name, level: Optional[int] = None) -> QlibLogger: class TimeInspector: - timer_logger = get_module_logger("timer") time_marks = [] diff --git a/qlib/model/riskmodel/poet.py b/qlib/model/riskmodel/poet.py index 8946b2ac5c..42388d84cb 100644 --- a/qlib/model/riskmodel/poet.py +++ b/qlib/model/riskmodel/poet.py @@ -43,7 +43,6 @@ def __init__(self, num_factors: int = 0, thresh: float = 1.0, thresh_method: str self.thresh_method = thresh_method def _predict(self, X: np.ndarray) -> np.ndarray: - Y = X.T # NOTE: to match POET's implementation p, n = Y.shape diff --git a/qlib/tests/__init__.py b/qlib/tests/__init__.py index 52c924918a..97ff00c579 100644 --- a/qlib/tests/__init__.py +++ b/qlib/tests/__init__.py @@ -14,7 +14,6 @@ class TestAutoData(unittest.TestCase): - _setup_kwargs = {} provider_uri = "~/.qlib/qlib_data/cn_data_simple" # target_dir provider_uri_1day = "~/.qlib/qlib_data/cn_data" # target_dir @@ -286,6 +285,5 @@ class 
TestMockData(unittest.TestCase): @classmethod def setUpClass(cls) -> None: - provider_uri = "Not necessary." init(region=REG_TW, provider_uri=provider_uri, expression_cache=None, dataset_cache=None, **cls._setup_kwargs) diff --git a/qlib/utils/__init__.py b/qlib/utils/__init__.py index 910a4c08b2..9e63c104a1 100644 --- a/qlib/utils/__init__.py +++ b/qlib/utils/__init__.py @@ -7,12 +7,9 @@ from __future__ import print_function import os -import pickle import re -import sys import copy import json -from qlib.typehint import InstConf import yaml import redis import bisect @@ -22,15 +19,11 @@ import hashlib import datetime import requests -import importlib -import contextlib import collections import numpy as np import pandas as pd from pathlib import Path -from typing import List, Dict, Union, Tuple, Any, Optional, Callable -from types import ModuleType -from urllib.parse import urlparse +from typing import List, Union, Optional, Callable from packaging import version from .file import get_or_create_path, save_multiple_parts_file, unpack_archive_with_buffer, get_tmp_file_with_buffer from ..config import C @@ -288,182 +281,6 @@ def parse_field(field): return field -def get_module_by_module_path(module_path: Union[str, ModuleType]): - """Load module path - - :param module_path: - :return: - :raises: ModuleNotFoundError - """ - if module_path is None: - raise ModuleNotFoundError("None is passed in as parameters as module_path") - - if isinstance(module_path, ModuleType): - module = module_path - else: - if module_path.endswith(".py"): - module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_"))) - module_spec = importlib.util.spec_from_file_location(module_name, module_path) - module = importlib.util.module_from_spec(module_spec) - sys.modules[module_name] = module - module_spec.loader.exec_module(module) - else: - module = importlib.import_module(module_path) - return module - - -def split_module_path(module_path: str) -> Tuple[str, str]: - """ - - Parameters - ---------- - module_path : str - e.g. "a.b.c.ClassName" - - Returns - ------- - Tuple[str, str] - e.g. ("a.b.c", "ClassName") - """ - *m_path, cls = module_path.split(".") - m_path = ".".join(m_path) - return m_path, cls - - -def get_callable_kwargs(config: InstConf, default_module: Union[str, ModuleType] = None) -> (type, dict): - """ - extract class/func and kwargs from config info - - Parameters - ---------- - config : [dict, str] - similar to config - please refer to the doc of init_instance_by_config - - default_module : Python module or str - It should be a python module to load the class type - This function will load class from the config['module_path'] first. - If config['module_path'] doesn't exists, it will load the class from default_module. - - Returns - ------- - (type, dict): - the class/func object and it's arguments. 
- - Raises - ------ - ModuleNotFoundError - """ - if isinstance(config, dict): - key = "class" if "class" in config else "func" - if isinstance(config[key], str): - # 1) get module and class - # - case 1): "a.b.c.ClassName" - # - case 2): {"class": "ClassName", "module_path": "a.b.c"} - m_path, cls = split_module_path(config[key]) - if m_path == "": - m_path = config.get("module_path", default_module) - module = get_module_by_module_path(m_path) - - # 2) get callable - _callable = getattr(module, cls) # may raise AttributeError - else: - _callable = config[key] # the class type itself is passed in - kwargs = config.get("kwargs", {}) - elif isinstance(config, str): - # a.b.c.ClassName - m_path, cls = split_module_path(config) - module = get_module_by_module_path(default_module if m_path == "" else m_path) - - _callable = getattr(module, cls) - kwargs = {} - else: - raise NotImplementedError(f"This type of input is not supported") - return _callable, kwargs - - -get_cls_kwargs = get_callable_kwargs # NOTE: this is for compatibility for the previous version - - -def init_instance_by_config( - config: InstConf, - default_module=None, - accept_types: Union[type, Tuple[type]] = (), - try_kwargs: Dict = {}, - **kwargs, -) -> Any: - """ - get initialized instance with config - - Parameters - ---------- - config : InstConf - - default_module : Python module - Optional. It should be a python module. - NOTE: the "module_path" will be override by `module` arguments - - This function will load class from the config['module_path'] first. - If config['module_path'] doesn't exists, it will load the class from default_module. - - accept_types: Union[type, Tuple[type]] - Optional. If the config is a instance of specific type, return the config directly. - This will be passed into the second parameter of isinstance. - - try_kwargs: Dict - Try to pass in kwargs in `try_kwargs` when initialized the instance - If error occurred, it will fail back to initialization without try_kwargs. - - Returns - ------- - object: - An initialized object based on the config info - """ - if isinstance(config, accept_types): - return config - - if isinstance(config, (str, Path)): - if isinstance(config, str): - # path like 'file:////obj.pkl' - pr = urlparse(config) - if pr.scheme == "file": - pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc - with open(os.path.normpath(pr_path), "rb") as f: - return pickle.load(f) - else: - with config.open("rb") as f: - return pickle.load(f) - - klass, cls_kwargs = get_callable_kwargs(config, default_module=default_module) - - try: - return klass(**cls_kwargs, **try_kwargs, **kwargs) - except (TypeError,): - # TypeError for handling errors like - # 1: `XXX() got multiple values for keyword argument 'YYY'` - # 2: `XXX() got an unexpected keyword argument 'YYY' - return klass(**cls_kwargs, **kwargs) - - -@contextlib.contextmanager -def class_casting(obj: object, cls: type): - """ - Python doesn't provide the downcasting mechanism. 
-    We use the trick here to downcast the class
-
-    Parameters
-    ----------
-    obj : object
-        the object to be cast
-    cls : type
-        the target class type
-    """
-    orig_cls = obj.__class__
-    obj.__class__ = cls
-    yield
-    obj.__class__ = orig_cls
-
-
 def compare_dict_value(src_data: dict, dst_data: dict):
     """Compare dict value
@@ -744,7 +561,6 @@ def exists_qlib_data(qlib_dir):
         return False
     # check calendar bin
     for _calendar in calendars_dir.iterdir():
-
         if ("_future" not in _calendar.name) and (
             not list(features_dir.rglob(f"*.{_calendar.name.split('.')[0]}.bin"))
         ):
@@ -872,9 +688,9 @@ def get_item_from_obj(config: dict, name_path: str) -> object:
     cur_cfg = config
     for k in name_path.split("."):
         if isinstance(cur_cfg, dict):
-            cur_cfg = cur_cfg[k]
+            cur_cfg = cur_cfg[k]  # may raise KeyError
         elif k.isdigit():
-            cur_cfg = cur_cfg[int(k)]
+            cur_cfg = cur_cfg[int(k)]  # may raise IndexError
         else:
             raise ValueError(f"Error when getting {k} from cur_cfg")
     return cur_cfg
@@ -910,6 +726,21 @@ def fill_placeholder(config: dict, config_extend: dict):
     top = 0
     tail = 1
     item_queue = [config]
+
+    def try_replace_placeholder(value):
+        if value in config_extend.keys():
+            value = config_extend[value]
+        else:
+            m = re.match(r"<(?P<name_path>[^<>]+)>", value)
+            if m is not None:
+                try:
+                    value = get_item_from_obj(config, m.groupdict()["name_path"])
+                except (KeyError, ValueError, IndexError):
+                    get_module_logger("fill_placeholder").info(
+                        f"{value} looks like a placeholder, but it can't be matched to any given values"
+                    )
+        return value
+
     while top < tail:
         now_item = item_queue[top]
         top += 1
@@ -917,17 +748,13 @@ def fill_placeholder(config: dict, config_extend: dict):
             item_keys = range(len(now_item))
         elif isinstance(now_item, dict):
             item_keys = now_item.keys()
-        for key in item_keys:
+        for key in item_keys:  # noqa
             if isinstance(now_item[key], (list, dict)):
                 item_queue.append(now_item[key])
                 tail += 1
             elif isinstance(now_item[key], str):
-                if now_item[key] in config_extend.keys():
-                    now_item[key] = config_extend[now_item[key]]
-                else:
-                    m = re.match(r"<(?P<name_path>[^<>]+)>", now_item[key])
-                    if m is not None:
-                        now_item[key] = get_item_from_obj(config, m.groupdict()["name_path"])
+                # If it is a string, try to replace it with the placeholder value
+                now_item[key] = try_replace_placeholder(now_item[key])
     return config
@@ -1049,6 +876,15 @@ def fname_to_code(fname: str):
     return fname
 
+from .mod import (
+    get_module_by_module_path,
+    split_module_path,
+    get_callable_kwargs,
+    get_cls_kwargs,
+    init_instance_by_config,
+    class_casting,
+)
+
 __all__ = [
     "get_or_create_path",
     "save_multiple_parts_file",
@@ -1056,4 +892,10 @@ def fname_to_code(fname: str):
     "get_tmp_file_with_buffer",
     "set_log_with_config",
     "init_instance_by_config",
+    "get_module_by_module_path",
+    "split_module_path",
+    "get_callable_kwargs",
+    "get_cls_kwargs",
+    "init_instance_by_config",
+    "class_casting",
+]
diff --git a/qlib/utils/index_data.py b/qlib/utils/index_data.py
index b62bc02ced..113f9802d7 100644
--- a/qlib/utils/index_data.py
+++ b/qlib/utils/index_data.py
@@ -351,7 +351,6 @@ class IndexData(metaclass=index_data_ops_creator):
     loc_idx_cls = LocIndexer
 
     def __init__(self, data: np.ndarray, *indices: Union[List, pd.Index, Index]):
-
         self.data = data
         self.indices = indices
 
diff --git a/qlib/utils/mod.py b/qlib/utils/mod.py
new file mode 100644
index 0000000000..e539572606
--- /dev/null
+++ b/qlib/utils/mod.py
@@ -0,0 +1,235 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""
+All module related classes, e.g.:
+- importing a module, class
+- walking a module
+- operations on class or module...
+"""
+
+import contextlib
+import importlib
+import os
+from pathlib import Path
+import pickle
+import pkgutil
+import re
+import sys
+from types import ModuleType
+from typing import Any, Dict, List, Tuple, Union
+from urllib.parse import urlparse
+
+from qlib.typehint import InstConf
+
+
+def get_module_by_module_path(module_path: Union[str, ModuleType]):
+    """Load module path
+
+    :param module_path:
+    :return:
+    :raises: ModuleNotFoundError
+    """
+    if module_path is None:
+        raise ModuleNotFoundError("None is passed in as parameters as module_path")
+
+    if isinstance(module_path, ModuleType):
+        module = module_path
+    else:
+        if module_path.endswith(".py"):
+            module_name = re.sub("^[^a-zA-Z_]+", "", re.sub("[^0-9a-zA-Z_]", "", module_path[:-3].replace("/", "_")))
+            module_spec = importlib.util.spec_from_file_location(module_name, module_path)
+            module = importlib.util.module_from_spec(module_spec)
+            sys.modules[module_name] = module
+            module_spec.loader.exec_module(module)
+        else:
+            module = importlib.import_module(module_path)
+    return module
+
+
+def split_module_path(module_path: str) -> Tuple[str, str]:
+    """
+
+    Parameters
+    ----------
+    module_path : str
+        e.g. "a.b.c.ClassName"
+
+    Returns
+    -------
+    Tuple[str, str]
+        e.g. ("a.b.c", "ClassName")
+    """
+    *m_path, cls = module_path.split(".")
+    m_path = ".".join(m_path)
+    return m_path, cls
+
+
+def get_callable_kwargs(config: InstConf, default_module: Union[str, ModuleType] = None) -> (type, dict):
+    """
+    extract class/func and kwargs from config info
+
+    Parameters
+    ----------
+    config : [dict, str]
+        similar to config
+        please refer to the doc of init_instance_by_config
+
+    default_module : Python module or str
+        It should be a python module to load the class type
+        This function will load class from the config['module_path'] first.
+        If config['module_path'] doesn't exist, it will load the class from default_module.
+
+    Returns
+    -------
+    (type, dict):
+        the class/func object and its arguments.
+
+    Raises
+    ------
+    ModuleNotFoundError
+    """
+    if isinstance(config, dict):
+        key = "class" if "class" in config else "func"
+        if isinstance(config[key], str):
+            # 1) get module and class
+            # - case 1): "a.b.c.ClassName"
+            # - case 2): {"class": "ClassName", "module_path": "a.b.c"}
+            m_path, cls = split_module_path(config[key])
+            if m_path == "":
+                m_path = config.get("module_path", default_module)
+            module = get_module_by_module_path(m_path)
+
+            # 2) get callable
+            _callable = getattr(module, cls)  # may raise AttributeError
+        else:
+            _callable = config[key]  # the class type itself is passed in
+        kwargs = config.get("kwargs", {})
+    elif isinstance(config, str):
+        # a.b.c.ClassName
+        m_path, cls = split_module_path(config)
+        module = get_module_by_module_path(default_module if m_path == "" else m_path)
+
+        _callable = getattr(module, cls)
+        kwargs = {}
+    else:
+        raise NotImplementedError(f"This type of input is not supported")
+    return _callable, kwargs
+
+
+get_cls_kwargs = get_callable_kwargs  # NOTE: this is for compatibility with the previous version
+
+
+def init_instance_by_config(
+    config: InstConf,
+    default_module=None,
+    accept_types: Union[type, Tuple[type]] = (),
+    try_kwargs: Dict = {},
+    **kwargs,
+) -> Any:
+    """
+    get initialized instance with config
+
+    Parameters
+    ----------
+    config : InstConf
+
+    default_module : Python module
+        Optional. It should be a python module.
+        NOTE: the "module_path" will be overridden by `module` arguments
+
+        This function will load class from the config['module_path'] first.
+        If config['module_path'] doesn't exist, it will load the class from default_module.
+
+    accept_types: Union[type, Tuple[type]]
+        Optional. If the config is an instance of a specific type, return the config directly.
+        This will be passed into the second parameter of isinstance.
+
+    try_kwargs: Dict
+        Try to pass in kwargs in `try_kwargs` when initializing the instance.
+        If an error occurs, it will fall back to initialization without try_kwargs.
+
+    Returns
+    -------
+    object:
+        An initialized object based on the config info
+    """
+    if isinstance(config, accept_types):
+        return config
+
+    if isinstance(config, (str, Path)):
+        if isinstance(config, str):
+            # path like 'file:///<path to pickle file>/obj.pkl'
+            pr = urlparse(config)
+            if pr.scheme == "file":
+                pr_path = os.path.join(pr.netloc, pr.path) if bool(pr.path) else pr.netloc
+                with open(os.path.normpath(pr_path), "rb") as f:
+                    return pickle.load(f)
+        else:
+            with config.open("rb") as f:
+                return pickle.load(f)
+
+    klass, cls_kwargs = get_callable_kwargs(config, default_module=default_module)
+
+    try:
+        return klass(**cls_kwargs, **try_kwargs, **kwargs)
+    except (TypeError,):
+        # TypeError for handling errors like
+        # 1: `XXX() got multiple values for keyword argument 'YYY'`
+        # 2: `XXX() got an unexpected keyword argument 'YYY'`
+        return klass(**cls_kwargs, **kwargs)
+
+
+@contextlib.contextmanager
+def class_casting(obj: object, cls: type):
+    """
+    Python doesn't provide the downcasting mechanism.
+    We use the trick here to downcast the class
+
+    Parameters
+    ----------
+    obj : object
+        the object to be cast
+    cls : type
+        the target class type
+    """
+    orig_cls = obj.__class__
+    obj.__class__ = cls
+    yield
+    obj.__class__ = orig_cls
+
+
+def find_all_classes(module_path: Union[str, ModuleType], cls: type) -> List[type]:
+    """
+    Find all the classes recursively that inherit from `cls` in a given module.
+    - `cls` itself is also included
+
+        >>> from qlib.data.dataset.handler import DataHandler
+        >>> find_all_classes("qlib.contrib.data.handler", DataHandler)
+        [<class 'qlib.contrib.data.handler.Alpha158'>, <class 'qlib.contrib.data.handler.Alpha158vwap'>, <class 'qlib.contrib.data.handler.Alpha360'>, <class 'qlib.contrib.data.handler.Alpha360vwap'>, <class 'qlib.data.dataset.handler.DataHandlerLP'>]
+
+    TODO:
+    - skip import error
+
+    """
+    if isinstance(module_path, ModuleType):
+        mod = module_path
+    else:
+        mod = importlib.import_module(module_path)
+
+    cls_list = []
+
+    def _append_cls(obj):
+        # Leverage the closure trick to reuse code
+        if isinstance(obj, type) and issubclass(obj, cls) and cls not in cls_list:
+            cls_list.append(obj)
+
+    for attr in dir(mod):
+        _append_cls(getattr(mod, attr))
+
+    if hasattr(mod, "__path__"):
+        # if the module is a package
+        for _, modname, _ in pkgutil.iter_modules(mod.__path__):
+            sub_mod = importlib.import_module(f"{mod.__package__}.{modname}")
+            for m_cls in find_all_classes(sub_mod, cls):
+                _append_cls(m_cls)
+    return cls_list
diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py
index fdb3f6c92a..d8b0a79a31 100644
--- a/qlib/workflow/record_temp.py
+++ b/qlib/workflow/record_temp.py
@@ -136,7 +136,6 @@ def check(self, include_self: bool = False, parents: bool = True):
             whether the records are stored properly.
         """
         if include_self:
-
             # Some mlflow backend will not list the directly recursively.
# So we force to the directly artifacts = {} diff --git a/qlib/workflow/task/gen.py b/qlib/workflow/task/gen.py index 77bd2cbc11..bd98e501db 100644 --- a/qlib/workflow/task/gen.py +++ b/qlib/workflow/task/gen.py @@ -339,7 +339,6 @@ def set_horizon(self, task: dict, hr: int): def generate(self, task: dict): res = [] for hr in self.horizon: - # Add horizon t = copy.deepcopy(task) self.set_horizon(t, hr) diff --git a/qlib/workflow/task/utils.py b/qlib/workflow/task/utils.py index a914ea54fe..19837b3c79 100644 --- a/qlib/workflow/task/utils.py +++ b/qlib/workflow/task/utils.py @@ -1,23 +1,25 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. - """ Some tools for task management. """ import bisect +from copy import deepcopy import pandas as pd from qlib.data import D +from qlib.utils import hash_args +from qlib.utils.mod import init_instance_by_config from qlib.workflow import R from qlib.config import C from qlib.log import get_module_logger from pymongo import MongoClient from pymongo.database import Database from typing import Union +from pathlib import Path def get_mongodb() -> Database: - """ Get database in MongoDB, which means you need to declare the address and the name of a database at first. @@ -276,3 +278,31 @@ def shift(self, seg: tuple, step: int, rtype=SHIFT_SD) -> tuple: return self.get(start_idx), self.get(end_idx) else: raise NotImplementedError(f"This type of input is not supported") + + +def replace_task_handler_with_cache(task: dict, cache_dir: Union[str, Path] = ".") -> dict: + """ + Replace the handler in task with a cache handler. + It will automatically cache the file and save it in cache_dir. + + >>> import qlib + >>> qlib.auto_init() + >>> import datetime + >>> # it is simplified task + >>> task = {"dataset": {"kwargs":{'handler': {'class': 'Alpha158', 'module_path': 'qlib.contrib.data.handler', 'kwargs': {'start_time': datetime.date(2008, 1, 1), 'end_time': datetime.date(2020, 8, 1), 'fit_start_time': datetime.date(2008, 1, 1), 'fit_end_time': datetime.date(2014, 12, 31), 'instruments': 'CSI300'}}}}} + >>> new_task = replace_task_handler_with_cache(task) + >>> print(new_task) + {'dataset': {'kwargs': {'handler': 'file...Alpha158.3584f5f8b4.pkl'}}} + + """ + cache_dir = Path(cache_dir) + task = deepcopy(task) + handler = task["dataset"]["kwargs"]["handler"] + if isinstance(handler, dict): + hash = hash_args(handler) + h_path = cache_dir / f"{handler['class']}.{hash[:10]}.pkl" + if not h_path.exists(): + h = init_instance_by_config(handler) + h.to_pickle(h_path, dump_all=True) + task["dataset"]["kwargs"]["handler"] = f"file://{h_path}" + return task diff --git a/scripts/check_dump_bin.py b/scripts/check_dump_bin.py index ef8023219e..7ae8a26ab0 100644 --- a/scripts/check_dump_bin.py +++ b/scripts/check_dump_bin.py @@ -15,7 +15,6 @@ class CheckBin: - NOT_IN_FEATURES = "not in features" COMPARE_FALSE = "compare False" COMPARE_TRUE = "compare True" diff --git a/scripts/data_collector/base.py b/scripts/data_collector/base.py index e3cf1fcacb..386bb1b2c0 100644 --- a/scripts/data_collector/base.py +++ b/scripts/data_collector/base.py @@ -18,7 +18,6 @@ class BaseCollector(abc.ABC): - CACHE_FLAG = "CACHED" NORMAL_FLAG = "NORMAL" @@ -185,7 +184,6 @@ def cache_small_data(self, symbol, df): return self.NORMAL_FLAG def _collector(self, instrument_list): - error_symbol = [] res = Parallel(n_jobs=self.max_workers)( delayed(self._simple_collector)(_inst) for _inst in tqdm(instrument_list) diff --git a/scripts/data_collector/br_index/collector.py 
b/scripts/data_collector/br_index/collector.py index 0dc12eff66..7d32170f06 100644 --- a/scripts/data_collector/br_index/collector.py +++ b/scripts/data_collector/br_index/collector.py @@ -21,7 +21,6 @@ class IBOVIndex(IndexBase): - ibov_index_composition = "https://raw.githubusercontent.com/igor17400/IBOV-HCI/main/historic_composition/{}.csv" years_4_month_periods = [] diff --git a/scripts/data_collector/us_index/collector.py b/scripts/data_collector/us_index/collector.py index 97cbce8252..cb0c3fc955 100644 --- a/scripts/data_collector/us_index/collector.py +++ b/scripts/data_collector/us_index/collector.py @@ -143,7 +143,6 @@ def filter_df(self, df: pd.DataFrame) -> pd.DataFrame: class NASDAQ100Index(WIKIIndex): - HISTORY_COMPANIES_URL = ( "https://indexes.nasdaqomx.com/Index/WeightingData?id=NDX&tradeDate={trade_date}T00%3A00%3A00.000&timeOfDay=SOD" ) diff --git a/scripts/dump_pit.py b/scripts/dump_pit.py index cda872c09f..c328eb67a8 100644 --- a/scripts/dump_pit.py +++ b/scripts/dump_pit.py @@ -237,7 +237,6 @@ def _dump_pit( pass with open(data_file, "rb+") as fd, open(index_file, "rb+") as fi: - # update index if needed for i, row in df_sub.iterrows(): # get index diff --git a/tests/backtest/test_high_freq_trading.py b/tests/backtest/test_high_freq_trading.py index fd934914d8..a538464db4 100644 --- a/tests/backtest/test_high_freq_trading.py +++ b/tests/backtest/test_high_freq_trading.py @@ -27,7 +27,6 @@ def _gen_orders(self, inst, date, pos) -> pd.DataFrame: return pd.DataFrame(orders, columns=headers) def test_trading(self): - # date = "2020-02-03" # inst = "SH600068" # pos = 2.0167 diff --git a/tests/data_mid_layer_tests/test_handler_storage.py b/tests/data_mid_layer_tests/test_handler_storage.py index 0d8ad4d570..a8bb730f7b 100644 --- a/tests/data_mid_layer_tests/test_handler_storage.py +++ b/tests/data_mid_layer_tests/test_handler_storage.py @@ -21,7 +21,6 @@ def __init__( fit_end_time=None, drop_raw=True, ): - infer_processors = check_transform_proc(infer_processors, fit_start_time, fit_end_time) learn_processors = check_transform_proc(learn_processors, fit_start_time, fit_end_time) @@ -51,7 +50,6 @@ def get_feature_config(self): class TestHandlerStorage(TestAutoData): - market = "all" start_time = "2010-01-01" @@ -82,7 +80,6 @@ def test_handler_storage(self): ) with TimeInspector.logt("random fetch with DataFrame Storage"): - # single stock for i in range(100): random_index = np.random.randint(len(instruments), size=1)[0] @@ -96,7 +93,6 @@ def test_handler_storage(self): data_handler.fetch(selector=(fetch_stocks, slice(fetch_start_time, fetch_end_time)), level=None) with TimeInspector.logt("random fetch with HashingStock Storage"): - # single stock for i in range(100): random_index = np.random.randint(len(instruments), size=1)[0] diff --git a/tests/misc/test_sepdf.py b/tests/misc/test_sepdf.py index 9fdc0bb2dd..76bd0e6bd6 100644 --- a/tests/misc/test_sepdf.py +++ b/tests/misc/test_sepdf.py @@ -11,7 +11,6 @@ def to_str(self, obj): return "".join(str(obj).split()) def test_index_data(self): - np.random.seed(42) index = [ diff --git a/tests/rolling_tests/test_update_pred.py b/tests/rolling_tests/test_update_pred.py index 3246119487..b3ca2e0368 100644 --- a/tests/rolling_tests/test_update_pred.py +++ b/tests/rolling_tests/test_update_pred.py @@ -77,7 +77,6 @@ def test_update_pred(self): @pytest.mark.slow def test_update_label(self): - task = copy.deepcopy(CSI300_GBDT_TASK) task["record"] = { diff --git a/tests/storage_tests/test_storage.py b/tests/storage_tests/test_storage.py 
index 50b16a041a..92fed34ecd 100644
--- a/tests/storage_tests/test_storage.py
+++ b/tests/storage_tests/test_storage.py
@@ -22,7 +22,6 @@ class TestStorage(TestAutoData):
 
     def test_calendar_storage(self):
-
         calendar = CalendarStorage(freq="day", future=False, provider_uri=self.provider_uri)
         assert isinstance(calendar[:], Iterable), f"{calendar.__class__.__name__}.__getitem__(s: slice) is not Iterable"
         assert isinstance(calendar.data, Iterable), f"{calendar.__class__.__name__}.data is not Iterable"
 
diff --git a/tests/test_get_data.py b/tests/test_get_data.py
index 94e685e1fb..125b9203e6 100644
--- a/tests/test_get_data.py
+++ b/tests/test_get_data.py
@@ -33,7 +33,6 @@ def tearDownClass(cls) -> None:
         shutil.rmtree(str(DATA_DIR.resolve()))
 
     def test_0_qlib_data(self):
-
         GetData().qlib_data(
             name="qlib_data_simple", target_dir=QLIB_DIR, region="cn", interval="1d", delete_old=False, exists_skip=True
         )

From 6cefe4a8052ba2c651c61ed1142ce724eae11a58 Mon Sep 17 00:00:00 2001
From: Linlang <30293408+SunsetWolf@users.noreply.github.com>
Date: Tue, 18 Jul 2023 19:25:08 +0800
Subject: [PATCH 20/37] Fixed pyqlib version issue on macos (#1605)

* change_publish

* Update .github/workflows/python-publish.yml

---------

Co-authored-by: Linlang
Co-authored-by: you-n-g
---
 .github/workflows/python-publish.yml | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index e95a9e88c8..5d88b2959a 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -19,7 +19,24 @@ jobs:
 
     steps:
     - uses: actions/checkout@v2
-    - name: Set up Python
+    # This is because on macos systems you can install pyqlib using
+    # `pip install pyqlib`, but pip does not recognize the
+    # `pyqlib-<version>-cp38-cp38-macosx_11_0_x86_64.whl` and `pyqlib-<version>-cp37-cp37m-macosx_11_0_x86_64.whl` wheels.
+    # So we pin the Python version in order to generate macos wheels that pip can install:
+    # `pyqlib-<version>-cp38-cp38-macosx_10_15_x86_64.whl` and `pyqlib-<version>-cp37-cp37m-macosx_10_15_x86_64.whl`.
+    # Python 3.7.16 and 3.8.16 build macosx_10_15 wheels, while Python 3.7.17 and 3.8.17 build macosx_11_0 wheels.
+    - name: Set up Python ${{ matrix.python-version }}
+      if: matrix.os == 'macos-11' && matrix.python-version == '3.7'
+      uses: actions/setup-python@v2
+      with:
+        python-version: "3.7.16"
+    - name: Set up Python ${{ matrix.python-version }}
+      if: matrix.os == 'macos-11' && matrix.python-version == '3.8'
+      uses: actions/setup-python@v2
+      with:
+        python-version: "3.8.16"
+    - name: Set up Python ${{ matrix.python-version }}
+      if: matrix.os != 'macos-11'
       uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python-version }}
@@ -27,7 +44,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         pip install setuptools wheel twine
-    - name: Build wheel on Windows
+    - name: Build wheel on ${{ matrix.os }}
       run: |
         pip install numpy
         pip install cython

From a65fca88bb1047867288d4b10b9e9a3458cf711e Mon Sep 17 00:00:00 2001
From: you-n-g
Date: Tue, 18 Jul 2023 19:28:17 +0800
Subject: [PATCH 21/37] Update __init__.py

---
 qlib/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qlib/__init__.py b/qlib/__init__.py
index 3355ac04f8..9bc13ff284 100644
--- a/qlib/__init__.py
+++ b/qlib/__init__.py
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
from pathlib import Path -__version__ = "0.9.2.99" +__version__ = "0.9.3" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From 9e990e585b8ff96ad7a6c9d29374f4b00c2c6893 Mon Sep 17 00:00:00 2001 From: you-n-g Date: Tue, 18 Jul 2023 20:54:15 +0800 Subject: [PATCH 22/37] Bump Version & Fix CI (#1606) * Bump Version & Fix CI * Update test_qlib_from_pip.yml --- .github/workflows/test_qlib_from_pip.yml | 1 - qlib/__init__.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index f5db06ccba..346dd49606 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -8,7 +8,6 @@ on: jobs: build: - if: ${{ false }} # FIXME: temporarily disable... Due to we are rushing a feature timeout-minutes: 120 runs-on: ${{ matrix.os }} diff --git a/qlib/__init__.py b/qlib/__init__.py index 9bc13ff284..ed95f589e4 100644 --- a/qlib/__init__.py +++ b/qlib/__init__.py @@ -2,7 +2,7 @@ # Licensed under the MIT License. from pathlib import Path -__version__ = "0.9.3" +__version__ = "0.9.3.99" __version__bak = __version__ # This version is backup for QlibConfig.reset_qlib_version import os from typing import Union From e5df2763c6e64d7b0f6d8473094db9352778d499 Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 19 Jul 2023 17:33:47 +0800 Subject: [PATCH 23/37] fix_ci (#1608) Co-authored-by: Linlang --- .github/workflows/test_qlib_from_pip.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/test_qlib_from_pip.yml b/.github/workflows/test_qlib_from_pip.yml index 346dd49606..bde41d8026 100644 --- a/.github/workflows/test_qlib_from_pip.yml +++ b/.github/workflows/test_qlib_from_pip.yml @@ -43,9 +43,6 @@ jobs: - name: Qlib installation test run: | python -m pip install pyqlib - # Specify the numpy version because the numpy upgrade caused the CI test to fail, - # and this line of code will be removed when the next version of qlib is released. - python -m pip install "numpy<1.23" - name: Install Lightgbm for MacOS if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} From ee50f7cb0d4091c0fb8850e0ca0750024be72200 Mon Sep 17 00:00:00 2001 From: JJ <103335846+computerscienceiscool@users.noreply.github.com> Date: Wed, 26 Jul 2023 01:37:59 -0700 Subject: [PATCH 24/37] Update introduction.rst (#1579) Fixed a spelling mistake. I changed deicsions to decisions. --- docs/introduction/introduction.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/introduction/introduction.rst b/docs/introduction/introduction.rst index 52d58e1639..f83a921c8d 100644 --- a/docs/introduction/introduction.rst +++ b/docs/introduction/introduction.rst @@ -51,7 +51,7 @@ Name Description modules. With these signals `Decision Generator` will generate the target trading decisions(i.e. portfolio, orders) If RL-based Strategies are adopted, the `Policy` is learned in a end-to-end way, - the trading deicsions are generated directly. + the trading decisions are generated directly. Decisions will be executed by `Execution Env` (i.e. the trading market). There may be multiple levels of `Strategy` and `Executor` (e.g. 
an *order executor trading strategy and intraday order executor* From 98640389c41658dcc307ee47adbb50da8606fb74 Mon Sep 17 00:00:00 2001 From: Gene <44857064+GeneLiuXe@users.noreply.github.com> Date: Wed, 26 Jul 2023 16:38:22 +0800 Subject: [PATCH 25/37] Update README.md (#1553) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 539700a910..131c699648 100644 --- a/README.md +++ b/README.md @@ -360,7 +360,7 @@ Here is a list of models built on `Qlib`. Your PR of new Quant models is highly welcomed. -The performance of each model on the `Alpha158` and `Alpha360` dataset can be found [here](examples/benchmarks/README.md). +The performance of each model on the `Alpha158` and `Alpha360` datasets can be found [here](examples/benchmarks/README.md). ### Run a single model All the models listed above are runnable with ``Qlib``. Users can find the config files we provide and some details about the model through the [benchmarks](examples/benchmarks) folder. More information can be retrieved at the model files listed above. From 2d0162df4498d7c3545573f074e17acbf14c6403 Mon Sep 17 00:00:00 2001 From: JJ <103335846+computerscienceiscool@users.noreply.github.com> Date: Wed, 26 Jul 2023 01:42:53 -0700 Subject: [PATCH 26/37] Update introduction.rst (#1578) --- docs/introduction/introduction.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/introduction/introduction.rst b/docs/introduction/introduction.rst index f83a921c8d..9455a8e3c8 100644 --- a/docs/introduction/introduction.rst +++ b/docs/introduction/introduction.rst @@ -36,7 +36,7 @@ Name Description the training process of models which enable algorithms controlling the training process. -`Learning Framework` layer The `Forecast Model` and `Trading Agent` are learnable. They are learned +`Learning Framework` layer The `Forecast Model` and `Trading Agent` are trainable. They are trained based on the `Learning Framework` layer and then applied to multiple scenarios in `Workflow` layer. The supported learning paradigms can be categorized into reinforcement learning and supervised learning. The learning framework From e2019f8ea1cc3f053c59e23bea207a1195d4dd73 Mon Sep 17 00:00:00 2001 From: Fivele-Li <128388363+Fivele-Li@users.noreply.github.com> Date: Tue, 1 Aug 2023 19:02:04 +0800 Subject: [PATCH 27/37] depress warning with pandas option_context (#1524) Co-authored-by: Cadenza-Li <362237642@qq.com> --- README.md | 2 +- examples/benchmarks/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 131c699648..2355c5c6d2 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ This table demonstrates the supported Python version of `Qlib`: | Python 3.9 | :x: | :heavy_check_mark: | :x: | **Note**: -1. **Conda** is suggested for managing your Python environment. +1. **Conda** is suggested for managing your Python environment. In some cases, using Python outside of a `conda` environment may result in missing header files, causing the installation failure of certain packages. 1. Please pay attention that installing cython in Python 3.6 will raise some error when installing ``Qlib`` from source. If users use Python 3.6 on their machines, it is recommended to *upgrade* Python to version 3.7 or use `conda`'s Python to install ``Qlib`` from source. 1. 
For Python 3.9, `Qlib` supports running workflows such as training models, doing backtest and plot most of the related figures (those included in [notebook](examples/workflow_by_code.ipynb)). However, plotting for the *model performance* is not supported for now and we will fix this when the dependent packages are upgraded in the future. 1. `Qlib`Requires `tables` package, `hdf5` in tables does not support python3.9. diff --git a/examples/benchmarks/README.md b/examples/benchmarks/README.md index 41799205ef..6189518a1e 100644 --- a/examples/benchmarks/README.md +++ b/examples/benchmarks/README.md @@ -136,7 +136,7 @@ If you want to contribute your new models, you can follow the steps below. - `README.md`: a brief introduction to your models - `workflow_config__.yaml`: a configuration which can read by `qrun`. You are encouraged to run your model in all datasets. 3. You can integrate your model as a module [in this folder](https://github.com/microsoft/qlib/tree/main/qlib/contrib/model). -4. Please update your results in the above **Benchmark Tables**, e.g. [Alpha360](#alpha158-dataset), [Alpha158](#alpha158-dataset)(the values of each metric are the mean and std calculated based on **20 Runs** with different random seeds. You can accomplish the above operations through the automated [script](https://github.com/microsoft/qlib/blob/main/examples/run_all_model.py#LL286C22-L286C22) provided by Qlib, and get the final result in the .md file. if you don't have enough computational resource, you can ask for help in the PR). +4. Please update your results in the above **Benchmark Tables**, e.g. [Alpha360](#alpha158-dataset), [Alpha158](#alpha158-dataset)(the values of each metric are the mean and std calculated based on **20 Runs** with different random seeds. You can accomplish the above operations through the automated [script](https://github.com/microsoft/qlib/blob/main/examples/run_all_model.py) provided by Qlib, and get the final result in the .md file. if you don't have enough computational resource, you can ask for help in the PR). 5. Update the info in the index page in the [news list](https://github.com/microsoft/qlib#newspaper-whats-new----sparkling_heart) and [model list](https://github.com/microsoft/qlib#quant-model-paper-zoo). Finally, you can send PR for review. ([here is an example](https://github.com/microsoft/qlib/pull/1040)) From 42ba74666cacae9391cda41599238b0ef6aad33b Mon Sep 17 00:00:00 2001 From: Linlang <30293408+SunsetWolf@users.noreply.github.com> Date: Wed, 2 Aug 2023 20:14:54 +0800 Subject: [PATCH 28/37] fix docs (#1618) Co-authored-by: Linlang --- docs/component/data.rst | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/docs/component/data.rst b/docs/component/data.rst index 5a2d458f68..7cd2701a59 100644 --- a/docs/component/data.rst +++ b/docs/component/data.rst @@ -140,10 +140,11 @@ Users can also provide their own data in CSV format. However, the CSV data **mus where the data are in the following format: - .. code-block:: - - symbol,close - SH600000,120 + +-----------+-------+ + | symbol | close | + +===========+=======+ + | SH600000 | 120 | + +-----------+-------+ - CSV file **must** includes a column for the date, and when dumping the data, user must specify the date column name. Here is an example: @@ -153,11 +154,13 @@ Users can also provide their own data in CSV format. However, the CSV data **mus where the data are in the following format: - .. 
code-block::
-
-        symbol,date,close,open,volume
-        SH600000,2020-11-01,120,121,12300000
-        SH600000,2020-11-02,123,120,12300000
+    +---------+------------+-------+------+----------+
+    | symbol  | date       | close | open | volume   |
+    +=========+============+=======+======+==========+
+    | SH600000| 2020-11-01 | 120   | 121  | 12300000 |
+    +---------+------------+-------+------+----------+
+    | SH600000| 2020-11-02 | 123   | 120  | 12300000 |
+    +---------+------------+-------+------+----------+
 
 Supposed that users prepare their CSV format data in the directory ``~/.qlib/csv_data/my_data``, they can run the following command to start the conversion.

From b624ddf00b244db1ab742d9f9d6102ba86dac481 Mon Sep 17 00:00:00 2001
From: Di
Date: Fri, 4 Aug 2023 17:41:12 +0800
Subject: [PATCH 29/37] Add multi pass portfolio analysis record (#1546)

* Add multi pass port ana record

* Add list function

* Add documentation and support tag

* Add drop in replacement example

* reformat

* Change according to comments

* update format

* Update record_temp.py

Fix type hint

* Update record_temp.py

---
 ..._config_linear_Alpha158_multi_pass_bt.yaml |  78 +++++++++
 qlib/workflow/record_temp.py                  | 153 ++++++++++++++++--
 2 files changed, 221 insertions(+), 10 deletions(-)
 create mode 100644 examples/benchmarks/Linear/workflow_config_linear_Alpha158_multi_pass_bt.yaml

diff --git a/examples/benchmarks/Linear/workflow_config_linear_Alpha158_multi_pass_bt.yaml b/examples/benchmarks/Linear/workflow_config_linear_Alpha158_multi_pass_bt.yaml
new file mode 100644
index 0000000000..edd9d81e41
--- /dev/null
+++ b/examples/benchmarks/Linear/workflow_config_linear_Alpha158_multi_pass_bt.yaml
@@ -0,0 +1,78 @@
+qlib_init:
+    provider_uri: "~/.qlib/qlib_data/cn_data"
+    region: cn
+market: &market csi300
+benchmark: &benchmark SH000300
+data_handler_config: &data_handler_config
+    start_time: 2008-01-01
+    end_time: 2020-08-01
+    fit_start_time: 2008-01-01
+    fit_end_time: 2014-12-31
+    instruments: *market
+    infer_processors:
+        - class: RobustZScoreNorm
+          kwargs:
+              fields_group: feature
+              clip_outlier: true
+        - class: Fillna
+          kwargs:
+              fields_group: feature
+    learn_processors:
+        - class: DropnaLabel
+        - class: CSRankNorm
+          kwargs:
+              fields_group: label
+port_analysis_config: &port_analysis_config
+    strategy:
+        class: TopkDropoutStrategy
+        module_path: qlib.contrib.strategy
+        kwargs:
+            signal:
+                - <MODEL>
+                - <DATASET>
+            topk: 50
+            n_drop: 5
+    backtest:
+        start_time: 2017-01-01
+        end_time: 2020-08-01
+        account: 100000000
+        benchmark: *benchmark
+        exchange_kwargs:
+            limit_threshold: 0.095
+            deal_price: close
+            open_cost: 0.0005
+            close_cost: 0.0015
+            min_cost: 5
+task:
+    model:
+        class: LinearModel
+        module_path: qlib.contrib.model.linear
+        kwargs:
+            estimator: ols
+    dataset:
+        class: DatasetH
+        module_path: qlib.data.dataset
+        kwargs:
+            handler:
+                class: Alpha158
+                module_path: qlib.contrib.data.handler
+                kwargs: *data_handler_config
+            segments:
+                train: [2008-01-01, 2014-12-31]
+                valid: [2015-01-01, 2016-12-31]
+                test: [2017-01-01, 2020-08-01]
+    record:
+        - class: SignalRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              model: <MODEL>
+              dataset: <DATASET>
+        - class: SigAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              ana_long_short: True
+              ann_scaler: 252
+        - class: MultiPassPortAnaRecord
+          module_path: qlib.workflow.record_temp
+          kwargs:
+              config: *port_analysis_config
diff --git a/qlib/workflow/record_temp.py b/qlib/workflow/record_temp.py
index fdb3f6c92a..4c230e6e5e 100644
--- a/qlib/workflow/record_temp.py
+++ b/qlib/workflow/record_temp.py
@@ -4,8 +4,10 @@ import
logging import warnings import pandas as pd +import numpy as np +from tqdm import trange from pprint import pprint -from typing import Union, List, Optional +from typing import Union, List, Optional, Dict from qlib.utils.exceptions import LoadObjectError from ..contrib.evaluate import risk_analysis, indicator_analysis @@ -17,6 +19,7 @@ from ..utils import fill_placeholder, flatten_dict, class_casting, get_date_by_shift from ..utils.time import Freq from ..utils.data import deepcopy_basic_type +from ..utils.exceptions import QlibException from ..contrib.eva.alpha import calc_ic, calc_long_short_return, calc_long_short_prec @@ -230,9 +233,16 @@ def generate(self, *args, **kwargs): except FileNotFoundError: logger.warning("The dependent data does not exists. Generation skipped.") return - return self._generate(*args, **kwargs) + artifact_dict = self._generate(*args, **kwargs) + if isinstance(artifact_dict, dict): + self.save(**artifact_dict) + return artifact_dict - def _generate(self, *args, **kwargs): + def _generate(self, *args, **kwargs) -> Dict[str, object]: + """ + Run the concrete generating task, return the dictionary of the generated results. + The caller method will save the results to the recorder. + """ raise NotImplementedError(f"Please implement the `_generate` method") @@ -336,8 +346,8 @@ def _generate(self, label: Optional[pd.DataFrame] = None, **kwargs): } ) self.recorder.log_metrics(**metrics) - self.save(**objects) pprint(metrics) + return objects def list(self): paths = ["ic.pkl", "ric.pkl"] @@ -468,17 +478,18 @@ def _generate(self, **kwargs): if self.backtest_config["end_time"] is None: self.backtest_config["end_time"] = get_date_by_shift(dt_values.max(), 1) + artifact_objects = {} # custom strategy and get backtest portfolio_metric_dict, indicator_dict = normal_backtest( executor=self.executor_config, strategy=self.strategy_config, **self.backtest_config ) for _freq, (report_normal, positions_normal) in portfolio_metric_dict.items(): - self.save(**{f"report_normal_{_freq}.pkl": report_normal}) - self.save(**{f"positions_normal_{_freq}.pkl": positions_normal}) + artifact_objects.update({f"report_normal_{_freq}.pkl": report_normal}) + artifact_objects.update({f"positions_normal_{_freq}.pkl": positions_normal}) for _freq, indicators_normal in indicator_dict.items(): - self.save(**{f"indicators_normal_{_freq}.pkl": indicators_normal[0]}) - self.save(**{f"indicators_normal_{_freq}_obj.pkl": indicators_normal[1]}) + artifact_objects.update({f"indicators_normal_{_freq}.pkl": indicators_normal[0]}) + artifact_objects.update({f"indicators_normal_{_freq}_obj.pkl": indicators_normal[1]}) for _analysis_freq in self.risk_analysis_freq: if _analysis_freq not in portfolio_metric_dict: @@ -500,7 +511,7 @@ def _generate(self, **kwargs): analysis_dict = flatten_dict(analysis_df["risk"].unstack().T.to_dict()) self.recorder.log_metrics(**{f"{_analysis_freq}.{k}": v for k, v in analysis_dict.items()}) # save results - self.save(**{f"port_analysis_{_analysis_freq}.pkl": analysis_df}) + artifact_objects.update({f"port_analysis_{_analysis_freq}.pkl": analysis_df}) logger.info( f"Portfolio analysis record 'port_analysis_{_analysis_freq}.pkl' has been saved as the artifact of the Experiment {self.recorder.experiment_id}" ) @@ -525,12 +536,13 @@ def _generate(self, **kwargs): analysis_dict = analysis_df["value"].to_dict() self.recorder.log_metrics(**{f"{_analysis_freq}.{k}": v for k, v in analysis_dict.items()}) # save results - self.save(**{f"indicator_analysis_{_analysis_freq}.pkl": 
analysis_df})
+                artifact_objects.update({f"indicator_analysis_{_analysis_freq}.pkl": analysis_df})
                 logger.info(
                     f"Indicator analysis record 'indicator_analysis_{_analysis_freq}.pkl' has been saved as the artifact of the Experiment {self.recorder.experiment_id}"
                 )
                 pprint(f"The following are analysis results of indicators({_analysis_freq}).")
                 pprint(analysis_df)
+        return artifact_objects
 
     def list(self):
         list_path = []
@@ -553,3 +565,124 @@ def list(self):
             else:
                 warnings.warn(f"indicator_analysis freq {_analysis_freq} is not found")
         return list_path
+
+
+class MultiPassPortAnaRecord(PortAnaRecord):
+    """
+    This is the Multiple Pass Portfolio Analysis Record class that runs the backtest multiple times and generates the analysis results, such as those of the backtest. This class inherits the ``PortAnaRecord`` class.
+
+    If shuffle_init_score is enabled, the prediction scores of the first backtest date will be shuffled, so that the initial position will be random.
+    shuffle_init_score only works when the signal is used as a placeholder. The placeholder will be replaced by the pred.pkl saved in the recorder.
+
+    Parameters
+    ----------
+    recorder : Recorder
+        The recorder used to save the backtest results.
+    pass_num : int
+        The number of backtest passes.
+    shuffle_init_score : bool
+        Whether to shuffle the prediction score of the first backtest date.
+    """
+
+    depend_cls = SignalRecord
+
+    def __init__(self, recorder, pass_num=10, shuffle_init_score=True, **kwargs):
+        """
+        Parameters
+        ----------
+        recorder : Recorder
+            The recorder used to save the backtest results.
+        pass_num : int
+            The number of backtest passes.
+        shuffle_init_score : bool
+            Whether to shuffle the prediction score of the first backtest date.
+        """
+        self.pass_num = pass_num
+        self.shuffle_init_score = shuffle_init_score
+
+        super().__init__(recorder, **kwargs)
+
+        # Save original strategy so that pred df can be replaced in next generate
+        self.original_strategy = deepcopy_basic_type(self.strategy_config)
+        if not isinstance(self.original_strategy, dict):
+            raise QlibException("MultiPassPortAnaRecord requires the passed in strategy to be a dict")
+        if "signal" not in self.original_strategy.get("kwargs", {}):
+            raise QlibException("MultiPassPortAnaRecord requires the passed in strategy to have signal as a parameter")
+
+    def random_init(self):
+        pred_df = self.load("pred.pkl")
+
+        all_pred_dates = pred_df.index.get_level_values("datetime")
+        bt_start_date = pd.to_datetime(self.backtest_config.get("start_time"))
+        if bt_start_date is None:
+            first_bt_pred_date = all_pred_dates.min()
+        else:
+            first_bt_pred_date = all_pred_dates[all_pred_dates >= bt_start_date].min()
+
+        # Shuffle the first backtest date's pred score
+        first_date_score = pred_df.loc[first_bt_pred_date]["score"]
+        np.random.shuffle(first_date_score.values)
+
+        # Use shuffled signal as the strategy signal
+        self.strategy_config = deepcopy_basic_type(self.original_strategy)
+        self.strategy_config["kwargs"]["signal"] = pred_df
+
+    def _generate(self, **kwargs):
+        risk_analysis_df_map = {}
+
+        # Collect each frequency's analysis df as df list
+        for i in trange(self.pass_num):
+            if self.shuffle_init_score:
+                self.random_init()
+
+            # Do not check the cache file list
+            single_run_artifacts = super()._generate(**kwargs)
+
+            for _analysis_freq in self.risk_analysis_freq:
+                risk_analysis_df_list = risk_analysis_df_map.get(_analysis_freq, [])
+                risk_analysis_df_map[_analysis_freq] = risk_analysis_df_list
+
+                analysis_df = single_run_artifacts[f"port_analysis_{_analysis_freq}.pkl"]
+
analysis_df["run_id"] = i + risk_analysis_df_list.append(analysis_df) + + result_artifacts = {} + # Concat df list + for _analysis_freq in self.risk_analysis_freq: + combined_df = pd.concat(risk_analysis_df_map[_analysis_freq]) + + # Calculate return and information ratio's mean, std and mean/std + multi_pass_port_analysis_df = combined_df.groupby(level=[0, 1]).apply( + lambda x: pd.Series( + {"mean": x["risk"].mean(), "std": x["risk"].std(), "mean_std": x["risk"].mean() / x["risk"].std()} + ) + ) + + # Only look at "annualized_return" and "information_ratio" + multi_pass_port_analysis_df = multi_pass_port_analysis_df.loc[ + (slice(None), ["annualized_return", "information_ratio"]), : + ] + pprint(multi_pass_port_analysis_df) + + # Save new df + result_artifacts.update({f"multi_pass_port_analysis_{_analysis_freq}.pkl": multi_pass_port_analysis_df}) + + # Log metrics + metrics = flatten_dict( + { + "mean": multi_pass_port_analysis_df["mean"].unstack().T.to_dict(), + "std": multi_pass_port_analysis_df["std"].unstack().T.to_dict(), + "mean_std": multi_pass_port_analysis_df["mean_std"].unstack().T.to_dict(), + } + ) + self.recorder.log_metrics(**metrics) + return result_artifacts + + def list(self): + list_path = [] + for _analysis_freq in self.risk_analysis_freq: + if _analysis_freq in self.all_freq: + list_path.append(f"multi_pass_port_analysis_{_analysis_freq}.pkl") + else: + warnings.warn(f"risk_analysis freq {_analysis_freq} is not found") + return list_path From e9fbb4fdce1c08c034dd67530add567f10ceda6a Mon Sep 17 00:00:00 2001 From: Di Date: Fri, 18 Aug 2023 17:41:02 +0800 Subject: [PATCH 30/37] Add exploration noise to rl training collector (#1481) * Update vessel.py Add exploration_noise=True to training collector * Update vessel.py Reformat --- qlib/rl/trainer/vessel.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/qlib/rl/trainer/vessel.py b/qlib/rl/trainer/vessel.py index 6cd2eb3e91..b7912b488b 100644 --- a/qlib/rl/trainer/vessel.py +++ b/qlib/rl/trainer/vessel.py @@ -168,7 +168,9 @@ def train(self, vector_env: FiniteVectorEnv) -> Dict[str, Any]: self.policy.train() with vector_env.collector_guard(): - collector = Collector(self.policy, vector_env, VectorReplayBuffer(self.buffer_size, len(vector_env))) + collector = Collector( + self.policy, vector_env, VectorReplayBuffer(self.buffer_size, len(vector_env)), exploration_noise=True + ) # Number of episodes collected in each training iteration can be overridden by fast dev run. 
if self.trainer.fast_dev_run is not None: From 10e27d56cf85fb12454427bd87e7965ab520260f Mon Sep 17 00:00:00 2001 From: Fivele-Li <128388363+Fivele-Li@users.noreply.github.com> Date: Thu, 24 Aug 2023 21:24:50 +0800 Subject: [PATCH 31/37] Troubleshooting pip version issues in CI (#1504) * CI failed to run on 23.1 and 23.1.1 * add pyproject.toml * upgrade pip in slow.yml * upgrade build-system requires * troubleshooting pytest problem * troubleshooting pytest problem * troubleshooting pytest problem * troubleshooting pytest problem * add qlib root path to python sys.path * add qlib root path to $PYTHONPATH * add qlib root path to $PYTHONPATH * add qlib root path to $PYTHONPATH * modify pytest root; * remove set env * change_pytest_command_CI * change_pytest_command_CI * fix_ci * fix_ci * fix_ci * fix_ci * fix_ci * fix_ci * fix_ci * remove_toml * recover_toml --------- Co-authored-by: lijinhui <362237642@qq.com> Co-authored-by: linlang --- .github/workflows/test_qlib_from_source.yml | 4 +--- .github/workflows/test_qlib_from_source_slow.yml | 4 +--- pyproject.toml | 2 ++ tests/test_pit.py | 4 +++- 4 files changed, 7 insertions(+), 7 deletions(-) create mode 100644 pyproject.toml diff --git a/.github/workflows/test_qlib_from_source.yml b/.github/workflows/test_qlib_from_source.yml index 7271287dcb..acf37208fd 100644 --- a/.github/workflows/test_qlib_from_source.yml +++ b/.github/workflows/test_qlib_from_source.yml @@ -38,10 +38,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Update pip to the latest version - # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. - # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0 + python -m pip install --upgrade pip - name: Installing pytorch for macos if: ${{ matrix.os == 'macos-11' || matrix.os == 'macos-latest' }} diff --git a/.github/workflows/test_qlib_from_source_slow.yml b/.github/workflows/test_qlib_from_source_slow.yml index 1dfcc0179c..caab6f444e 100644 --- a/.github/workflows/test_qlib_from_source_slow.yml +++ b/.github/workflows/test_qlib_from_source_slow.yml @@ -38,10 +38,8 @@ jobs: python-version: ${{ matrix.python-version }} - name: Set up Python tools - # pip release version 23.1 on Apr.15 2023, CI failed to run, Please refer to #1495 ofr detailed logs. 
- # The pip version has been temporarily fixed to 23.0 run: | - python -m pip install pip==23.0 + python -m pip install --upgrade pip pip install --upgrade cython numpy pip install -e .[dev] diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000..6350d092c7 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,2 @@ +[build-system] +requires = ["setuptools", "numpy", "Cython"] diff --git a/tests/test_pit.py b/tests/test_pit.py index 329413eadd..8320e1d361 100644 --- a/tests/test_pit.py +++ b/tests/test_pit.py @@ -13,7 +13,9 @@ from qlib.data import D from qlib.tests.data import GetData -from scripts.dump_pit import DumpPitData + +sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts"))) +from dump_pit import DumpPitData sys.path.append(str(Path(__file__).resolve().parent.parent.joinpath("scripts/data_collector/pit"))) from collector import Run From b300af7581324c3a010e8cdba59e3ca0f6a0a0f9 Mon Sep 17 00:00:00 2001 From: Fivele-Li <128388363+Fivele-Li@users.noreply.github.com> Date: Fri, 1 Sep 2023 18:12:49 +0800 Subject: [PATCH 32/37] suppress the SettingWithCopyWarning of pandas (#1513) * df value is set as expected, suppress the warning; * depress warning with pandas option_context --------- Co-authored-by: Cadenza-Li <362237642@qq.com> --- qlib/data/dataset/processor.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/qlib/data/dataset/processor.py b/qlib/data/dataset/processor.py index 63acd937e6..714693d181 100644 --- a/qlib/data/dataset/processor.py +++ b/qlib/data/dataset/processor.py @@ -318,9 +318,13 @@ def __call__(self, df): # try not modify original dataframe if not isinstance(self.fields_group, list): self.fields_group = [self.fields_group] - for g in self.fields_group: - cols = get_group_columns(df, g) - df[cols] = df[cols].groupby("datetime", group_keys=False).apply(self.zscore_func) + # depress warning by references: + # https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas + # https://pandas.pydata.org/pandas-docs/stable/user_guide/options.html#getting-and-setting-options + with pd.option_context("mode.chained_assignment", None): + for g in self.fields_group: + cols = get_group_columns(df, g) + df[cols] = df[cols].groupby("datetime", group_keys=False).apply(self.zscore_func) return df From 8e446aafe88a9358369c5b71ffad904f475c357d Mon Sep 17 00:00:00 2001 From: zhuan <47859523+kimzhuan@users.noreply.github.com> Date: Fri, 15 Sep 2023 17:18:04 +0800 Subject: [PATCH 33/37] Update requirements.txt (#1521) --- scripts/data_collector/br_index/requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/data_collector/br_index/requirements.txt b/scripts/data_collector/br_index/requirements.txt index c77e932879..e0ad5e8be8 100644 --- a/scripts/data_collector/br_index/requirements.txt +++ b/scripts/data_collector/br_index/requirements.txt @@ -1,6 +1,6 @@ async-generator==1.10 attrs==21.4.0 -certifi==2021.10.8 +certifi==2022.12.7 cffi==1.15.0 charset-normalizer==2.0.12 cryptography==36.0.1 @@ -8,7 +8,7 @@ fire==0.4.0 h11==0.13.0 idna==3.3 loguru==0.6.0 -lxml==4.8.0 +lxml==4.9.1 multitasking==0.0.10 numpy==1.22.2 outcome==1.1.0 From 8bcf09ea5e28a8ba0574573517efe1e3297ce149 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Fri, 7 Jul 2023 13:38:11 +0800 Subject: [PATCH 34/37] pred current is confusing --- qlib/contrib/strategy/cost_control.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qlib/contrib/strategy/cost_control.py 
b/qlib/contrib/strategy/cost_control.py index 326e29652b..3a2bce84da 100644 --- a/qlib/contrib/strategy/cost_control.py +++ b/qlib/contrib/strategy/cost_control.py @@ -5,6 +5,7 @@ """ +import pandas as pd from .order_generator import OrderGenWInteract from .signal_strategy import WeightStrategyBase import copy @@ -66,6 +67,8 @@ def generate_target_weight_position(self, score, current, trade_start_time, trad # TODO: # If the current stock list is more than topk(eg. The weights are modified # by risk control), the weight will not be handled correctly. + if isinstance(score, pd.DataFrame): + score = score.iloc[:, 0] buy_signal_stocks = set(score.sort_values(ascending=False).iloc[: self.topk].index) cur_stock_weight = current.get_stock_weight_dict(only_stock=True) From 97c6799f37fa25f8b42fc642e83258da040e5c0b Mon Sep 17 00:00:00 2001 From: John Lyu Date: Wed, 19 Jul 2023 14:47:54 +0800 Subject: [PATCH 35/37] add build system requirements --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6350d092c7..63b87a8a5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,2 @@ [build-system] -requires = ["setuptools", "numpy", "Cython"] +requires = ["setuptools", "cython", "numpy"] \ No newline at end of file From 265fdc9166c5f7a7ce7cc551f553f904e6cf83f3 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 20 Jul 2023 14:51:27 +0800 Subject: [PATCH 36/37] add pos and neg operator --- qlib/data/base.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/qlib/data/base.py b/qlib/data/base.py index 496ae38ee2..c1c71a3350 100644 --- a/qlib/data/base.py +++ b/qlib/data/base.py @@ -138,6 +138,12 @@ def __ror__(self, other): from .ops import Or # pylint: disable=C0415 return Or(other, self) + + def __pos__(self): + return self + + def __neg__(self): + return 0 - self def load(self, instrument, start_index, end_index, *args): """load feature From f3ce11ae4bcc636c7a2616d614f9c750d3fe03f5 Mon Sep 17 00:00:00 2001 From: John Lyu Date: Thu, 20 Jul 2023 16:04:36 +0800 Subject: [PATCH 37/37] fix stock is delisted --- qlib/backtest/exchange.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/qlib/backtest/exchange.py b/qlib/backtest/exchange.py index 1ab0d07a75..a8a117f9e5 100644 --- a/qlib/backtest/exchange.py +++ b/qlib/backtest/exchange.py @@ -511,6 +511,9 @@ def get_deal_price( self.logger.warning(f"(stock_id:{stock_id}, trade_time:{(start_time, end_time)}, {pstr}): {deal_price}!!!") self.logger.warning(f"setting deal_price to close price") deal_price = self.get_close(stock_id, start_time, end_time, method) + # if stock is delisted, the deal_price(close) will be None,set to 0 + if deal_price is None or np.isnan(deal_price): + deal_price = 0.0 return deal_price def get_factor(