Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Qlib highfreq doc & Update DatatSet Init Method #257

Merged
merged 8 commits into from
Feb 5, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
add docs & fix reinit of datatset
  • Loading branch information
bxdd committed Feb 3, 2021
commit afd4060c3247c7a6e0bda7d4cc54d4d80ed01e8e
28 changes: 28 additions & 0 deletions examples/highfreq/README.md
Original file line number Diff line number Diff line change
@@ -0,0 1,28 @@
# High-Frequency Dataset

This dataset is an example for RL high frequency trading.

## Get High-Frequency Data

Get high-frequency data by running the following command:
```bash
python workflow.py get_data
```

## Dump & Reload & Reinitialize the Dataset


The High-Frequency Dataset is implemented as `qlib.data.dataset.DatasetH` in `workflow.py`. `DatasetH` is a subclass of `qlib.utils.serial.Serializable`, which supports being dumped to or loaded from disk in `pickle` format.

### About Reinitialization

After reloading a `Dataset` from disk, `Qlib` also supports reinitializing the dataset. This means that users can reset some of the configuration of the `Dataset` or `DataHandler`, such as `instruments`, `start_time`, `end_time` and `segments`.

The example is given in `workflow.py`; users can run the code as follows.

### Run the Code

Run the example by running the following command:
```bash
python workflow.py dump_and_load_dataset
```
Empty file removed examples/highfreq/__init__.py
Empty file.
45 changes: 32 additions & 13 deletions examples/highfreq/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 9,7 @@
import pickle
import numpy as np
import pandas as pd
from qlib.config import HIGH_FREQ_CONFIG
from qlib.config import REG_CN, HIGH_FREQ_CONFIG
from qlib.contrib.model.gbdt import LGBModel
from qlib.contrib.data.handler import Alpha158
from qlib.contrib.strategy.strategy import TopkDropoutStrategy
Expand All @@ -26,7 26,6 @@

from highfreq_ops import get_calendar_day, DayLast, FFillNan, BFillNan, Date, Select, IsNull


class HighfreqWorkflow(object):

SPEC_CONF = {"custom_ops": [DayLast, FFillNan, BFillNan, Date, Select, IsNull], "expression_cache": None}
Expand Down Expand Up @@ -123,8 122,7 @@ def get_data(self):
backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
print(backtest_train, backtest_test)

del xtrain, xtest
del backtest_train, backtest_test
return

def dump_and_load_dataset(self):
"""dump and load dataset state on disk"""
Expand All @@ -146,18 144,39 @@ def dump_and_load_dataset(self):
dataset_backtest = pickle.load(file_dataset_backtest)

self._prepare_calender_cache()
##=============reload_dataset=============
dataset.init(init_type=DataHandlerLP.IT_LS)
dataset_backtest.init()
##=============reinit dataset=============
dataset.init(
handler_kwargs = {
"init_type" : DataHandlerLP.IT_LS,
"start_time" : "2021-01-19 00:00:00",
"end_time" : "2021-01-25 16:00:00",
},
segment_kwargs = {
"test": (
"2021-01-19 00:00:00",
"2021-01-25 16:00:00",
),
}
)
dataset_backtest.init(
handler_kwargs = {
"start_time" : "2021-01-19 00:00:00",
"end_time" : "2021-01-25 16:00:00",
},
segment_kwargs = {
"test": (
"2021-01-19 00:00:00",
"2021-01-25 16:00:00",
),
}
)

##=============get data=============
xtrain, xtest = dataset.prepare(["train", "test"])
backtest_train, backtest_test = dataset_backtest.prepare(["train", "test"])
xtest = dataset.prepare(["test"])
backtest_test = dataset_backtest.prepare(["test"])

print(xtrain, xtest)
print(backtest_train, backtest_test)
del xtrain, xtest
del backtest_train, backtest_test
print(xtest, backtest_test)
return


if __name__ == "__main__":
Expand Down
39 changes: 36 additions & 3 deletions qlib/data/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 87,42 @@ def __init__(self, handler: Union[dict, DataHandler], segments: dict):
"""
super().__init__(handler, segments)

def init(self, **kwargs):
"""Initialize the DatasetH, Only parameters belonging to handler.init will be passed in"""
self.handler.init(**kwargs)
def init(self, handler_kwargs: dict = None, segment_kwargs: dict = None):
    """
    Initialize the DatasetH

    Parameters
    ----------
    handler_kwargs : dict
        Config of DataHandler, which could include the following arguments:

        - arguments of DataHandler.conf_data, such as 'instruments', 'start_time' and 'end_time'.

        - arguments of DataHandler.init, such as 'enable_cache', etc.

    segment_kwargs : dict
        Config of segments which is same as 'segments' in DatasetH.setup_data

    Raises
    ------
    TypeError
        If a truthy `handler_kwargs` or `segment_kwargs` is not a dict.
    """
    if handler_kwargs:
        if not isinstance(handler_kwargs, dict):
            raise TypeError(f"param handler_kwargs must be type dict, not {type(handler_kwargs)}")

        # Keys listed here are routed to handler.conf_data; every other key
        # is forwarded to handler.init.
        conf_data_arg = {"instruments", "start_time", "end_time"}
        kwargs_conf_data = {k: v for k, v in handler_kwargs.items() if k in conf_data_arg}
        kwargs_init = {k: v for k, v in handler_kwargs.items() if k not in conf_data_arg}

        # conf_data is called before init, in the same order as before.
        self.handler.conf_data(**kwargs_conf_data)
        self.handler.init(**kwargs_init)

    if segment_kwargs:
        if not isinstance(segment_kwargs, dict):
            # Name the offending parameter correctly in the message.
            raise TypeError(f"param segment_kwargs must be type dict, not {type(segment_kwargs)}")
        # Shallow copy so later mutation of the caller's dict does not change self.segments.
        self.segments = segment_kwargs.copy()

def setup_data(self, handler: Union[dict, DataHandler], segments: dict):
"""
Expand Down