diff --git a/README.md b/README.md index 9739f650dc2bdf3ba197d9eaa285ad028f7dfe5e..09cf04420c184d412a17d25151e8a7964783a9e9 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,41 @@ # datascout Simple package to handle data saving and reading with minimum required libraries. -Mainly used as dependance of `pyjapcscout`, but it can be used for other purposes as well. +Mainly used as dependency of `pyjapcscout`, but it can be used for other purposes as well (maybe). ## Purpose of this project -The idea is to provide a few sweet functions to go from a nested `dict` of `numpy` arrays to `parquet` (and to `pickle`, and `json`) and come back **preserving** the data types. The aspect related to data types preservation is important for the roud-trip of meachine parameter reading, saving and settings. +The idea is to provide a few sweet functions to go from a nested `dict` of `numpy` arrays to `parquet` (and to `pickle`, and `json`) and come back **preserving** the data types (but for `json`, for which no coming back is implemented here!). The aspect related to data types preservation is important for the round-trip of machine parameter reading, saving and settings. +This package is meant to be simple enough and with very few dependencies to allow for *home* data analysis without the need for *CERN TN Network* or *Java* libraries. ## Getting started +First you need to install this package in your (virtual) environment. Presently, the suggested way is to go for local folder installation: +``` +git clone https://gitlab.cern.ch/abpcomputing/sandbox/datascout.git datascout +cd datascout +python -m pip install -e . +``` +so that one can easily update the package following its development by doing a simple `git pull` within the `datascout` folder created above. + +This package provides the following (main) functions. 
Note that many of those functions are simple wrappers of external functions (from `pandas`, `pyarrow`, `awkward`), but sometimes with some tweaks to make sure data type/shape is somewhat always preserved. + +- `dict_to_pandas(input_dict)`: Creates a `pandas` dataframe from a (list of) `dict`. +- `dict_to_awkward(input_dict)`: Creates an `awkward` array from a (list of) `dict`. +- `dict_to_parquet(input_dict, filename)`: Saves a (list of) `dict` into a `parquet` file. **In order to do so, 2D arrays are split in 1D arrays of 1D arrays.** +- `dict_to_pickle(input_dict, filename)`: Saves a (list of) `dict` into a `pickle` file. +- `dict_to_json(input_dict, filename)`: Saves a (list of) `dict` into a `json` file. +- `json_to_pandas(filename)`: It loads from a `json` file a `pandas` dataframe. This function is not so interesting (because data types/shapes are not preserved), but provided for convenience. +- `pandas_to_dict(input_pandas)`: It converts back a `pandas` dataframe into a (list of) `dict`. +- `awkward_to_dict(input_awkward)`: It converts back an `awkward` array into a (list of) `dict`. **In order to preserve data type/shape, it re-merges 1D arrays of 1D arrays into 2D arrays.** +- `parquet_to_dict(filename)`: Loads a (list of) `dict` from a `parquet` file. **In order to preserve data type/shape, it re-merges 1D arrays of 1D arrays into 2D arrays.** +- `pickle_to_dict(filename)`: Loads a (list of) `dict` from a `pickle` file. +- `pandas_to_awkward(input_pandas)`: It creates an `awkward` array starting from a `pandas` dataframe. +- `awkward_to_pandas(input_awkward)`: It creates a `pandas` dataframe starting from an `awkward` array. +- `parquet_to_pandas(filename)`: It loads a `parquet` file into a `pandas` dataframe. 
**Instead of using the method provided by `pandas` (which does not preserve single value types and 2D arrays), it first loads the parquet as `dict`, and then converts it into a `pandas` dataframe.** +- `parquet_to_awkward(filename)`: It loads a `parquet` file into an `awkward` array. +- `save_dict(dictData, folderPath = None, filename = None, fileFormat='parquet')`: Additional wrapper of a few functions above to easily save a `dict` to a file using a supported format (`parquet`, `json` and `pickle` for the time being) +- `load_dict(filename, fileFormat='parquet')`: It reads a file assuming a given format and returns its content as a `dict` (which can then be converted to other formats...) ## How to develop it: diff --git a/datascout/__init__.py b/datascout/__init__.py index 3711ef45840815fa7bf84a3ef5167c95031a0dde..a5771edad917d5009263d0b989d58d28637c7b52 100644 --- a/datascout/__init__.py +++ b/datascout/__init__.py @@ -41,5 +41,3 @@ from ._datascout import parquet_to_awkward # other hidden functions that could be useful for debug from ._datascout import _find_lists from ._datascout import _compare_data - - diff --git a/datascout/_datascout.py b/datascout/_datascout.py index 1fb3335caa004ddff853c1dd28ea3f418cf50ce2..cfe39ece8e41e0fda372951b0675cf05513da893 100644 --- a/datascout/_datascout.py +++ b/datascout/_datascout.py @@ -12,6 +12,8 @@ import pickle import datetime import copy import os +from pathlib import Path + ###### # Functions needed to split 2D arrays @@ -287,7 +289,10 @@ def dict_to_json(input_dict, filename): ''' Function provided for convenience, but not of interest for typical use case... 
''' - dict_to_pandas(input_dict).to_json(filename+'.json') + name, ext = os.path.splitext(filename) + if len(ext) == 0: + filename = filename+'.json' + dict_to_pandas(input_dict).to_json(filename) def json_to_pandas(filename): ''' @@ -337,6 +342,42 @@ def parquet_to_awkward(filename): return ak.from_parquet(filename) +####### Simple save/load functions for the user + +def _getFilename(): + return datetime.now().strftime("%Y.%m.%d.%H.%M.%S.%f") + +def save_dict(dictData, folderPath = None, filename = None, fileFormat='parquet'): + if filename == None: + filename = _getFilename() + Path(folderPath).mkdir(parents=True, exist_ok=True) + filename = os.path.join(folderPath, filename) + if fileFormat == 'parquet': + dict_to_parquet(dictData, filename+'.parquet') + elif fileFormat == 'json': + dict_to_json(dictData, filename+'.json') + elif (fileFormat == 'pickle') or (fileFormat == 'pickledict'): + dict_to_pickle(dictData, filename+'.pkl') + elif fileFormat == 'mat': + raise ValueError('MAT format not yet supported') + scipy.io.savemat(filename+'.mat', dictData) + else: + raise ValueError('Unknown file format') + +def load_dict(filename, fileFormat='parquet'): + if fileFormat == 'parquet': + return parquet_to_dict(filename) + elif (fileFormat == 'pickle') or (fileFormat == 'pickledict'): + return pickle_to_dict(filename) + elif fileFormat == 'json': + raise ValueError('JSON format not yet supported') + elif fileFormat == 'mat': + raise ValueError('MAT format not yet supported') + print('TODO: compatibility with MATLAB generated files?!') + return scipy.io.loadmat(filename) + else: + raise ValueError('Unknown file format ({})'.format(fileFormat)) + ####### Some additional functions for debugging purposes