From e241d91143977abfb4529e3ed45ac61368eb5fc8 Mon Sep 17 00:00:00 2001
From: Davide Gamba <davide.gamba@cern.ch>
Date: Tue, 18 May 2021 23:58:43 +0200
Subject: [PATCH] added some quick readme

---
 README.md               | 31 +++++++++++++++++++++++++++--
 datascout/__init__.py   |  2 --
 datascout/_datascout.py | 43 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 9739f65..09cf044 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,41 @@
 # datascout
 
 Simple package to handle data saving and reading with minimum required libraries.
-Mainly used as dependance of `pyjapcscout`, but it can be used for other purposes as well.
+Mainly used as a dependency of `pyjapcscout`, but it can be used for other purposes as well (maybe).
 
 ## Purpose of this project
 
-The idea is to provide a few sweet functions to go from a nested `dict` of `numpy` arrays to `parquet` (and to `pickle`, and `json`) and come back **preserving** the data types. The aspect related to data types preservation is important for the roud-trip of meachine parameter reading, saving and settings.
+The idea is to provide a few sweet functions to go from a nested `dict` of `numpy` arrays to `parquet` (and to `pickle`, and `json`) and come back **preserving** the data types (but for `json`, for which no coming back is implemented here!). The aspect related to data types preservation is important for the round-trip of machine parameter reading, saving and settings.
+This package is meant to be simple enough and with very few dependencies to allow for *home* data analysis without the need of *CERN TN Network* or *Java* libraries. 
 
 ## Getting started
 
+First you need to install this package in your (virtual) environment. Presently, the suggested way is to go for local folder installation:
+```
+git clone https://gitlab.cern.ch/abpcomputing/sandbox/datascout.git datascout
+cd datascout
+python -m pip install -e .
+```
+so that one can easily update the package following its development by doing a simple `git pull` within the `datascout` folder created above.
+
+This package provides the following (main) functions. Note that many of those functions are simple wrappers of external functions (from `pandas`, `pyarrow`, `awkward`), but sometimes with some tweaks to make sure data type/shape is somewhat always preserved. 
+
+- `dict_to_pandas(input_dict)`: Creates a `pandas` dataframe from a (list of) `dict`. 
+- `dict_to_awkward(input_dict)`: Creates an `awkward` array from a (list of) `dict`. 
+- `dict_to_parquet(input_dict, filename)`: Saves a (list of) `dict` into a `parquet` file. **In order to do so, 2D arrays are split in 1D arrays of 1D arrays.**
+- `dict_to_pickle(input_dict, filename)`: Saves a (list of) `dict` into a `pickle` file. 
+- `dict_to_json(input_dict, filename)`: Saves a (list of) `dict` into a `json` file. 
+- `json_to_pandas(filename)`: It loads from a `json` file a `pandas` dataframe. This function is not so interesting (because data types/shapes are not preserved), but provided for convenience.
+- `pandas_to_dict(input_pandas)`: It converts back a `pandas` dataframe into a (list of) `dict`.
+- `awkward_to_dict(input_awkward)`: It converts back an `awkward` array into a (list of) `dict`. **In order to preserve data type/shape, it re-merges 1D arrays of 1D arrays into 2D arrays.**
+- `parquet_to_dict(filename)`: Loads a (list of) `dict` from a `parquet` file. **In order to preserve data type/shape, it re-merges 1D arrays of 1D arrays into 2D arrays.**
+- `pickle_to_dict(filename)`: Loads a (list of) `dict` from a `pickle` file.
+- `pandas_to_awkward(input_pandas)`: It creates an `awkward` array starting from a `pandas` dataframe.
+- `awkward_to_pandas(input_awkward)`: It creates a `pandas` dataframe starting from an `awkward` array.
+- `parquet_to_pandas(filename)`: It loads a `parquet` file into a `pandas` dataframe. **Instead of using the method provided by `pandas` (which does not preserve single value types and 2D arrays), it first loads the parquet as `dict`, and then converts it into a `pandas` dataframe.**
+- `parquet_to_awkward(filename)`: It loads a `parquet` file into an `awkward` array.
+- `save_dict(dictData, folderPath = None, filename = None, fileFormat='parquet')`: Additional wrapper of a few functions above to easily save a `dict` on a file using a supported format (`parquet` and `dict` for the time being)
+- `load_dict(filename, fileFormat='parquet')`: It reads a file assuming a given format and returns its content as a `dict` (which can be then converted to other formats...)
 
 
 ## How to develop it:
diff --git a/datascout/__init__.py b/datascout/__init__.py
index 3711ef4..a5771ed 100644
--- a/datascout/__init__.py
+++ b/datascout/__init__.py
@@ -41,5 +41,3 @@ from ._datascout import parquet_to_awkward
 # other hidden functions that could be useful for debug
 from ._datascout import _find_lists
 from ._datascout import _compare_data
-
-
diff --git a/datascout/_datascout.py b/datascout/_datascout.py
index 1fb3335..cfe39ec 100644
--- a/datascout/_datascout.py
+++ b/datascout/_datascout.py
@@ -12,6 +12,8 @@ import pickle
 import datetime
 import copy
 import os
+from pathlib import Path
+
 
 ######
 # Functions needed to split 2D arrays
@@ -287,7 +289,10 @@ def dict_to_json(input_dict, filename):
     '''
     Function provided for convenience, but not of interest for typical use case...
     '''
-    dict_to_pandas(input_dict).to_json(filename+'.json')
+    name, ext = os.path.splitext(filename)
+    if len(ext) == 0:
+        filename = filename+'.json'
+    dict_to_pandas(input_dict).to_json(filename)
 
 def json_to_pandas(filename):
     '''
@@ -337,6 +342,42 @@ def parquet_to_awkward(filename):
     return ak.from_parquet(filename)
 
 
+####### Simple save/load functions for the user
+
+def _getFilename():
+    return datetime.datetime.now().strftime("%Y.%m.%d.%H.%M.%S.%f")
+
+def save_dict(dictData, folderPath = None, filename = None, fileFormat='parquet'):
+    if filename == None:
+        filename = _getFilename()
+    Path(folderPath or '.').mkdir(parents=True, exist_ok=True)
+    filename = os.path.join(folderPath or '.', filename)
+    if fileFormat == 'parquet':
+        dict_to_parquet(dictData, filename+'.parquet')
+    elif fileFormat == 'json':
+        dict_to_json(dictData, filename+'.json')
+    elif (fileFormat == 'pickle') or (fileFormat == 'pickledict'):
+        dict_to_pickle(dictData, filename+'.pkl')
+    elif fileFormat == 'mat':
+        raise ValueError('MAT format not yet supported')
+        scipy.io.savemat(filename+'.mat', dictData)
+    else:
+        raise ValueError('Unknown file format')
+
+def load_dict(filename, fileFormat='parquet'):
+    if fileFormat == 'parquet':
+        return parquet_to_dict(filename)
+    elif (fileFormat == 'pickle') or (fileFormat == 'pickledict'):
+        return pickle_to_dict(filename)
+    elif fileFormat == 'json':
+        raise ValueError('JSON format not yet supported')
+    elif fileFormat == 'mat':
+        raise ValueError('MAT format not yet supported')
+        print('TODO: compatibility with MATLAB generated files?!')
+        return scipy.io.loadmat(filename)
+    else:
+        raise ValueError('Unknown file format ({})'.format(fileFormat))
+
 
 ####### Some additional functions for debugging purposes
 
-- 
GitLab