diff --git a/.gitignore b/.gitignore index 5ceb3864c2911029f0a6010fadab352e4b8e2d07..a0f847a182dc3a722c3b5d9735717c507be316d3 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,12 @@ venv +__pycache__ +.mypy_cache +*.pyc +.ipynb_checkpoints +/build +/datascout.egg-info +datascout/_version.py +*.swp +*~ +.cache +docs/build diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000000000000000000000000000000000000..196226fc22e2d1e45e4e16fc917af743f8d37660 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,19 @@ +Copyright (c) CERN 2015-2020 + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md index 38d49955479f065493e1fc03f1995a6ad86b436c..ad29b4d6fc7c4a64ded5dbc0ac6ca5f7d7b42612 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,7 @@ python -m pip install -e . ``` acc-py init-docs ``` +then populate the files under the `docs` folder with the desired content. 7.
Check code style using `black`: ``` diff --git a/datascout/__init__.py b/datascout/__init__.py index 31d105bd191cb5f1611657a89e481ca3e07e697c..d42cf3d84788dd063e6693afd7e4087d4552d59a 100644 --- a/datascout/__init__.py +++ b/datascout/__init__.py @@ -1,9 +1,10 @@ """ -list of sweet functions for data conversion and writing to disk +Sweet functions for data conversion and data writing to disk """ +# Note: the text above goes into the API documentation -__version__ = "0.0.3" +from ._version import version as __version__ # to look at pyarrow, typically not used by a user, @@ -45,6 +46,6 @@ from ._datascout import _compare_data # saving from ._datascout import save_dict -# function to address different files schemas +# function to address different file schemas from ._datascout import split_schemas from ._datascout import compare_schema diff --git a/datascout/_datascout.py b/datascout/_datascout.py index 6a4f2c94f0f68c497d380bf1f63b0333199c716c..68d1843c20760d7dd5b6a5ed5734a8b6484a548a 100644 --- a/datascout/_datascout.py +++ b/datascout/_datascout.py @@ -323,16 +323,39 @@ def pyarrow_to_pandas(input_pa): def dict_to_pandas(input_dict): + """Convert a dictionary or list of dictionaries into a pandas DataFrame + + Args: + input_dict (dict): the dictionary to convert + + Returns: + (DataFrame): the data converted as a pandas DataFrame + """ if not isinstance(input_dict, list): input_dict = [input_dict] return pd.DataFrame(input_dict) def dict_to_awkward(input_dict): + """Convert a dictionary or list of dictionaries into an Awkward Array + + Args: + input_dict (dict): the dictionary to convert + + Returns: + (Awkward): the data converted as an Awkward Array + """ return ak.from_arrow(dict_to_pyarrow(input_dict)) def dict_to_parquet(input_dict, filename): + """Save a given dictionary into a parquet file + + Args: + input_dict (dict): the dictionary to save + filename (string): the file name (with its path) + """ + # we could also just go to pandas, and then to parquet. # dict_to_pandas(input_dict).to_parquet(filename) name, ext = os.path.splitext(filename) @@ -342,6 +365,15 @@ def dict_to_pickle(input_dict, filename): + """Save a given dictionary into a pickle file + + **WARNING**: Note that pickle is not the preferred way of storing data! + + Args: + input_dict (dict): the dictionary to save + filename (string): the file name (with its path) + """ + name, ext = os.path.splitext(filename) if len(ext) == 0: filename = filename + ".pkl" @@ -350,8 +382,13 @@ def dict_to_json(input_dict, filename): - """ - Function provided for convenience, but not of interest for typical use case... + """Save a given dictionary into a JSON file + + Function provided for convenience, but not of interest for the typical use case. + + Args: + input_dict (dict): the dictionary to save + filename (string): the file name (with its path) """ name, ext = os.path.splitext(filename) if len(ext) == 0:
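For context, the converters documented in the hunks above chain together as follows. This is an illustrative sketch only: the device name and payload are invented, and it assumes these helpers are re-exported at package level, as the `__init__.py` import block above suggests.

```python
# Hedged sketch: "my_device/Acquisition" and the sample values are made up.
import numpy as np
import datascout as ds

data = {"my_device/Acquisition": {"value": np.array([1.0, 2.0, 3.0])}}

df = ds.dict_to_pandas(data)        # single dict -> one-row DataFrame
arr = ds.dict_to_awkward(data)      # goes through pyarrow, as dict_to_awkward() shows
ds.dict_to_parquet(data, "sample")  # an extension is appended when none is given,
                                    # mirroring the splitext logic in the hunk above
```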
+ """Loads a JSON file as Pandas DataFrame + + Args: + filename (string): the file name (with its path) + + Returns: + (DataFrame): the loaded file as Pandas DataFrame """ return pd.read_json(filename) def pandas_to_dict(input_pandas): + """Convert a Pandas DataFrame into a pyjapcscout-like dict or list of dicts in case of many records + + Args: + input_pandas (DataFrame): the pandas DataFrame to convert + + Returns: + (dict or list): the data converted as dict or list of dicts """ - it converts a pandas dataframe into a pyjapcscout-like dict - (or list of dicts in case of many records) - """ + output = input_pandas.to_dict("records") if len(output) == 1: output = output[0] @@ -378,40 +425,89 @@ def pandas_to_dict(input_pandas): def awkward_to_dict(input_awkward): - """ - it converts the specified row of an awkward array into a pyjapcscout-like dict + """Convert a Awkward Array data into a pyjapcscout-like dict or list of dicts in case of many records + + Args: + input_awkward (Awkward): the Awkward Array to convert + + Returns: + (dict or list): the data converted as dict or list of dicts """ return _convert_parrow_data(ak.to_arrow(input_awkward)) def pickle_to_dict(filename): + """Loads a pickle from file into a pyjapcscout-like dict + + Args: + filename (string): the file name (with its path) + + Returns: + (dict): the data loaded as dict + """ with open(filename, "rb") as handle: load_dict = pickle.load(handle) return load_dict def parquet_to_dict(filename): + """Loads a parquet file into a pyjapcscout-like dict + + Args: + filename (string): the file name (with its path) + + Returns: + (dict): the data loaded as dict + """ return pyarrow_to_dict(parquet_to_pyarrow(filename)) # between pandas and awkward def pandas_to_awkward(input_pandas): + """Convert a pandas DataFrame into a Awkward Array + + Args: + input_pandas (DataFrame): the pandas DataFrame to convert + + Returns: + (Awkward): the data converted as Awkward Array + """ return dict_to_awkward(pandas_to_dict(input_pandas)) def awkward_to_pandas(input_awkward): + """Convert a pandas DataFrame into a Awkward Array + + Args: + input_awkward (DataFrame): the Awkward Array to convert + + Returns: + (DataFrame): the data converted as pandas DataFrame + """ return dict_to_pandas(awkward_to_dict) -# reading from parquet to pandas without type loss def parquet_to_pandas(filename): + """ It reads a **single(?)** parquet file into a pandas DataFrame with no data type loss + + Args: + filename (string): the file name (with its path) + + Returns: + (DataFrame): the data loaded as pandas DataFrame """ - It reads a **single** parquet into a pandas dataframe with no data type loss - """ - return dict_to_pandas(parquet_to_dict(filename)) + return dict_to_pandas(parquet_to_dict(filename)) def parquet_to_awkward(filename): + """ It reads a **single(?)** parquet file into a Awkward Array with no data type loss + + Args: + filename (string): the file name (with its path) + + Returns: + (Awkward): the data loaded as Awkward Array + """ return ak.from_parquet(filename) @@ -421,6 +517,15 @@ def parquet_to_awkward(filename): def save_dict(dictData, folderPath=None, filename=None, fileFormat="parquet"): + """Save data stored in a dictionary to file + + Args: + dictData (dict): the dictionary to store + folderPath(str): the folder/path where to store data (default=None i.e. stores in the active path) + filename(str): the file name (default=None, i.e. 
@@ -421,6 +517,15 @@ def parquet_to_awkward(filename): def save_dict(dictData, folderPath=None, filename=None, fileFormat="parquet"): + """Save data stored in a dictionary to file + + Args: + dictData (dict): the dictionary to store + folderPath (str): the folder/path where to store data (default=None, i.e. stores in the active path) + filename (str): the file name (default=None, i.e. it uses local time as file name) + fileFormat (str): file format in ['parquet', 'json', 'pickle'] (default="parquet") + + """ if filename is None: filename = datetime.now().strftime("%Y.%m.%d.%H.%M.%S.%f") Path(folderPath).mkdir(parents=True, exist_ok=True) @@ -439,6 +544,15 @@ def load_dict(filename, fileFormat="parquet"): + """Load data from a file into a pyjapcscout-like dictionary + + Args: + filename (string): the file name (with its path) + fileFormat (str): file format in ['parquet', 'json', 'pickle'] (default="parquet") + + Returns: + (dict): the data loaded as pyjapcscout-like dictionary + """ if fileFormat == "parquet": return parquet_to_dict(filename) elif (fileFormat == "pickle") or (fileFormat == "pickledict"): @@ -461,6 +575,7 @@ def _find_lists(data, verbose=False): """ Look inside data (assumed to be a dict) and tell if some fields are actually lists. + In theory, the `datascout` package is meant to be used only on dicts that do NOT contain any list! """ for key, value in data.items(): @@ -478,7 +593,8 @@ def _compare_data(data1, data2, use_assert=False): """ Compares two dictionaries or lists and show the differences (of type or data type). - For a full comparison, it is sometimes best to call this function also with inverted + + For a full comparison, it is sometimes best to call this function also with inverted inputs. """ def not_equal(a, b): @@ -515,7 +631,7 @@ def _compare_schema(a, reference_file = None, verbose = False): """ Compare the schema of a list of file **a**. The default reference file in the first one of the list **a**. - It is possible to defien a different referenve file usint the **reference_file**. + It is possible to define a different reference file using the **reference_file**. """ my_list = [] if len(a) > 0: @@ -527,7 +643,7 @@ if not ref_schema==pq.read_schema(ii): if verbose: - print (f'\n ****** {ii} has a different schema! ****** ') + print(f'\n ****** {ii} has a different schema! ****** ') print('This is the difference\n === Schema of the reference file ===') print(set(ref_schema)-set(pq.read_schema(ii))) print('This is the difference\n === Schema of the compared file ===') @@ -540,7 +656,16 @@ def split_schemas(list_of_files): """ - Split a list of files by schemas. + Split a list of parquet files by their data schema. + + This function is convenient when one needs to extract from many parquet files a subset + that follows the same schema, i.e. that can all be loaded at once in a pandas DataFrame. + + Args: + list_of_files (list): the file names (with their paths) to be analysed + + Returns: + (list): list of lists. The first list includes the files for which no schema was found. """ aux=_compare_schema(list_of_files, reference_file = None, verbose = False) if len(aux)==0:
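The `save_dict`/`load_dict` pair documented above is the typical entry point. A minimal sketch under the same re-export assumption; the folder, file name, and payload are invented, and the joining of `folderPath` and `filename` is inferred from the `mkdir` call shown in the hunk:

```python
import numpy as np
import datascout as ds

data = {"device/Property": {"value": np.arange(5)}}

# Store as parquet; with filename=None a local-time timestamp is used instead.
ds.save_dict(data, folderPath="./scout_data", filename="test", fileFormat="parquet")

# Read it back as a pyjapcscout-like dict (path assumed to be folderPath/filename).
loaded = ds.load_dict("./scout_data/test.parquet", fileFormat="parquet")
```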
@@ -551,6 +676,17 @@ def compare_schema(folder, reference_file = None, verbose = False): """ Given a **folder**, it compares all parquet files in it. + + This function is convenient when one needs to extract from many parquet files a subset + that follows the same schema, i.e. that can all be loaded at once in a pandas DataFrame. + + Args: + folder (str): the folder name + reference_file (str): file from which to take the reference schema (default=None, i.e. it uses the first file as reference) + verbose (bool): if True, print the differences between schemas (default=False) + + Returns: + (list): the list of files with schema equal to the one of **reference_file** """ return _compare_schema(sorted(glob.glob(folder+'/**/*.parquet', recursive=True)), reference_file, verbose) diff --git a/docs/source/index.rst b/docs/source/index.rst index cc17c1929f8a12504e170563b3760f233dadc37d..0feb359fd9a7ed064d4385f21ccf82d7dc1ed24b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -4,7 +4,10 @@ datascout Introduction ------------ -SHORT DESCRIPTION OF PROJECT +Datascout is a simple package to handle data saving and reading with a minimum of required libraries. +Its main purpose is to serve as a dependency of `pyjapcscout` in the control room, storing data +acquired from the control system as parquet files, and then on the user's "GPN" computer for data +analysis without the need for Java or other dependencies required to interact with the control system. Installation ------------ @@ -16,6 +19,12 @@ Using the `acc-py Python package index pip install datascout +If you prefer to install your own local development version, you can simply clone the repository:: + + git clone https://gitlab.cern.ch/abpcomputing/sandbox/datascout.git datascout + cd datascout + python -m pip install -e . + Documentation contents ---------------------- diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..53222a3bcd81e1fbb73e744a3f8415964cc49fcb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.5"] + +# Enable automatic versioning from git. +[tool.setuptools_scm] +write_to = "datascout/_version.py" +local_scheme = "no-local-version"
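With versioning now driven by setuptools_scm, the hard-coded `__version__` disappears; a quick sanity check after an editable install might look like this (the printed value is only indicative, e.g. for a checkout one commit past a hypothetical `v0.0.3` tag):

```python
# datascout/_version.py is generated at build time by setuptools_scm and
# re-exported by datascout/__init__.py, so no version string is hard-coded.
import datascout

print(datascout.__version__)  # e.g. "0.0.4.dev1" (no local part, per no-local-version)
```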
"all": [req for reqs in REQUIREMENTS.values() for req in reqs], }, + )