diff --git a/datascout/__init__.py b/datascout/__init__.py index a5771edad917d5009263d0b989d58d28637c7b52..442097aea1c9c412686b3eae228b657c138c2fed 100644 --- a/datascout/__init__.py +++ b/datascout/__init__.py @@ -6,8 +6,8 @@ list of sweet functions for data conversion and writing to disk __version__ = "0.0.1.beta0" -# to look at pyarrow, typically not used by a user, -# but key functions for this package +# to look at pyarrow, typically not used by a user, +# but key functions for this package from ._datascout import dict_to_pyarrow from ._datascout import pyarrow_to_parquet from ._datascout import parquet_to_pyarrow diff --git a/datascout/_datascout.py b/datascout/_datascout.py index cfe39ece8e41e0fda372951b0675cf05513da893..af28cae49e13d528220d4b05ff96f3e12548538d 100644 --- a/datascout/_datascout.py +++ b/datascout/_datascout.py @@ -18,57 +18,59 @@ from pathlib import Path ###### # Functions needed to split 2D arrays -def _split_2D_array(val, in_memory=False, split_to_list=False, verbose=False): - ''' - _split_2D_array(val, in_memory=False, split_to_list=False, verbose=False) - It converts numpy 2D arrays into either: - - 1D "object" arrays containing 1D val.dtype arrays (split_to_list=False) - - list of 1D val.dtype arrays (split_to_list=True) - by default, split_to_list=False +def _split_2D_array(val, in_memory=False, split_to_list=False, verbose=False): + """It converts numpy 2D arrays into either 1D arrays or list of 1D arrays - It returns the split value or the original value if the input was not + Args: + val (numpy.ndarray): the array to convert + in_memory (bool): data is not copied but just represented in a different form (default=False) + split_to_list (bool): data is split in a 1D list instead of 1D object array (default=False) + verbose (bool): print some information when data is split (default=False) - If in_memory == True (default=False), data is not copied but just represented in a different form - ''' + Returns: + the split value or the original value if the input was not of the right type or did not need to be split. + """ if (type(val) == np.ndarray) and len(np.shape(val)) == 2: if not in_memory: val = copy.deepcopy(val) - if verbose: print('made a copy of '+str(val)) + if verbose: + print("made a copy of " + str(val)) if split_to_list: newVal = list(val) else: # (TODO: probably to be done better!!!) auxDim = np.shape(val)[0] # split val, without making data copy - auxData = np.split(np.ravel(val), auxDim) - # put it in object array - newVal = np.empty((auxDim,), dtype=object) + auxData = np.split(np.ravel(val), auxDim) + # put it in object array + newVal = np.empty((auxDim,), dtype=object) for i in range(auxDim): newVal[i] = auxData[i] if verbose: - print(' ----- ') - print(str(val)+' ('+str(type(val))+')') - print(' -> converted to -> ') - print(str(newVal)+' ('+str(type(newVal))+')') + print(" ----- ") + print(str(val) + " (" + str(type(val)) + ")") + print(" -> converted to -> ") + print(str(newVal) + " (" + str(type(newVal)) + ")") return newVal else: return val + def _convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False): - ''' + """ Parse the input data, which should be a list or a dict, and convert all 2D arrays into either - 1D object array of 1D arrays - 1D list of 1D arrays - If in_memory=True (default False), it changes the data in memory. + If in_memory=True (default False), it changes the data in memory. In any case, the modified data is returned. - + NOTE: The conversion is done such to reduce to minimum (I think) data copy: i.e. 
the actual data is not copied (or copies are reduced to the minimum). It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion. - - ''' + + """ if in_memory == False: data = copy.copy(data) if type(data) == list: @@ -76,62 +78,74 @@ def _convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False if type(data[i]) == list or type(data[i]) == dict: data[i] = _convert_dict_list(data[i]) elif type(data[i]) == np.ndarray: - data[i] = _split_2D_array(data[i], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose) + data[i] = _split_2D_array( + data[i], + in_memory=in_memory, + split_to_list=split_to_list, + verbose=verbose, + ) elif type(data) == dict: for key in data.keys(): if type(data[key]) == list or type(data[key]) == dict: data[key] = _convert_dict_list(data[key]) elif type(data[key]) == np.ndarray: - data[key] = _split_2D_array(data[key], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose) + data[key] = _split_2D_array( + data[key], + in_memory=in_memory, + split_to_list=split_to_list, + verbose=verbose, + ) return data ###### # Functions needed to re-merge 1D arrays of 1D arrays into 2D arrays + def _merge_to_2D(val, string_as_obj=False, verbose=False): - ''' + """ _merge_to_2D(val, string_as_obj=False, verbose=False) It converts back numpy arrays of "object" dtype into 2D arrays. - By construction, if conversion actually occurs, this operation makes a copy of + By construction, if conversion actually occurs, this operation makes a copy of the data (probably with some exceptions) - - string_as_obj=False (default): - This options (if enabled) makes sure that the returned object is a 2D array of "object" + + string_as_obj=False (default): + This options (if enabled) makes sure that the returned object is a 2D array of "object" data type in case of string arrays. This is necessary in case you want to edit one string of the array without it being cut... - ''' + """ if ((type(val) == np.ndarray) and val.dtype == object) or (type(val) == list): newVal = np.stack(val) # fix subtle issue with strings which I am assuming are arrays of objects if string_as_obj and (newVal.dtype.type is np.str_): newVal = newVal.astype(object) if verbose: - print(' ----- ') - print(str(val)+' ('+str(type(val))+')') - print(' -> reverted to -> ') - print(str(newVal)+' ('+str(newVal.dtype)+')') + print(" ----- ") + print(str(val) + " (" + str(type(val)) + ")") + print(" -> reverted to -> ") + print(str(newVal) + " (" + str(newVal.dtype) + ")") return newVal else: return val + def _revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False): - ''' + """ Parse the input data, which should be a list or a dict, and convert all 1D arrays of "object" type - into 2D arrays of the proper data type. + into 2D arrays of the proper data type. If string_as_obj=True (default=False), the obtained 2D arrays of strings are converted into 2D arrays of "object" dtype. - If in_memory=True (default False), it changes the data in memory. + If in_memory=True (default False), it changes the data in memory. In any case, the modified data is returned. - + NOTE: The conversion is done such to reduce to minimum (I think) data copy: i.e. the actual data is not copied (or copies are reduced to the minimum). It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion. 
- - ''' + + """ if in_memory == False: data = copy.copy(data) @@ -140,7 +154,9 @@ def _revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False) if type(entry) == dict: _revert_dict_list(entry) elif type(entry) == list or type(entry) == np.ndarray: - entry = _merge_to_2D(entry, string_as_obj=string_as_obj, verbose=verbose) + entry = _merge_to_2D( + entry, string_as_obj=string_as_obj, verbose=verbose + ) if len(entry) > 0 and isinstance(entry.flatten()[0], dict): for nasted_data in entry.flatten(): _revert_dict_list(nasted_data) @@ -149,20 +165,25 @@ def _revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False) if type(data[key]) == dict: _revert_dict_list(data[key]) elif type(data[key]) == list or type(data[key]) == np.ndarray: - data[key] = _merge_to_2D(data[key], string_as_obj=string_as_obj, verbose=verbose) + data[key] = _merge_to_2D( + data[key], string_as_obj=string_as_obj, verbose=verbose + ) if len(data[key]) > 0 and isinstance(data[key].flatten()[0], dict): for nasted_data in data[key].flatten(): _revert_dict_list(nasted_data) return data + ###### # CORE function of this project: it allows to convert a pyarrow object into a dict # -def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_array=False): - ''' +def _convert_parrow_data( + data, treat_str_arrays_as_str=True, use_list_for_2D_array=False +): + """ _convert_parrow_data(data) - it extract data from a pyarrow object to a "standard" pyjapcscout-like dict dataset, + it extract data from a pyarrow object to a "standard" pyjapcscout-like dict dataset, i.e. a dictionary with only not null numpy objects/arrays and no lists (but if you enable use_list_for_2D_array) if treat_str_arrays_as_str (default=True) it will try to preserve str data type also for arrays @@ -171,18 +192,20 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr Typically the output should be a `dict`. If, however, one is trying to convert more complex structures like a pyarrow Table or StructArray, the output will be a list of dictionaries, if more than one data records are found. - ''' + """ if isinstance(data, pa.lib.Table): output = [] for irow in range(data.num_rows): outputRow = dict() for column in data.column_names: - #if len(data['device1']) -> probably to be done something like this... + # if len(data['device1']) -> probably to be done something like this... device_dict = dict() # those should be value, header, exception for item in data[column][irow].items(): # this can be iterated... 
I think - device_dict[item[0]] = _convert_parrow_data(item[1], treat_str_arrays_as_str, use_list_for_2D_array) + device_dict[item[0]] = _convert_parrow_data( + item[1], treat_str_arrays_as_str, use_list_for_2D_array + ) outputRow[column] = device_dict output.append(outputRow) if len(output) == 1: @@ -192,7 +215,11 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr elif isinstance(data, pa.StructArray): output = [] for row in data: - output.append(_convert_parrow_data(row, treat_str_arrays_as_str, use_list_for_2D_array)) + output.append( + _convert_parrow_data( + row, treat_str_arrays_as_str, use_list_for_2D_array + ) + ) if len(output) == 1: return output[0] else: @@ -200,12 +227,16 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr elif isinstance(data, pa.StructScalar): output = dict() for item in data.items(): - output[item[0]] = _convert_parrow_data(item[1], treat_str_arrays_as_str, use_list_for_2D_array) + output[item[0]] = _convert_parrow_data( + item[1], treat_str_arrays_as_str, use_list_for_2D_array + ) return output elif isinstance(data, pa.ListScalar): if isinstance(data.type.value_type, pa.lib.ListType): - aux_dtype= data.type.value_type.value_type.to_pandas_dtype() - if treat_str_arrays_as_str and data.type.value_type.value_type.equals(pa.string()): + aux_dtype = data.type.value_type.value_type.to_pandas_dtype() + if treat_str_arrays_as_str and data.type.value_type.value_type.equals( + pa.string() + ): # actually a string! not a generic object.... aux_dtype = np.str_ return np.array(data.as_py(), dtype=aux_dtype) @@ -214,12 +245,18 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr if use_list_for_2D_array: output = [] for auxValue in data.values: - output.append(_convert_parrow_data(auxValue), treat_str_arrays_as_str, use_list_for_2D_array) + output.append( + _convert_parrow_data(auxValue), + treat_str_arrays_as_str, + use_list_for_2D_array, + ) return output else: output = np.empty((len(data.values),), dtype=object) for i, auxValue in enumerate(data.values): - output[i] = _convert_parrow_data(auxValue, treat_str_arrays_as_str, use_list_for_2D_array) + output[i] = _convert_parrow_data( + auxValue, treat_str_arrays_as_str, use_list_for_2D_array + ) return output else: # could be a 1D array of some data type @@ -229,189 +266,218 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr aux_dtype = np.str_ return np.array(data.as_py(), dtype=aux_dtype) else: - print('Zzzuth...') + print("Zzzuth...") return data elif issubclass(type(data), pa.lib.Scalar): # horrible casting!... did not find a better way.... return data.type.to_pandas_dtype()(data.as_py()) else: - print('Sigh... unknown data type: '+str(type(data))) + print("Sigh... 
unknown data type: " + str(type(data))) return data ###### Some important functions not so interesting for the standard user, but fundamental + def dict_to_pyarrow(input_dict): - my_data_dict_converted = _convert_dict_list(input_dict, in_memory=False, split_to_list=False, verbose=False) + my_data_dict_converted = _convert_dict_list( + input_dict, in_memory=False, split_to_list=False, verbose=False + ) if not isinstance(my_data_dict_converted, list): my_data_dict_converted = [my_data_dict_converted] return pa.Table.from_pandas(pd.DataFrame(my_data_dict_converted)) + def pyarrow_to_parquet(input_pa, filename): pq.write_table(input_pa, filename) + def parquet_to_pyarrow(filename): return pq.read_table(filename) + def pyarrow_to_dict(input_pa): return _convert_parrow_data(input_pa) - + def pyarrow_to_pandas(input_pa): return dict_to_pandas(pyarrow_to_dict(input_pa)) + ####### The functions interesting for the user + def dict_to_pandas(input_dict): if not isinstance(input_dict, list): input_dict = [input_dict] return pd.DataFrame(input_dict) + def dict_to_awkward(input_dict): return ak.from_arrow(dict_to_pyarrow(input_dict)) + def dict_to_parquet(input_dict, filename): - # we could also just go to pandas, and then to parquet. + # we could also just go to pandas, and then to parquet. # dict_to_pandas(input_dict).to_parquet(filename) name, ext = os.path.splitext(filename) if len(ext) == 0: - filename = filename+'.parquet' + filename = filename + ".parquet" pyarrow_to_parquet(dict_to_pyarrow(input_dict), filename) + def dict_to_pickle(input_dict, filename): name, ext = os.path.splitext(filename) if len(ext) == 0: - filename = filename+'.pkl' - with open(filename, 'wb') as handle: + filename = filename + ".pkl" + with open(filename, "wb") as handle: pickle.dump(input_dict, handle, protocol=pickle.HIGHEST_PROTOCOL) + def dict_to_json(input_dict, filename): - ''' + """ Function provided for convenience, but not of interest for typical use case... - ''' + """ name, ext = os.path.splitext(filename) if len(ext) == 0: - filename = filename+'.json' + filename = filename + ".json" dict_to_pandas(input_dict).to_json(filename) + def json_to_pandas(filename): - ''' + """ Function provided for convenience, but not of interest for typical use case... 
- ''' + """ return df.read_json(filename) + def pandas_to_dict(input_pandas): - ''' + """ it converts a pandas dataframe into a pyjapcscout-like dict (or list of dicts in case of many records) - ''' - output = input_pandas.to_dict('records') + """ + output = input_pandas.to_dict("records") if len(output) == 1: output = output[0] return output + def awkward_to_dict(input_awkward): - ''' + """ it converts the specified row of an awkward array into a pyjapcscout-like dict - ''' + """ return _convert_parrow_data(ak.to_arrow(input_awkward)) + def pickle_to_dict(filename): - with open(filename, 'rb') as handle: + with open(filename, "rb") as handle: load_dict = pickle.load(handle) return load_dict + def parquet_to_dict(filename): return pyarrow_to_dict(parquet_to_pyarrow(filename)) + # between pandas and awkward def pandas_to_awkward(input_pandas): return dict_to_awkward(pandas_to_dict(input_pandas)) + def awkward_to_pandas(input_awkward): return dict_to_pandas(awkward_to_dict) + # reading from parquet to pandas without type loss def parquet_to_pandas(filename): - ''' + """ It reads a **single** parquet into a pandas dataframe with no data type loss - ''' + """ return dict_to_pandas(parquet_to_dict(filename)) + def parquet_to_awkward(filename): return ak.from_parquet(filename) ####### Simple save/load functions for the user + def _getFilename(): return datetime.now().strftime("%Y.%m.%d.%H.%M.%S.%f") -def save_dict(dictData, folderPath = None, filename = None, fileFormat='parquet'): + +def save_dict(dictData, folderPath=None, filename=None, fileFormat="parquet"): if filename == None: filename = _getFilename() Path(folderPath).mkdir(parents=True, exist_ok=True) filename = os.path.join(folderPath, filename) - if fileFormat == 'parquet': - dict_to_parquet(dictData, filename+'.parquet') - elif fileFormat == 'json': - dict_to_json(dictData, filename+'.json') - elif (fileFormat == 'pickle') or (fileFormat == 'pickledict'): - dict_to_pickle(dictData, filename+'.pkl') - elif fileFormat == 'mat': - raise ValueError('MAT format not yet supported') - scipy.io.savemat(filename+'.mat', dictData) + if fileFormat == "parquet": + dict_to_parquet(dictData, filename + ".parquet") + elif fileFormat == "json": + dict_to_json(dictData, filename + ".json") + elif (fileFormat == "pickle") or (fileFormat == "pickledict"): + dict_to_pickle(dictData, filename + ".pkl") + elif fileFormat == "mat": + raise ValueError("MAT format not yet supported") + scipy.io.savemat(filename + ".mat", dictData) else: - raise ValueError('Unknown file format') + raise ValueError("Unknown file format") + -def load_dict(filename, fileFormat='parquet'): - if fileFormat == 'parquet': +def load_dict(filename, fileFormat="parquet"): + if fileFormat == "parquet": return parquet_to_dict(filename) - elif (fileFormat == 'pickle') or (fileFormat == 'pickledict'): + elif (fileFormat == "pickle") or (fileFormat == "pickledict"): return pickle_to_dict(filename) - elif fileFormat == 'json': - raise ValueError('JSON format not yet supported') - elif fileFormat == 'mat': - raise ValueError('MAT format not yet supported') - print('TODO: compatibility with MATLAB generated files?!') + elif fileFormat == "json": + raise ValueError("JSON format not yet supported") + elif fileFormat == "mat": + raise ValueError("MAT format not yet supported") + print("TODO: compatibility with MATLAB generated files?!") return scipy.io.loadmat(filename) else: - raise ValueError('Unknown file format ({})'.format(fileFormat)) + raise ValueError("Unknown file format 
({})".format(fileFormat)) ####### Some additional functions for debugging purposes -def _find_lists(data, verbose = False): - ''' + +def _find_lists(data, verbose=False): + """ Look inside data (assumed to be a dict) and tell if some fields are actually lists. In theory, `datascout` package is meant to be used only on dicts that do NOT contain any list! - ''' + """ for key, value in data.items(): - if verbose: print(key) + if verbose: + print(key) if isinstance(value, list): - print(key+" is a list!") + print(key + " is a list!") elif isinstance(value, dict): _find_lists(value) else: - if verbose: print(" ..is "+str(type(value))) + if verbose: + print(" ..is " + str(type(value))) - -def _compare_data(data1, data2, use_assert = False): - ''' +def _compare_data(data1, data2, use_assert=False): + """ Compares two dictionaries or lists and show the differences (of type or data type). For a full comparison, it is sometimes best to call this function also with inverted - ''' + """ + def not_equal(a, b): - print(' ------ ') - print(str(a) + ' (' + str(type(a)) + ')') - print(' NOT EQUAL ') - print(str(b) + ' (' + str(type(b)) + ')') - print(' ------ ') + print(" ------ ") + print(str(a) + " (" + str(type(a)) + ")") + print(" NOT EQUAL ") + print(str(b) + " (" + str(type(b)) + ")") + print(" ------ ") if use_assert: - raise AssertionError('{} not equal to {}'.format(a, b)) - - if (type(data1) != type(data2)) or (hasattr(data1, '__len__') and (len(data1) != len(data2))): + raise AssertionError("{} not equal to {}".format(a, b)) + + if (type(data1) != type(data2)) or ( + hasattr(data1, "__len__") and (len(data1) != len(data2)) + ): not_equal(data1, data2) elif isinstance(data1, list): for i in range(len(data1)): @@ -429,5 +495,3 @@ def _compare_data(data1, data2, use_assert = False): _compare_data(data1.flatten()[i], data2.flatten()[i], use_assert) else: not_equal(data1, data2) - - diff --git a/datascout/tests/test_dataconversion.py b/datascout/tests/test_dataconversion.py index d9f62a7fcfc0a8cd2f87f2e5443bb219438a28b6..9d346964f868ae926431a17f8cda84a58080b677 100644 --- a/datascout/tests/test_dataconversion.py +++ b/datascout/tests/test_dataconversion.py @@ -14,56 +14,165 @@ import datetime import copy import os + def generate_data_dict(): - ''' + """ Simply generate a dictionary with some values that should be compatible with this package. - Note: only 'device1' contains random data... - ''' - return {'device1': {'value': { 'property1':np.int8(np.random.rand(1)[0]*10**2), - 'property2':np.int8(np.random.rand(1,43)*10**2), - 'property3':np.int8(np.random.rand(10,3)*10**2), - 'property4':np.int8(np.random.rand(50,1)*10**2)}, - 'header': {'acqStamp':np.int64(np.random.rand(1)[0]*10**12),'cycleStamp':np.int64(np.random.rand(1)[0]*10**12)}, - 'exception': ''}, - 'device2': {'value': np.array([[1, 12], [4, 5], [1, 2]], dtype=np.int16), - 'header': {'acqStamp':np.int64(44444),'cycleStamp':np.int64(3455445)}, - 'exception': ''}, - 'device3': {'value': '', - 'header': {'acqStamp':np.int64(44444),'cycleStamp':np.int64(0)}, - 'exception': 'Cipolla'}, - 'device4': {'value': { 'property5':'This is string', - 'property6':np.array(['my', 'list'], dtype=str), #np.str_ or object? -> np.str_! 
- 'property7':np.array([['my'], ['list'],['long']], dtype=str), - 'property8':np.array([['my', 'list'], ['of', 'more'], ['val', 'string']], dtype=str), - }, - 'header': {'acqStamp':np.int64(55555),'cycleStamp':np.int64(3455445)}, - 'exception': ''}, - 'device5': {'value': { 'property9':{'JAPC_FUNCTION': {'X': np.array([1, 2, 3, 4], dtype=np.float64), 'Y':np.array([14, 2, 7, 5], dtype=np.float64)}}, - 'property6':np.array([{'JAPC_FUNCTION': {'X': np.array([1, 2], dtype=np.float64), 'Y':np.array([14, 2], dtype=np.float64)}}, {'JAPC_FUNCTION':{'X': np.array([3, 4], dtype=np.float64), 'Y':np.array([7, 5], dtype=np.float64)}}], dtype=object), - }, - 'header': {'acqStamp':np.int64(4444444),'cycleStamp':np.int64(0)}, - 'exception': ''}, - 'device6': {'value': {'JAPC_FUNCTION': {'X': np.array([1, 2, 3, 4], dtype=np.float64), 'Y':np.array([14, 2, 7, 5], dtype=np.float64)}}, - 'header': {'acqStamp':np.int64(4455444),'cycleStamp':np.int64(0)}, - 'exception': ''}, - 'device7': {'value': { 'property10':{'JAPC_ENUM':{'code':np.int64(2), 'string':'piero'}}, - 'property11':np.array([{'JAPC_ENUM':{'code':np.int64(3), 'string':'carlo'}}, {'JAPC_ENUM':{'code':np.int64(4), 'string':'micio'}}], dtype=object), - 'property12':{'JAPC_ENUM_SET':{'codes':np.array([2, 8], dtype=np.int64), 'aslong':np.int64(123), 'strings':np.array(['nieva','po'], dtype=str)}}, #np.str_ - 'property13':np.array([{'JAPC_ENUM_SET':{'codes':np.array([7,44], dtype=np.int64), 'aslong':np.int64(123), 'strings':np.array(['nieva','po'], dtype=str)}}, - {'JAPC_ENUM_SET':{'codes':np.array([5,6], dtype=np.int64), 'aslong':np.int64(77), 'strings':np.array(['nettuno','plutone'], dtype=str)}} - ], dtype=object), - 'property14':np.array([{'JAPC_ENUM_SET':{'codes':np.array([], dtype=np.int64), 'aslong':np.int64(0), 'strings':np.array([], dtype=str)}}, - {'JAPC_ENUM_SET':{'codes':np.array([5,6], dtype=np.int64), 'aslong':np.int64(77), 'strings':np.array(['nettuno','plutone'], dtype=str)}} - ], dtype=object)}, - 'header': {'acqStamp':np.int64(44333444),'cycleStamp':np.int64(0)}, - 'exception': ''}, - 'device8': {'value': {'JAPC_ENUM_SET':{'codes':np.array([2, 8], dtype=np.int64), 'aslong':np.int64(123), 'strings':np.array(['nieva','po'], dtype=str)}}, - 'header': {'acqStamp':np.int64(4),'cycleStamp':np.int64(0)}, - 'exception': 'no data for xxxx'}, - 'device9': {'value': {'cipolla' : np.array([], dtype=str) }, - 'header': {'acqStamp':np.int64(4),'cycleStamp':np.int64(0)}, - 'exception': 'no data for xxxx'}} - + Note: only 'device1' contains random data... + """ + return { + "device1": { + "value": { + "property1": np.int8(np.random.rand(1)[0] * 10 ** 2), + "property2": np.int8(np.random.rand(1, 43) * 10 ** 2), + "property3": np.int8(np.random.rand(10, 3) * 10 ** 2), + "property4": np.int8(np.random.rand(50, 1) * 10 ** 2), + }, + "header": { + "acqStamp": np.int64(np.random.rand(1)[0] * 10 ** 12), + "cycleStamp": np.int64(np.random.rand(1)[0] * 10 ** 12), + }, + "exception": "", + }, + "device2": { + "value": np.array([[1, 12], [4, 5], [1, 2]], dtype=np.int16), + "header": {"acqStamp": np.int64(44444), "cycleStamp": np.int64(3455445)}, + "exception": "", + }, + "device3": { + "value": "", + "header": {"acqStamp": np.int64(44444), "cycleStamp": np.int64(0)}, + "exception": "Cipolla", + }, + "device4": { + "value": { + "property5": "This is string", + "property6": np.array( + ["my", "list"], dtype=str + ), # np.str_ or object? -> np.str_! 
+ "property7": np.array([["my"], ["list"], ["long"]], dtype=str), + "property8": np.array( + [["my", "list"], ["of", "more"], ["val", "string"]], dtype=str + ), + }, + "header": {"acqStamp": np.int64(55555), "cycleStamp": np.int64(3455445)}, + "exception": "", + }, + "device5": { + "value": { + "property9": { + "JAPC_FUNCTION": { + "X": np.array([1, 2, 3, 4], dtype=np.float64), + "Y": np.array([14, 2, 7, 5], dtype=np.float64), + } + }, + "property6": np.array( + [ + { + "JAPC_FUNCTION": { + "X": np.array([1, 2], dtype=np.float64), + "Y": np.array([14, 2], dtype=np.float64), + } + }, + { + "JAPC_FUNCTION": { + "X": np.array([3, 4], dtype=np.float64), + "Y": np.array([7, 5], dtype=np.float64), + } + }, + ], + dtype=object, + ), + }, + "header": {"acqStamp": np.int64(4444444), "cycleStamp": np.int64(0)}, + "exception": "", + }, + "device6": { + "value": { + "JAPC_FUNCTION": { + "X": np.array([1, 2, 3, 4], dtype=np.float64), + "Y": np.array([14, 2, 7, 5], dtype=np.float64), + } + }, + "header": {"acqStamp": np.int64(4455444), "cycleStamp": np.int64(0)}, + "exception": "", + }, + "device7": { + "value": { + "property10": {"JAPC_ENUM": {"code": np.int64(2), "string": "piero"}}, + "property11": np.array( + [ + {"JAPC_ENUM": {"code": np.int64(3), "string": "carlo"}}, + {"JAPC_ENUM": {"code": np.int64(4), "string": "micio"}}, + ], + dtype=object, + ), + "property12": { + "JAPC_ENUM_SET": { + "codes": np.array([2, 8], dtype=np.int64), + "aslong": np.int64(123), + "strings": np.array(["nieva", "po"], dtype=str), + } + }, # np.str_ + "property13": np.array( + [ + { + "JAPC_ENUM_SET": { + "codes": np.array([7, 44], dtype=np.int64), + "aslong": np.int64(123), + "strings": np.array(["nieva", "po"], dtype=str), + } + }, + { + "JAPC_ENUM_SET": { + "codes": np.array([5, 6], dtype=np.int64), + "aslong": np.int64(77), + "strings": np.array(["nettuno", "plutone"], dtype=str), + } + }, + ], + dtype=object, + ), + "property14": np.array( + [ + { + "JAPC_ENUM_SET": { + "codes": np.array([], dtype=np.int64), + "aslong": np.int64(0), + "strings": np.array([], dtype=str), + } + }, + { + "JAPC_ENUM_SET": { + "codes": np.array([5, 6], dtype=np.int64), + "aslong": np.int64(77), + "strings": np.array(["nettuno", "plutone"], dtype=str), + } + }, + ], + dtype=object, + ), + }, + "header": {"acqStamp": np.int64(44333444), "cycleStamp": np.int64(0)}, + "exception": "", + }, + "device8": { + "value": { + "JAPC_ENUM_SET": { + "codes": np.array([2, 8], dtype=np.int64), + "aslong": np.int64(123), + "strings": np.array(["nieva", "po"], dtype=str), + } + }, + "header": {"acqStamp": np.int64(4), "cycleStamp": np.int64(0)}, + "exception": "no data for xxxx", + }, + "device9": { + "value": {"cipolla": np.array([], dtype=str)}, + "header": {"acqStamp": np.int64(4), "cycleStamp": np.int64(0)}, + "exception": "no data for xxxx", + }, + } def generic_test_data_conversion(my_data_dict): @@ -72,26 +181,30 @@ def generic_test_data_conversion(my_data_dict): # go to panda and back without altering initial data my_pandas = datascout.dict_to_pandas(my_data_dict) - datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert=True) my_data_back = datascout.pandas_to_dict(my_pandas) - datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_back, my_data_dict_ref, use_assert=True) # go to pyarrow and back without altering initial data my_pyarrow = datascout.dict_to_pyarrow(my_data_dict) - 
datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert=True) my_data_back = datascout.pyarrow_to_dict(my_pyarrow) - datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_back, my_data_dict_ref, use_assert=True) # go to awkward and back without altering initial data my_ak = datascout.dict_to_awkward(my_data_dict) - datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert=True) my_data_back = datascout.awkward_to_dict(my_ak) - datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_back, my_data_dict_ref, use_assert=True) # a long chain - my_data_back = datascout.awkward_to_dict(datascout.pandas_to_awkward(datascout.pyarrow_to_pandas(datascout.dict_to_pyarrow(my_data_dict)))) - datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True) - datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True) + my_data_back = datascout.awkward_to_dict( + datascout.pandas_to_awkward( + datascout.pyarrow_to_pandas(datascout.dict_to_pyarrow(my_data_dict)) + ) + ) + datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert=True) + datascout._compare_data(my_data_back, my_data_dict_ref, use_assert=True) def generic_test_save_load(tmpdir, my_data_dict): @@ -99,20 +212,20 @@ def generic_test_save_load(tmpdir, my_data_dict): my_data_dict_ref = copy.deepcopy(my_data_dict) # define temporary filename - temp_filename_parquet = os.path.join(str(tmpdir), 'test.parquet') - temp_filename_pickle = os.path.join(str(tmpdir), 'test.pkl') + temp_filename_parquet = os.path.join(str(tmpdir), "test.parquet") + temp_filename_pickle = os.path.join(str(tmpdir), "test.pkl") # go to parquet datascout.dict_to_parquet(my_data_dict, temp_filename_parquet) - datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert=True) my_data_back = datascout.parquet_to_dict(temp_filename_parquet) - datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_back, my_data_dict_ref, use_assert=True) # go to pickle datascout.dict_to_pickle(my_data_dict, temp_filename_pickle) - datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert=True) my_data_back = datascout.pickle_to_dict(temp_filename_pickle) - datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True) + datascout._compare_data(my_data_back, my_data_dict_ref, use_assert=True) def test_data_conversion(): @@ -128,12 +241,14 @@ def test_data_array_conversion(): # test it generic_test_data_conversion(my_data_dict) + def test_save_load(tmpdir): # generate dataset my_data_dict = generate_data_dict() # test it generic_test_save_load(tmpdir, my_data_dict) + def test_save_load_array(tmpdir): # generate dataset my_data_dict = [generate_data_dict(), generate_data_dict(), generate_data_dict()] @@ -142,9 +257,9 @@ def test_save_load_array(tmpdir): # Function above can be locally tests as: -''' +""" from pathlib import Path tmpdir=Path('.') test_save_load(tmpdir) test_save_load_array(tmpdir) -''' \ No newline at end of file +""" diff --git a/examples/000_example.py b/examples/000_example.py index 
4fb793b6d4b87bb5464799c92538d1076e9bbcda..1fee7df1e741c03c7592e1f415532617d72e7f74 100644 --- a/examples/000_example.py +++ b/examples/000_example.py @@ -4,42 +4,62 @@ from datetime import timezone import datetime import numpy as np + def convert_2d_array(my_array): - if len(np.shape(my_array))==1: return my_array - list=[] + if len(np.shape(my_array)) == 1: + return my_array + list = [] for jj in range(np.shape(my_array)[0]): - list.append(my_array[jj,:]) - return list + list.append(my_array[jj, :]) + return list def old2new(my_dict, verbose=False): for device_name, device_value in my_dict.items(): - if verbose: print(device_name) - device_value['header']['acqStamp'] = \ - np.int64(device_value['header']['acqStamp'].replace(tzinfo=timezone.utc).timestamp()*1e9) - device_value['header']['cycleStamp'] = \ - np.int64(device_value['header']['cycleStamp'].replace(tzinfo=timezone.utc).timestamp()*1e6)*1e3 - device_value['header']['setStamp'] = \ - np.int64(device_value['header']['setStamp'].replace(tzinfo=timezone.utc).timestamp()*1e9) - if (not type(device_value['value'])==dict) and (not type(device_value['value'])==list): - if math.isnan(device_value['value']): - device_value['value']= 'no data' - if type(device_value['value'])==np.ndarray: - device_value['value']= convert_2d_array(device_value['value']) - if type(device_value['value'])==dict: - for value_name, value_value in device_value['value'].items(): - if verbose: print(device_value['value'][value_name]) - if type(device_value['value'][value_name])==np.ndarray: - device_value['value'][value_name]=convert_2d_array(value_value) + if verbose: + print(device_name) + device_value["header"]["acqStamp"] = np.int64( + device_value["header"]["acqStamp"].replace(tzinfo=timezone.utc).timestamp() + * 1e9 + ) + device_value["header"]["cycleStamp"] = ( + np.int64( + device_value["header"]["cycleStamp"] + .replace(tzinfo=timezone.utc) + .timestamp() + * 1e6 + ) + * 1e3 + ) + device_value["header"]["setStamp"] = np.int64( + device_value["header"]["setStamp"].replace(tzinfo=timezone.utc).timestamp() + * 1e9 + ) + if (not type(device_value["value"]) == dict) and ( + not type(device_value["value"]) == list + ): + if math.isnan(device_value["value"]): + device_value["value"] = "no data" + if type(device_value["value"]) == np.ndarray: + device_value["value"] = convert_2d_array(device_value["value"]) + if type(device_value["value"]) == dict: + for value_name, value_value in device_value["value"].items(): + if verbose: + print(device_value["value"][value_name]) + if type(device_value["value"][value_name]) == np.ndarray: + device_value["value"][value_name] = convert_2d_array(value_value) return my_dict -#my_parquet = '/eos/project/l/liu/datascout/parquet_file/2021.04.30.01.54.47.454151.parquet' -my_dict_file = '/eos/project/l/liu/datascout/pickle_files/2021.04.30.01.54.47.454151.pkl' -#ds.parquet_to_dict(my_parquet) +# my_parquet = '/eos/project/l/liu/datascout/parquet_file/2021.04.30.01.54.47.454151.parquet' +my_dict_file = ( + "/eos/project/l/liu/datascout/pickle_files/2021.04.30.01.54.47.454151.pkl" +) + +# ds.parquet_to_dict(my_parquet) my_dict = ds.pickle_to_dict(my_dict_file) -a=old2new(my_dict[0]) +a = old2new(my_dict[0]) # ONLINE ds.dict_to_awkward(a) @@ -47,21 +67,16 @@ ds.dict_to_pyarrow(a) ds.dict_to_pandas(a) # OFFLINE -ds.dict_to_parquet(a, 'test') -ds.dict_to_json(a, 'test') -ds.dict_to_pickle(a, 'test') - -# -print('parquet_to_pyarrow') -ds.parquet_to_pyarrow('test.parquet') - -print('parquet_to_pandas') -ds.parquet_to_pandas('test.parquet') - 
-print('parquet_to_awkward') -ds.parquet_to_awkward('test.parquet') - - +ds.dict_to_parquet(a, "test") +ds.dict_to_json(a, "test") +ds.dict_to_pickle(a, "test") +# +print("parquet_to_pyarrow") +ds.parquet_to_pyarrow("test.parquet") +print("parquet_to_pandas") +ds.parquet_to_pandas("test.parquet") +print("parquet_to_awkward") +ds.parquet_to_awkward("test.parquet") diff --git a/setup.py b/setup.py index a3e9b857c0025affb9fc8d0b82d6f3c649fd034e..c847b9be56129fcad30078d0af8f39c13b79c6ee 100644 --- a/setup.py +++ b/setup.py @@ -10,59 +10,51 @@ from setuptools import setup, find_packages HERE = Path(__file__).parent.absolute() -with (HERE / 'README.md').open('rt') as fh: +with (HERE / "README.md").open("rt") as fh: LONG_DESCRIPTION = fh.read().strip() REQUIREMENTS: dict = { - 'core': [ - 'numpy', - 'pandas', - 'pyarrow', - 'awkward', - 'datetime', - 'pathlib' + "core": ["numpy", "pandas", "pyarrow", "awkward", "datetime", "pathlib"], + "test": [ + "pytest", ], - 'test': [ - 'pytest', - ], - 'dev': [ + "dev": [ # 'requirement-for-development-purposes-only', ], - 'doc': [ - 'sphinx', - 'acc-py-sphinx', + "doc": [ + "sphinx", + "acc-py-sphinx", ], } setup( - name='datascout', + name="datascout", version="0.0.1.dev0", - - author='Davide Gamba', - author_email='davide.gamba@cern.ch', - description='SHORT DESCRIPTION OF PROJECT', + author="Davide Gamba", + author_email="davide.gamba@cern.ch", + description="SHORT DESCRIPTION OF PROJECT", long_description=LONG_DESCRIPTION, - long_description_content_type='text/markdown', - url='', - + long_description_content_type="text/markdown", + url="", packages=find_packages(), - python_requires='~=3.7', + python_requires="~=3.7", classifiers=[ "Programming Language :: Python :: 3", "Operating System :: OS Independent", ], - - install_requires=REQUIREMENTS['core'], + install_requires=REQUIREMENTS["core"], extras_require={ **REQUIREMENTS, # The 'dev' extra is the union of 'test' and 'doc', with an option # to have explicit development dependencies listed. - 'dev': [req - for extra in ['dev', 'test', 'doc'] - for req in REQUIREMENTS.get(extra, [])], + "dev": [ + req + for extra in ["dev", "test", "doc"] + for req in REQUIREMENTS.get(extra, []) + ], # The 'all' extra is the union of all requirements. - 'all': [req for reqs in REQUIREMENTS.values() for req in reqs], + "all": [req for reqs in REQUIREMENTS.values() for req in reqs], }, )
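
Note: since this patch is almost entirely a mechanical reformatting of the conversion helpers (dict_to_pyarrow, pyarrow_to_dict, dict_to_parquet, parquet_to_dict, _compare_data), a short round-trip sketch may help reviewers see what the touched code paths do. This is a minimal illustration, not part of the patch: the sample dict and the "example" file name are invented for the example, and only functions that appear in the diff are used.

    # Minimal round-trip sketch (illustrative only; sample data and paths are not from the patch).
    import numpy as np
    import datascout as ds

    # A pyjapcscout-like dict: nested dicts, numpy scalars/arrays, no plain lists.
    data = {
        "device1": {
            "value": {"property3": np.int8(np.random.rand(10, 3) * 100)},
            "header": {"acqStamp": np.int64(123456789), "cycleStamp": np.int64(0)},
            "exception": "",
        }
    }

    # In-memory conversions exercised by the tests above:
    # 2D arrays are split into 1D object arrays on the way in, and merged back on the way out.
    table = ds.dict_to_pyarrow(data)
    back = ds.pyarrow_to_dict(table)
    ds._compare_data(data, back, use_assert=True)

    # On-disk round trip via parquet (the ".parquet" extension is appended if missing).
    ds.dict_to_parquet(data, "example")
    restored = ds.parquet_to_dict("example.parquet")
    ds._compare_data(data, restored, use_assert=True)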