diff --git a/datascout/__init__.py b/datascout/__init__.py
index 378acfa6831aecaad9f7ea8ffe07ae8bf12a3dc5..5847394c5acf31207d62506fa28084a2a12ddebf 100644
--- a/datascout/__init__.py
+++ b/datascout/__init__.py
@@ -1,6 +1,34 @@
 """
-Documentation for the datascout package
+List of sweet functions for data conversion and writing to disk
 """
 
 __version__ = "0.0.1.dev0"
 
+
+# for the user
+from ._datascout import dict_to_pandas
+from ._datascout import dict_to_awkward
+from ._datascout import dict_to_parquet
+from ._datascout import dict_to_pickle
+from ._datascout import dict_to_json
+
+# coming back
+from ._datascout import pandas_to_dict
+from ._datascout import awkward_to_dict
+from ._datascout import pickle_to_dict
+from ._datascout import parquet_to_dict
+
+# between pandas and awkward
+from ._datascout import pandas_to_awkward
+from ._datascout import awkward_to_pandas
+
+# reading from parquet to pandas/awkward without type loss
+from ._datascout import parquet_to_pandas
+from ._datascout import parquet_to_awkward
+
+# to look at pyarrow, typically not used by a user
+from ._datascout import dict_to_pyarrow
+from ._datascout import pyarrow_to_parquet
+from ._datascout import parquet_to_pyarrow
+from ._datascout import pyarrow_to_dict
diff --git a/datascout/_datascout.py b/datascout/_datascout.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bf7a8d2d9a9452bfa743f43685e09633207a46f
--- /dev/null
+++ b/datascout/_datascout.py
@@ -0,0 +1,311 @@
+"""
+Implementation of sweet functions to convert data from one type to another
+"""
+import numpy as np
+import pandas as pd
+import awkward as ak
+import pyarrow.parquet as pq
+import pyarrow as pa
+import pickle
+import datetime
+import copy
+
+######
+# Functions needed to split 2D arrays
+
+def split_2D_array(val, in_memory=False, split_to_list=False, verbose=False):
+    '''
+    split_2D_array(val, in_memory=False, split_to_list=False, verbose=False)
+
+    Converts a numpy 2D array into either:
+    - a 1D "object" array containing 1D val.dtype arrays (split_to_list=False, the default)
+    - a list of 1D val.dtype arrays (split_to_list=True)
+
+    It returns the split value, or the original value if the input was not a 2D array.
+
+    If in_memory=True (default=False), data is not copied but just represented in a different form.
+    '''
+    if isinstance(val, np.ndarray) and len(np.shape(val)) == 2:
+        if not in_memory:
+            val = copy.deepcopy(val)
+            if verbose: print('made a copy of '+str(val))
+        if split_to_list:
+            newVal = list(val)
+        else:
+            # TODO: probably to be done better!
+            auxDim = np.shape(val)[0]
+            # split val, without making a data copy
+            auxData = np.split(np.ravel(val), auxDim)
+            # put it in an object array
+            newVal = np.empty((auxDim,), dtype=object)
+            for i in range(auxDim):
+                newVal[i] = auxData[i]
+        if verbose:
+            print(' ----- ')
+            print(str(val)+' ('+str(type(val))+')')
+            print(' -> converted to -> ')
+            print(str(newVal)+' ('+str(type(newVal))+')')
+        return newVal
+    else:
+        return val
+
+def convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False):
+    '''
+    Parses the input data, which should be a list or a dict, and converts all 2D arrays into either
+    - a 1D object array of 1D arrays (split_to_list=False, the default)
+    - a list of 1D arrays (split_to_list=True)
+
+    If in_memory=True (default=False), it changes the data in memory.
+    In any case, the modified data is returned.
+
+    NOTE: The conversion is done so as to keep data copies to a minimum: the actual data
+    is not copied (or copies are reduced to the minimum).
+    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
+    '''
+    if not in_memory:
+        data = copy.copy(data)
+    if isinstance(data, list):
+        for i, entry in enumerate(data):
+            if isinstance(entry, (list, dict)):
+                data[i] = convert_dict_list(entry, in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
+            elif isinstance(entry, np.ndarray):
+                data[i] = split_2D_array(entry, in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
+    elif isinstance(data, dict):
+        for key in data.keys():
+            if isinstance(data[key], (list, dict)):
+                data[key] = convert_dict_list(data[key], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
+            elif isinstance(data[key], np.ndarray):
+                data[key] = split_2D_array(data[key], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
+    return data
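+
+# A minimal doctest-style illustration of the splitting above (hypothetical
+# data, kept as a comment so nothing runs at import time):
+#
+#   >>> d = {'a': np.array([[1., 2., 3.], [4., 5., 6.]])}
+#   >>> convert_dict_list(d)['a']
+#   array([array([1., 2., 3.]), array([4., 5., 6.])], dtype=object)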
+
+
+######
+# Functions needed to re-merge 1D arrays of 1D arrays into 2D arrays
+
+def merge_to_2D(val, string_as_obj=False, verbose=False):
+    '''
+    merge_to_2D(val, string_as_obj=False, verbose=False)
+
+    Converts numpy arrays of "object" dtype (or lists) back into 2D arrays.
+    By construction, if conversion actually occurs, this operation makes a copy
+    of the data (probably with some exceptions).
+
+    If string_as_obj=True (default=False), the returned object is a 2D array of
+    "object" dtype in the case of string arrays. This is necessary if you want to
+    edit one string of the array without it being truncated.
+    '''
+    if (isinstance(val, np.ndarray) and val.dtype == object) or isinstance(val, list):
+        newVal = np.stack(val)
+        # fix a subtle issue with strings, which are assumed to be arrays of objects
+        if string_as_obj and (newVal.dtype.type is np.str_):
+            newVal = newVal.astype(object)
+        if verbose:
+            print(' ----- ')
+            print(str(val)+' ('+str(type(val))+')')
+            print(' -> reverted to -> ')
+            print(str(newVal)+' ('+str(newVal.dtype)+')')
+        return newVal
+    else:
+        return val
+
+def revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False):
+    '''
+    Parses the input data, which should be a list or a dict, and converts all 1D arrays
+    of "object" dtype into 2D arrays of the proper data type.
+
+    If string_as_obj=True (default=False), the obtained 2D arrays of strings are converted
+    into 2D arrays of "object" dtype.
+
+    If in_memory=True (default=False), it changes the data in memory.
+    In any case, the modified data is returned.
+
+    NOTE: The conversion is done so as to keep data copies to a minimum: the actual data
+    is not copied (or copies are reduced to the minimum).
+    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
+    '''
+    if not in_memory:
+        data = copy.copy(data)
+    if isinstance(data, list):
+        for i, entry in enumerate(data):
+            if isinstance(entry, dict):
+                data[i] = revert_dict_list(entry, in_memory=in_memory, string_as_obj=string_as_obj, verbose=verbose)
+            elif isinstance(entry, (list, np.ndarray)):
+                data[i] = merge_to_2D(entry, string_as_obj=string_as_obj, verbose=verbose)
+                if len(data[i]) > 0 and isinstance(data[i].flatten()[0], dict):
+                    for nested_data in data[i].flatten():
+                        revert_dict_list(nested_data, in_memory=True, string_as_obj=string_as_obj, verbose=verbose)
+    elif isinstance(data, dict):
+        for key in data.keys():
+            if isinstance(data[key], dict):
+                data[key] = revert_dict_list(data[key], in_memory=in_memory, string_as_obj=string_as_obj, verbose=verbose)
+            elif isinstance(data[key], (list, np.ndarray)):
+                data[key] = merge_to_2D(data[key], string_as_obj=string_as_obj, verbose=verbose)
+                if len(data[key]) > 0 and isinstance(data[key].flatten()[0], dict):
+                    for nested_data in data[key].flatten():
+                        revert_dict_list(nested_data, in_memory=True, string_as_obj=string_as_obj, verbose=verbose)
+    return data
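+
+# Round-trip sketch (hypothetical data): merging reverts the splitting done by
+# convert_dict_list, reconstructing the original 2D array:
+#
+#   >>> d = convert_dict_list({'a': np.array([[1., 2.], [3., 4.]])})
+#   >>> revert_dict_list(d)['a']
+#   array([[1., 2.],
+#          [3., 4.]])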
+
+######
+# CORE function of this project: it allows one to convert a pyarrow object into a dict
+#
+def convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_array=False):
+    '''
+    convert_parrow_data(data)
+
+    Extracts data from a pyarrow object into a "standard" pyjapcscout-like dict dataset,
+    i.e. a dictionary with only non-null numpy objects/arrays and no lists
+    (unless use_list_for_2D_array is enabled).
+
+    If treat_str_arrays_as_str=True (the default), it will try to preserve the str data type
+    also for arrays.
+
+    If use_list_for_2D_array=True (default=False), it will use lists of 1D arrays instead
+    of 2D arrays.
+    '''
+    if isinstance(data, pa.lib.Table):
+        output = dict()
+        for column in data.column_names:
+            # TODO: probably something like `if len(data['device1'])` should be checked here
+            device_dict = dict()
+            # those should be value, header, exception
+            for item in data[column][0].items():
+                device_dict[item[0]] = convert_parrow_data(item[1])
+            output[column] = device_dict
+        return output
+    if isinstance(data, pa.StructScalar):
+        output_dict = dict()
+        for item in data.items():
+            output_dict[item[0]] = convert_parrow_data(item[1])
+        return output_dict
+    elif isinstance(data, pa.ListScalar):
+        if isinstance(data.type.value_type, pa.lib.ListType):
+            aux_dtype = data.type.value_type.value_type.to_pandas_dtype()
+            if treat_str_arrays_as_str and data.type.value_type.value_type.equals(pa.string()):
+                # actually a string, not a generic object
+                aux_dtype = np.str_
+            return np.array(data.as_py(), dtype=aux_dtype)
+        elif isinstance(data.type.value_type, pa.lib.DataType):
+            if isinstance(data.type.value_type, pa.lib.StructType):
+                if use_list_for_2D_array:
+                    auxOutput = []
+                    for auxValue in data.values:
+                        auxOutput.append(convert_parrow_data(auxValue))
+                    return auxOutput
+                else:
+                    auxOutput = np.empty((len(data.values),), dtype=object)
+                    for i, auxValue in enumerate(data.values):
+                        auxOutput[i] = convert_parrow_data(auxValue)
+                    return auxOutput
+            else:
+                # could be a 1D array of some data type
+                aux_dtype = data.type.value_type.to_pandas_dtype()
+                if treat_str_arrays_as_str and data.type.value_type.equals(pa.string()):
+                    # actually a string, not a generic object
+                    aux_dtype = np.str_
+                return np.array(data.as_py(), dtype=aux_dtype)
+        else:
+            print('Zzzuth... unexpected list value type: '+str(data.type.value_type))
+            return data
+    elif issubclass(type(data), pa.lib.Scalar):
+        # ugly cast... did not find a better way
+        return data.type.to_pandas_dtype()(data.as_py())
+    else:
+        print('Sigh... unknown data type: '+str(type(data)))
+        return data
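+
+# Small sanity check (hypothetical values, doctest-style): a pyarrow list scalar
+# comes back as a typed numpy array, and a plain scalar as a numpy scalar:
+#
+#   >>> convert_parrow_data(pa.scalar([1, 2, 3]))
+#   array([1, 2, 3])
+#   >>> convert_parrow_data(pa.scalar(1.5))
+#   1.5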
+
+
+###### Some important functions, not so interesting for the standard user, but fundamental
+
+def dict_to_pyarrow(input_dict):
+    my_data_dict_converted = convert_dict_list(input_dict, in_memory=False, split_to_list=False, verbose=False)
+    return pa.Table.from_pandas(pd.DataFrame([my_data_dict_converted]))
+
+def pyarrow_to_parquet(input_pa, filename):
+    pq.write_table(input_pa, filename)
+
+def parquet_to_pyarrow(filename):
+    return pq.read_table(filename)
+
+def pyarrow_to_dict(input_pa):
+    return convert_parrow_data(input_pa)
+
+
+####### The functions interesting for the user
+
+def dict_to_pandas(input_dict):
+    if not isinstance(input_dict, list):
+        input_dict = [input_dict]
+    return pd.DataFrame(input_dict)
+
+def dict_to_awkward(input_dict):
+    return ak.from_arrow(dict_to_pyarrow(input_dict))
+
+def dict_to_parquet(input_dict, filename):
+    # we could also just go to pandas and then to parquet:
+    #   dict_to_pandas(input_dict).to_parquet(filename)
+    pyarrow_to_parquet(dict_to_pyarrow(input_dict), filename+'.parquet')
+
+def dict_to_pickle(input_dict, filename):
+    with open(filename+'.pkl', 'wb') as handle:
+        pickle.dump(input_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
+
+def dict_to_json(input_dict, filename):
+    '''
+    Function provided for convenience, but not of interest for the typical use case...
+    '''
+    dict_to_pandas(input_dict).to_json(filename+'.json')
+
+def json_to_pandas(filename):
+    '''
+    Function provided for convenience, but not of interest for the typical use case...
+    '''
+    return pd.read_json(filename)
+
+def pandas_to_dict(input_pandas, row_index=0):
+    '''
+    Converts the specified row of a pandas DataFrame into a pyjapcscout-like dict
+    '''
+    return input_pandas.iloc[row_index].to_dict()
+
+def awkward_to_dict(input_awkward, row_index=0):
+    '''
+    Converts the specified row of an awkward array into a pyjapcscout-like dict
+    '''
+    return convert_parrow_data(ak.to_arrow(input_awkward)[row_index])
+
+def pickle_to_dict(filename):
+    with open(filename, 'rb') as handle:
+        load_dict = pickle.load(handle)
+    return load_dict
+
+def parquet_to_dict(filename):
+    return pyarrow_to_dict(parquet_to_pyarrow(filename))
+
+# between pandas and awkward
+def pandas_to_awkward(input_pandas):
+    # TODO: one first needs to split the 2D arrays, e.g.
+    #   return dict_to_awkward(pandas_to_dict(input_pandas))
+    raise NotImplementedError('pandas_to_awkward is not implemented yet')
+
+def awkward_to_pandas(input_awkward):
+    raise NotImplementedError('awkward_to_pandas is not implemented yet')
+
+# reading from parquet to pandas without type loss
+def parquet_to_pandas(filename):
+    '''
+    Reads a **single** parquet file into a pandas DataFrame with no data type loss
+    '''
+    return dict_to_pandas(parquet_to_dict(filename))
+
+def parquet_to_awkward(filename):
+    return ak.from_parquet(filename)
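+
+# Typical round trip (sketch, with a hypothetical file name 'mydata'):
+#
+#   >>> data = {'device1': {'value': np.array([[1., 2.], [3., 4.]])}}
+#   >>> dict_to_parquet(data, 'mydata')            # writes 'mydata.parquet'
+#   >>> back = parquet_to_dict('mydata.parquet')   # pyjapcscout-like dict again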