Commit 1fb270e2 authored by Davide Gamba

starting putting code together

parent e39b66ab
""" """
Documentation for the datascout package list of sweet functions for data conversion and writing to disk
""" """
__version__ = "0.0.1.dev0" __version__ = "0.0.1.dev0"
# for the user
from ._datascout import dict_to_pandas
from ._datascout import dict_to_awkward
from ._datascout import dict_to_parquet
from ._datascout import dict_to_pickle
from ._datascout import dict_to_json
# coming back
from ._datascout import pandas_to_dict
from ._datascout import awkward_to_dict
from ._datascout import pickle_to_dict
from ._datascout import parquet_to_dict
# between pandas and awkward
from ._datascout import pandas_to_awkward
from ._datascout import awkward_to_pandas
# reading from parquet to pandas/awkward without type loss
from ._datascout import parquet_to_pandas
from ._datascout import parquet_to_awkward
# to look at pyarrow, typically not used by a user
from ._datascout import dict_to_pyarrow
from ._datascout import pyarrow_to_parquet
from ._datascout import parquet_to_pyarrow
from ._datascout import pyarrow_to_dict
"""
Implementation of sweet functions to convert data from one type to another
"""
import numpy as np
import pandas as pd
import awkward as ak
import pyarrow as pa
import pyarrow.parquet as pq
import pickle
import datetime
import copy
######
# Functions needed to split 2D arrays
def split_2D_array(val, in_memory=False, split_to_list=False, verbose=False):
    '''
    split_2D_array(val, in_memory=False, split_to_list=False, verbose=False)

    It converts a numpy 2D array into either:
     - a 1D "object" array containing 1D val.dtype arrays (split_to_list=False, the default)
     - a list of 1D val.dtype arrays (split_to_list=True)
    It returns the split value, or the original value if the input was not a 2D numpy array.
    If in_memory == True (default=False), data is not copied but just represented in a different form.
    '''
    if (type(val) == np.ndarray) and len(np.shape(val)) == 2:
        if not in_memory:
            val = copy.deepcopy(val)
            if verbose: print('made a copy of '+str(val))
        if split_to_list:
            newVal = list(val)
        else:
            # (TODO: probably to be done better!!!)
            auxDim = np.shape(val)[0]
            # split val, without making a data copy
            auxData = np.split(np.ravel(val), auxDim)
            # put it in an object array
            newVal = np.empty((auxDim,), dtype=object)
            for i in range(auxDim):
                newVal[i] = auxData[i]
        if verbose:
            print(' ----- ')
            print(str(val)+' ('+str(type(val))+')')
            print(' -> converted to -> ')
            print(str(newVal)+' ('+str(type(newVal))+')')
        return newVal
    else:
        return val
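
# A minimal usage sketch (added for illustration; not part of the original file),
# assuming the default split_to_list=False behavior described above:
#
# >>> arr = np.array([[1, 2, 3], [4, 5, 6]])
# >>> out = split_2D_array(arr)
# >>> out.shape, out.dtype
# ((2,), dtype('O'))
# >>> out[0]
# array([1, 2, 3])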
def convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False):
    '''
    Parse the input data, which should be a list or a dict, and convert all 2D arrays into either
     - a 1D object array of 1D arrays
     - a 1D list of 1D arrays
    If in_memory=True (default False), it changes the data in memory.
    In any case, the modified data is returned.

    NOTE: The conversion tries to keep data copies to a minimum: the actual data
    is not copied (or copies are reduced to the minimum).
    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
    '''
    if not in_memory:
        data = copy.copy(data)
    if type(data) == list:
        for i, entry in enumerate(data):
            # write back by index: re-assigning the loop variable would not modify the list
            if type(entry) == list or type(entry) == dict:
                data[i] = convert_dict_list(entry, in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
            elif type(entry) == np.ndarray:
                data[i] = split_2D_array(entry, in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
    elif type(data) == dict:
        for key in data.keys():
            if type(data[key]) == list or type(data[key]) == dict:
                data[key] = convert_dict_list(data[key], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
            elif type(data[key]) == np.ndarray:
                data[key] = split_2D_array(data[key], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
    return data
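
# Illustrative sketch (not from the original source): converting a nested dict.
# Only 2D numpy arrays are touched; scalars and 1D arrays pass through unchanged.
#
# >>> d = {'device1': {'value': np.zeros((2, 3)), 'ts': 123}}
# >>> conv = convert_dict_list(d)
# >>> conv['device1']['value'].dtype    # now a 1D object array of 1D arrays
# dtype('O')
# >>> conv['device1']['ts']             # untouched
# 123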
######
# Functions needed to re-merge 1D arrays of 1D arrays into 2D arrays
def merge_to_2D(val, string_as_obj=False, verbose=False):
    '''
    merge_to_2D(val, string_as_obj=False, verbose=False)

    It converts numpy arrays of "object" dtype (or lists) back into 2D arrays.
    By construction, if the conversion actually occurs, this operation makes a copy of
    the data (probably with some exceptions).
    string_as_obj=False (default):
        This option (if enabled) makes sure that the returned object is a 2D array of "object"
        data type in case of string arrays. This is necessary in case you want to edit one string
        of the array without it being cut...
    '''
    if ((type(val) == np.ndarray) and val.dtype == object) or (type(val) == list):
        newVal = np.stack(val)
        # fix a subtle issue with strings, which I am assuming are arrays of objects
        if string_as_obj and (newVal.dtype.type is np.str_):
            newVal = newVal.astype(object)
        if verbose:
            print(' ----- ')
            print(str(val)+' ('+str(type(val))+')')
            print(' -> reverted to -> ')
            print(str(newVal)+' ('+str(newVal.dtype)+')')
        return newVal
    else:
        return val
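
# Illustrative round-trip sketch (not part of the original file):
#
# >>> obj = split_2D_array(np.arange(6).reshape(2, 3))
# >>> merge_to_2D(obj)
# array([[0, 1, 2],
#        [3, 4, 5]])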
def revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False):
    '''
    Parse the input data, which should be a list or a dict, and convert all 1D arrays of "object" type
    into 2D arrays of the proper data type.
    If string_as_obj=True (default=False), the obtained 2D arrays of strings are converted into 2D arrays
    of "object" dtype.
    If in_memory=True (default False), it changes the data in memory.
    In any case, the modified data is returned.

    NOTE: The conversion tries to keep data copies to a minimum: the actual data
    is not copied (or copies are reduced to the minimum).
    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
    '''
    if not in_memory:
        data = copy.copy(data)
    if type(data) == list:
        for i, entry in enumerate(data):
            # write back by index: re-assigning the loop variable would not modify the list
            if type(entry) == dict:
                data[i] = revert_dict_list(entry, in_memory=in_memory, string_as_obj=string_as_obj, verbose=verbose)
            elif type(entry) == list or type(entry) == np.ndarray:
                data[i] = merge_to_2D(entry, string_as_obj=string_as_obj, verbose=verbose)
                if len(data[i]) > 0 and isinstance(data[i].flatten()[0], dict):
                    # the dict objects inside the array are shared, so revert them in place
                    for nested_data in data[i].flatten():
                        revert_dict_list(nested_data, in_memory=True, string_as_obj=string_as_obj, verbose=verbose)
    elif type(data) == dict:
        for key in data.keys():
            if type(data[key]) == dict:
                data[key] = revert_dict_list(data[key], in_memory=in_memory, string_as_obj=string_as_obj, verbose=verbose)
            elif type(data[key]) == list or type(data[key]) == np.ndarray:
                data[key] = merge_to_2D(data[key], string_as_obj=string_as_obj, verbose=verbose)
                if len(data[key]) > 0 and isinstance(data[key].flatten()[0], dict):
                    # the dict objects inside the array are shared, so revert them in place
                    for nested_data in data[key].flatten():
                        revert_dict_list(nested_data, in_memory=True, string_as_obj=string_as_obj, verbose=verbose)
    return data
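
# Illustrative sketch (not in the original file): revert_dict_list undoes convert_dict_list.
#
# >>> d = {'val': np.arange(6).reshape(2, 3)}
# >>> back = revert_dict_list(convert_dict_list(d))
# >>> np.array_equal(back['val'], d['val'])
# True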
######
# CORE function of this project: it allows to convert a pyarrow object into a dict
#
def convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_array=False):
    '''
    convert_parrow_data(data)

    It extracts data from a pyarrow object into a "standard" pyjapcscout-like dict dataset,
    i.e. a dictionary with only non-null numpy objects/arrays and no lists (unless you enable use_list_for_2D_array).
    If treat_str_arrays_as_str (default=True), it will try to preserve the str data type also for arrays.
    If use_list_for_2D_array (default=False), it will try to use lists of 1D arrays instead of 2D arrays.
    '''
    if isinstance(data, pa.lib.Table):
        output = dict()
        for column in data.column_names:
            # if len(data['device1']) -> probably something like this should be done...
            device_dict = dict()
            # those should be value, header, exception
            for item in data[column][0].items():
                # this can be iterated... I think
                device_dict[item[0]] = convert_parrow_data(item[1])
            output[column] = device_dict
        return output
    if isinstance(data, pa.StructScalar):
        output_dict = dict()
        for item in data.items():
            output_dict[item[0]] = convert_parrow_data(item[1])
        return output_dict
    elif isinstance(data, pa.ListScalar):
        if isinstance(data.type.value_type, pa.lib.ListType):
            aux_dtype = data.type.value_type.value_type.to_pandas_dtype()
            if treat_str_arrays_as_str and data.type.value_type.value_type.equals(pa.string()):
                # actually a string! not a generic object....
                aux_dtype = np.str_
            return np.array(data.as_py(), dtype=aux_dtype)
        elif isinstance(data.type.value_type, pa.lib.DataType):
            if isinstance(data.type.value_type, pa.lib.StructType):
                if use_list_for_2D_array:
                    auxOutput = []
                    for auxValue in data.values:
                        auxOutput.append(convert_parrow_data(auxValue))
                    return auxOutput
                else:
                    auxOutput = np.empty((len(data.values),), dtype=object)
                    for i, auxValue in enumerate(data.values):
                        auxOutput[i] = convert_parrow_data(auxValue)
                    return auxOutput
            else:
                # could be a 1D array of some data type
                aux_dtype = data.type.value_type.to_pandas_dtype()
                if treat_str_arrays_as_str and data.type.value_type.equals(pa.string()):
                    # actually a string! not a generic object....
                    aux_dtype = np.str_
                return np.array(data.as_py(), dtype=aux_dtype)
        else:
            print('Zzzuth...')
            return data
    elif issubclass(type(data), pa.lib.Scalar):
        # horrible casting!... did not find a better way....
        return data.type.to_pandas_dtype()(data.as_py())
    else:
        print('Sigh... unknown data type: '+str(type(data)))
        return data
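
# Illustrative sketch (not in the original file), relying on dict_to_pyarrow defined below.
# The exact pyarrow types produced depend on the pandas/pyarrow versions in use:
#
# >>> d = {'device1': {'value': np.array([1.0, 2.0]), 'header': {'ts': 123}}}
# >>> table = dict_to_pyarrow(d)
# >>> convert_parrow_data(table)['device1']['value']
# array([1., 2.])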
###### Some important functions, not so interesting for the standard user, but fundamental
def dict_to_pyarrow(input_dict):
    my_data_dict_converted = convert_dict_list(input_dict, in_memory=False, split_to_list=False, verbose=False)
    return pa.Table.from_pandas(pd.DataFrame([my_data_dict_converted]))

def pyarrow_to_parquet(input_pa, filename):
    pq.write_table(input_pa, filename)

def parquet_to_pyarrow(filename):
    return pq.read_table(filename)

def pyarrow_to_dict(input_pa):
    return convert_parrow_data(input_pa)
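
# Illustrative chain of the helpers above (the file name is just an example; note that
# convert_parrow_data expects the pyjapcscout-like device -> fields layout):
#
# >>> tab = dict_to_pyarrow({'device1': {'value': np.array([1, 2, 3])}})
# >>> pyarrow_to_parquet(tab, 'example.parquet')
# >>> pyarrow_to_dict(parquet_to_pyarrow('example.parquet'))['device1']['value']
# array([1, 2, 3])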
####### The functions interesting for the user
def dict_to_pandas(input_dict):
    if not isinstance(input_dict, list):
        input_dict = [input_dict]
    return pd.DataFrame(input_dict)
def dict_to_awkward(input_dict):
    return ak.from_arrow(dict_to_pyarrow(input_dict))

def dict_to_parquet(input_dict, filename):
    # we could also just go to pandas, and then to parquet:
    # dict_to_pandas(input_dict).to_parquet(filename)
    pyarrow_to_parquet(dict_to_pyarrow(input_dict), filename+'.parquet')

def dict_to_pickle(input_dict, filename):
    with open(filename+'.pkl', 'wb') as handle:
        pickle.dump(input_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

def dict_to_json(input_dict, filename):
    '''
    Function provided for convenience, but not of interest for the typical use case...
    '''
    dict_to_pandas(input_dict).to_json(filename+'.json')
def json_to_pandas(filename):
    '''
    Function provided for convenience, but not of interest for the typical use case...
    '''
    return pd.read_json(filename)
def pandas_to_dict(input_pandas, row_index=0):
    '''
    It converts the specified row of a pandas dataframe into a pyjapcscout-like dict.
    '''
    return input_pandas.iloc[row_index].to_dict()

def awkward_to_dict(input_awkward, row_index=0):
    '''
    It converts the specified row of an awkward array into a pyjapcscout-like dict.
    '''
    return convert_parrow_data(ak.to_arrow(input_awkward)[row_index])

def pickle_to_dict(filename):
    with open(filename, 'rb') as handle:
        load_dict = pickle.load(handle)
    return load_dict

def parquet_to_dict(filename):
    return pyarrow_to_dict(parquet_to_pyarrow(filename))
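
# Illustrative sketch (not in the original file). Note the asymmetry: dict_to_pickle
# appends '.pkl' to the given filename, while pickle_to_dict expects the full path:
#
# >>> dict_to_pickle({'a': np.ones(3)}, 'mydata')      # writes mydata.pkl
# >>> pickle_to_dict('mydata.pkl')['a']
# array([1., 1., 1.])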
# between pandas and awkward
def pandas_to_awkward(input_pandas):
    # TODO: not implemented yet; one would first need to split the 2D arrays, e.g.:
    # input_pandas = input_pandas.copy()
    # return dict_to_awkward(pandas_to_dict(input_pandas))
    print("TODO")

def awkward_to_pandas(input_awkward):
    print("TODO")
# reading from parquet to pandas/awkward without type loss
def parquet_to_pandas(filename):
    '''
    It reads a **single** parquet file into a pandas dataframe with no data type loss.
    '''
    return dict_to_pandas(parquet_to_dict(filename))

def parquet_to_awkward(filename):
    return ak.from_parquet(filename)
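
# Illustrative end-to-end sketch (not in the original file): a 2D array should survive
# the dict -> parquet -> dict round trip (dict_to_parquet appends '.parquet'):
#
# >>> d = {'device1': {'value': np.arange(4).reshape(2, 2)}}
# >>> dict_to_parquet(d, 'acq')                         # writes acq.parquet
# >>> parquet_to_dict('acq.parquet')['device1']['value']
# array([[0, 1],
#        [2, 3]])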