datascout / Commits / 1fb270e2

Commit 1fb270e2, authored 3 years ago by Davide Gamba

    starting putting code together

Parent: e39b66ab
No related branches, tags, or merge requests found.
Pipeline #2575528 failed (3 years ago), stage: test
Changes: 2 | Pipelines: 1
Showing 2 changed files with 340 additions and 1 deletion:
    datascout/__init__.py    +29  −1
    datascout/_datascout.py  +311 −0
datascout/__init__.py (+29, −1)
"""
"""
Documenta
tion for
the
data
s
co
ut package
list of sweet func
tion
s
for data
co
nversion and writing to disk
"""
"""
__version__
=
"
0.0.1.dev0
"
__version__
=
"
0.0.1.dev0
"
# for the user
from
._datascout
import
dict_to_pandas
from
._datascout
import
dict_to_awkward
from
._datascout
import
dict_to_parquet
from
._datascout
import
dict_to_pickle
from
._datascout
import
dict_to_json
# coming back
from
._datascout
import
pandas_to_dict
from
._datascout
import
awkward_to_dict
from
._datascout
import
pickle_to_dict
from
._datascout
import
parquet_to_dict
# between pandas and awkward
from
._datascout
import
pandas_to_awkward
from
._datascout
import
awkward_to_pandas
# reading from parquet to pandas/awkward without type loss
from
._datascout
import
parquet_to_pandas
from
._datascout
import
parquet_to_awkward
# to look at pyarrow, typically not used by a user
from
._datascout
import
dict_to_pyarrow
from
._datascout
import
pyarrow_to_parquet
from
._datascout
import
parquet_to_pyarrow
from
._datascout
import
pyarrow_to_dict
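The exports above support a simple round trip from a pyjapcscout-like dict to disk and back. A minimal usage sketch (hypothetical file name; note that dict_to_parquet appends the .parquet extension itself):

    import numpy as np
    import datascout as ds

    data = {'device1': {'value': np.array([[1, 2, 3], [4, 5, 6]])}}  # pyjapcscout-like dict
    ds.dict_to_parquet(data, 'scan')                 # writes scan.parquet
    data_back = ds.parquet_to_dict('scan.parquet')   # 2D array is reconstructed
    df = ds.parquet_to_pandas('scan.parquet')        # same content as a one-row DataFrame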
datascout/_datascout.py (new file, 0 → 100644; +311, −0)
"""
Implementation of sweet functions to convert data from one type to anothre
"""
import
numpy
as
np
import
pandas
as
pd
import
awkward
as
ak
import
pyarrow.parquet
as
pq
import
numpy
as
np
import
pyarrow
as
pa
import
pickle
import
datetime
import
copy
######
# Functions needed to split 2D arrays

def split_2D_array(val, in_memory=False, split_to_list=False, verbose=False):
    '''
    split_2D_array(val, in_memory=False, split_to_list=False, verbose=False)

    It converts numpy 2D arrays into either:
     - 1D "object" arrays containing 1D val.dtype arrays (split_to_list=False)
     - a list of 1D val.dtype arrays (split_to_list=True)
    By default, split_to_list=False.
    It returns the split value, or the original value if the input was not a 2D numpy array.
    If in_memory == True (default=False), data is not copied but just represented in a different form.
    '''
    if (type(val) == np.ndarray) and len(np.shape(val)) == 2:
        if not in_memory:
            val = copy.deepcopy(val)
            if verbose:
                print('made a copy of ' + str(val))
        if split_to_list:
            newVal = list(val)
        else:
            # (TODO: probably to be done better!!!)
            auxDim = np.shape(val)[0]
            # split val, without making a data copy
            auxData = np.split(np.ravel(val), auxDim)
            # put it in an object array
            newVal = np.empty((auxDim,), dtype=object)
            for i in range(auxDim):
                newVal[i] = auxData[i]
        if verbose:
            print('-----')
            print(str(val) + ' (' + str(type(val)) + ')')
            print(' -> converted to -> ')
            print(str(newVal) + ' (' + str(type(newVal)) + ')')
        return newVal
    else:
        return val
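# For instance, split_2D_array(np.arange(6.).reshape(2, 3)) should return a
# 1D object array holding two 1D float arrays:
#   array([array([0., 1., 2.]), array([3., 4., 5.])], dtype=object)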
def convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False):
    '''
    Parse the input data, which should be a list or a dict, and convert all 2D arrays into either
     - a 1D object array of 1D arrays
     - a 1D list of 1D arrays
    If in_memory=True (default False), it changes the data in memory.
    In any case, the modified data is returned.

    NOTE: The conversion is done so as to reduce data copies to the minimum (I think): i.e. the
    actual data is not copied (or copies are reduced to the minimum).
    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
    '''
    if in_memory == False:
        data = copy.copy(data)
    if type(data) == list:
        # enumerate so that converted entries are written back into the list
        for i, entry in enumerate(data):
            if type(entry) == list or type(entry) == dict:
                data[i] = convert_dict_list(entry, in_memory=in_memory,
                                            split_to_list=split_to_list, verbose=verbose)
            elif type(entry) == np.ndarray:
                data[i] = split_2D_array(entry, in_memory=in_memory,
                                         split_to_list=split_to_list, verbose=verbose)
    elif type(data) == dict:
        for key in data.keys():
            if type(data[key]) == list or type(data[key]) == dict:
                data[key] = convert_dict_list(data[key], in_memory=in_memory,
                                              split_to_list=split_to_list, verbose=verbose)
            elif type(data[key]) == np.ndarray:
                data[key] = split_2D_array(data[key], in_memory=in_memory,
                                           split_to_list=split_to_list, verbose=verbose)
    return data
######
# Functions needed to re-merge 1D arrays of 1D arrays into 2D arrays

def merge_to_2D(val, string_as_obj=False, verbose=False):
    '''
    merge_to_2D(val, string_as_obj=False, verbose=False)

    It converts numpy arrays of "object" dtype back into 2D arrays.
    By construction, if conversion actually occurs, this operation makes a copy of
    the data (probably with some exceptions).

    string_as_obj=False (default):
    This option (if enabled) makes sure that the returned object is a 2D array of
    "object" data type in the case of string arrays. This is necessary in case you want to edit
    one string of the array without it being cut...
    '''
    if ((type(val) == np.ndarray) and val.dtype == object) or (type(val) == list):
        newVal = np.stack(val)
        # fix a subtle issue with strings, which I am assuming are arrays of objects
        if string_as_obj and (newVal.dtype.type is np.str_):
            newVal = newVal.astype(object)
        if verbose:
            print('-----')
            print(str(val) + ' (' + str(type(val)) + ')')
            print(' -> reverted to -> ')
            print(str(newVal) + ' (' + str(newVal.dtype) + ')')
        return newVal
    else:
        return val
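# merge_to_2D is meant as the inverse of split_2D_array: for instance,
# merge_to_2D(split_2D_array(np.arange(6.).reshape(2, 3))) should give back
# the original (2, 3) float array.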
def revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False):
    '''
    Parse the input data, which should be a list or a dict, and convert all 1D arrays of "object"
    type into 2D arrays of the proper data type.
    If string_as_obj=True (default=False), the obtained 2D arrays of strings are converted into
    2D arrays of "object" dtype.
    If in_memory=True (default False), it changes the data in memory.
    In any case, the modified data is returned.

    NOTE: The conversion is done so as to reduce data copies to the minimum (I think): i.e. the
    actual data is not copied (or copies are reduced to the minimum).
    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
    '''
    if in_memory == False:
        data = copy.copy(data)
    if type(data) == list:
        # enumerate so that merged entries are written back into the list
        for i, entry in enumerate(data):
            if type(entry) == dict:
                revert_dict_list(entry)
            elif type(entry) == list or type(entry) == np.ndarray:
                entry = merge_to_2D(entry, string_as_obj=string_as_obj, verbose=verbose)
                data[i] = entry
                if len(entry) > 0 and isinstance(entry.flatten()[0], dict):
                    for nested_data in entry.flatten():
                        revert_dict_list(nested_data)
    elif type(data) == dict:
        for key in data.keys():
            if type(data[key]) == dict:
                revert_dict_list(data[key])
            elif type(data[key]) == list or type(data[key]) == np.ndarray:
                data[key] = merge_to_2D(data[key], string_as_obj=string_as_obj, verbose=verbose)
                if len(data[key]) > 0 and isinstance(data[key].flatten()[0], dict):
                    for nested_data in data[key].flatten():
                        revert_dict_list(nested_data)
    return data
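# Round trip: for a pyjapcscout-like dict d containing 2D arrays,
# revert_dict_list(convert_dict_list(d)) should restore the 2D arrays
# that convert_dict_list had split into object arrays of 1D arrays.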
######
# CORE function of this project: it allows converting a pyarrow object into a dict
#

def convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_array=False):
    '''
    convert_parrow_data(data)

    It extracts data from a pyarrow object into a "standard" pyjapcscout-like dict dataset,
    i.e. a dictionary with only non-null numpy objects/arrays and no lists (unless you enable
    use_list_for_2D_array).
    If treat_str_arrays_as_str (default=True), it will try to preserve the str data type also for arrays.
    If use_list_for_2D_array (default=False), it will try to use lists of 1D arrays instead of 2D arrays.
    '''
    if isinstance(data, pa.lib.Table):
        output = dict()
        for column in data.column_names:
            # if len(data['device1']) -> probably to be done something like this...
            device_dict = dict()
            # those should be value, header, exception
            for item in data[column][0].items():
                # this can be iterated... I think
                device_dict[item[0]] = convert_parrow_data(item[1])
            output[column] = device_dict
        return output
    if isinstance(data, pa.StructScalar):
        output_dict = dict()
        for item in data.items():
            output_dict[item[0]] = convert_parrow_data(item[1])
        return output_dict
    elif isinstance(data, pa.ListScalar):
        if isinstance(data.type.value_type, pa.lib.ListType):
            aux_dtype = data.type.value_type.value_type.to_pandas_dtype()
            if treat_str_arrays_as_str and data.type.value_type.value_type.equals(pa.string()):
                # actually a string! not a generic object....
                aux_dtype = np.str_
            return np.array(data.as_py(), dtype=aux_dtype)
        elif isinstance(data.type.value_type, pa.lib.DataType):
            if isinstance(data.type.value_type, pa.lib.StructType):
                if use_list_for_2D_array:
                    auxOutput = []
                    for auxValue in data.values:
                        auxOutput.append(convert_parrow_data(auxValue))
                    return auxOutput
                else:
                    auxOutput = np.empty((len(data.values),), dtype=object)
                    for i, auxValue in enumerate(data.values):
                        auxOutput[i] = convert_parrow_data(auxValue)
                    return auxOutput
            else:
                # could be a 1D array of some data type
                aux_dtype = data.type.value_type.to_pandas_dtype()
                if treat_str_arrays_as_str and data.type.value_type.equals(pa.string()):
                    # actually a string! not a generic object....
                    aux_dtype = np.str_
                return np.array(data.as_py(), dtype=aux_dtype)
        else:
            print('Zzzuth...')
            return data
    elif issubclass(type(data), pa.lib.Scalar):
        # horrible casting!... did not find a better way....
        return data.type.to_pandas_dtype()(data.as_py())
    else:
        print('Sigh... unknown data type: ' + str(type(data)))
        return data
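# For instance, given the helpers defined below, something like
#   table = dict_to_pyarrow({'device1': {'value': np.array([1, 2, 3])}})
#   convert_parrow_data(table)
# should return {'device1': {'value': array([1, 2, 3])}} as a plain numpy-based dict.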
###### Some important functions, not so interesting for the standard user, but fundamental

def dict_to_pyarrow(input_dict):
    my_data_dict_converted = convert_dict_list(input_dict, in_memory=False,
                                               split_to_list=False, verbose=False)
    return pa.Table.from_pandas(pd.DataFrame([my_data_dict_converted]))

def pyarrow_to_parquet(input_pa, filename):
    pq.write_table(input_pa, filename)

def parquet_to_pyarrow(filename):
    return pq.read_table(filename)

def pyarrow_to_dict(input_pa):
    return convert_parrow_data(input_pa)
####### The functions interesting for the user

def dict_to_pandas(input_dict):
    if not isinstance(input_dict, list):
        input_dict = [input_dict]
    return pd.DataFrame(input_dict)
def dict_to_awkward(input_dict):
    return ak.from_arrow(dict_to_pyarrow(input_dict))

def dict_to_parquet(input_dict, filename):
    # we could also just go to pandas, and then to parquet:
    # dict_to_pandas(input_dict).to_parquet(filename)
    pyarrow_to_parquet(dict_to_pyarrow(input_dict), filename + '.parquet')
def dict_to_pickle(input_dict, filename):
    with open(filename + '.pkl', 'wb') as handle:
        pickle.dump(input_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

def dict_to_json(input_dict, filename):
    '''
    Function provided for convenience, but not of interest for the typical use case...
    '''
    dict_to_pandas(input_dict).to_json(filename + '.json')
def json_to_pandas(filename):
    '''
    Function provided for convenience, but not of interest for the typical use case...
    '''
    return pd.read_json(filename)
def pandas_to_dict(input_pandas, row_index=0):
    '''
    It converts the specified row of a pandas dataframe into a pyjapcscout-like dict.
    '''
    return input_pandas.iloc[row_index].to_dict()

def awkward_to_dict(input_awkward, row_index=0):
    '''
    It converts the specified row of an awkward array into a pyjapcscout-like dict.
    '''
    return convert_parrow_data(ak.to_arrow(input_awkward)[row_index])
def pickle_to_dict(filename):
    with open(filename, 'rb') as handle:
        load_dict = pickle.load(handle)
    return load_dict
def parquet_to_dict(filename):
    return pyarrow_to_dict(parquet_to_pyarrow(filename))
# between pandas and awkward

def pandas_to_awkward(input_pandas):
    print("TODO")
    # still to be implemented, e.g.:
    # input_pandas = input_pandas.copy()
    # I need to split its 2D arrays first...
    # return dict_to_awkward(pandas_to_dict(input_pandas))

def awkward_to_pandas(input_awkward):
    print("TODO")
# reading from parquet to pandas without type loss

def parquet_to_pandas(filename):
    '''
    It reads a **single** parquet file into a pandas dataframe with no data type loss
    '''
    return dict_to_pandas(parquet_to_dict(filename))

def parquet_to_awkward(filename):
    return ak.from_parquet(filename)
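Taken together, the user-facing functions also cover purely in-memory conversions. A minimal sketch of the dict/pandas/awkward round trips (both *_to_dict helpers return the row selected by row_index, default 0):

    import numpy as np
    import datascout as ds

    d = {'device1': {'value': np.array([1.0, 2.0, 3.0])}}

    df = ds.dict_to_pandas(d)     # one-row pandas DataFrame
    d2 = ds.pandas_to_dict(df)    # back to a dict (row 0)

    arr = ds.dict_to_awkward(d)   # awkward Array, via pyarrow
    d3 = ds.awkward_to_dict(arr)  # back to a dict (row 0)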