diff --git a/datascout/__init__.py b/datascout/__init__.py
index 5bc1d08fff8580ada6c059c4d4e9c6ffab4a2d93..3711ef45840815fa7bf84a3ef5167c95031a0dde 100644
--- a/datascout/__init__.py
+++ b/datascout/__init__.py
@@ -3,7 +3,7 @@
 list of sweet functions for data conversion and writing to disk
 """
 
-__version__ = "0.0.1.dev0"
+__version__ = "0.0.1.beta0"
 
 
 # to look at pyarrow, typically not used by a user,
diff --git a/datascout/_datascout.py b/datascout/_datascout.py
index 2166aa14cf0314c983f4d0463927825e0a859ee5..1fb3335caa004ddff853c1dd28ea3f418cf50ce2 100644
--- a/datascout/_datascout.py
+++ b/datascout/_datascout.py
@@ -70,11 +70,11 @@ def _convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False
     if in_memory == False:
         data = copy.copy(data)
     if type(data) == list:
-        for entry in data:
-            if type(entry) == list or type(entry) == dict:
-                entry = _convert_dict_list(entry)
-            elif type(entry) == np.ndarray:
-                entry = _split_2D_array(entry, in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
+        for i in range(len(data)):
+            if type(data[i]) == list or type(data[i]) == dict:
+                data[i] = _convert_dict_list(data[i])
+            elif type(data[i]) == np.ndarray:
+                data[i] = _split_2D_array(data[i], in_memory=in_memory, split_to_list=split_to_list, verbose=verbose)
     elif type(data) == dict:
         for key in data.keys():
             if type(data[key]) == list or type(data[key]) == dict:
@@ -166,23 +166,40 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr
 
     if treat_str_arrays_as_str (default=True) it will try to preserve str data type also for arrays
     if use_list_for_2D_array (default=False) it will try to use lists of 1D arrays instead of 2D arrays
+
+    Typically the output is a `dict`. If, however, one converts a more complex structure such as
+    a pyarrow Table or StructArray, the output will be a list of dicts when more than one data record is found.
     '''
     if isinstance(data, pa.lib.Table):
+        output = []
+        for irow in range(data.num_rows):
+            outputRow = dict()
+            for column in data.column_names:
+                # build one dict per device (column)
+                device_dict = dict()
+                # those should be value, header, exception
+                for item in data[column][irow].items():
+                    # convert each field recursively
+                    device_dict[item[0]] = _convert_parrow_data(item[1], treat_str_arrays_as_str, use_list_for_2D_array)
+                outputRow[column] = device_dict
+            output.append(outputRow)
+        if len(output) == 1:
+            return output[0]
+        else:
+            return output
+    elif isinstance(data, pa.StructArray):
+        output = []
+        for row in data:
+            output.append(_convert_parrow_data(row, treat_str_arrays_as_str, use_list_for_2D_array))
+        if len(output) == 1:
+            return output[0]
+        else:
+            return output
+    elif isinstance(data, pa.StructScalar):
         output = dict()
-        for column in data.column_names:
-            #if len(data['device1']) -> probably to be done something like this...
-            device_dict = dict()
-            # those should be value, header, exception
-            for item in data[column][0].items():
-                # this can be iterated... I think
-                device_dict[item[0]] = _convert_parrow_data(item[1])
-            output[column] = device_dict
-        return output
-    if isinstance(data, pa.StructScalar):
-        output_dict = dict()
         for item in data.items():
-            output_dict[item[0]] = _convert_parrow_data(item[1])
-        return output_dict
+            output[item[0]] = _convert_parrow_data(item[1], treat_str_arrays_as_str, use_list_for_2D_array)
+        return output
     elif isinstance(data, pa.ListScalar):
         if isinstance(data.type.value_type, pa.lib.ListType):
             aux_dtype= data.type.value_type.value_type.to_pandas_dtype()
@@ -193,15 +210,15 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr
         elif isinstance(data.type.value_type, pa.lib.DataType):
             if isinstance(data.type.value_type, pa.lib.StructType):
                 if use_list_for_2D_array:
-                    auxOutput = []
+                    output = []
                     for auxValue in data.values:
-                        auxOutput.append(_convert_parrow_data(auxValue))
-                    return auxOutput
+                        output.append(_convert_parrow_data(auxValue, treat_str_arrays_as_str, use_list_for_2D_array))
+                    return output
                 else:
-                    auxOutput = np.empty((len(data.values),), dtype=object)
+                    output = np.empty((len(data.values),), dtype=object)
                     for i, auxValue in enumerate(data.values):
-                        auxOutput[i] = _convert_parrow_data(auxValue)
-                    return auxOutput
+                        output[i] = _convert_parrow_data(auxValue, treat_str_arrays_as_str, use_list_for_2D_array)
+                    return output
             else:
                 # could be a 1D array of some data type
                 aux_dtype = data.type.value_type.to_pandas_dtype()
@@ -224,7 +241,9 @@ def _convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_arr
 
 def dict_to_pyarrow(input_dict):
     my_data_dict_converted = _convert_dict_list(input_dict, in_memory=False, split_to_list=False, verbose=False)
-    return pa.Table.from_pandas(pd.DataFrame([my_data_dict_converted]))
+    if not isinstance(my_data_dict_converted, list):
+        my_data_dict_converted = [my_data_dict_converted]
+    return pa.Table.from_pandas(pd.DataFrame(my_data_dict_converted))
 
 def pyarrow_to_parquet(input_pa, filename):
     pq.write_table(input_pa, filename)
@@ -234,6 +253,7 @@ def parquet_to_pyarrow(filename):
 
 def pyarrow_to_dict(input_pa):
     return _convert_parrow_data(input_pa)
+
 
 def pyarrow_to_pandas(input_pa):
     return dict_to_pandas(pyarrow_to_dict(input_pa))
@@ -275,17 +295,21 @@ def json_to_pandas(filename):
     '''
     return df.read_json(filename)
 
-def pandas_to_dict(input_pandas, row_index=0):
+def pandas_to_dict(input_pandas):
     '''
-    it converts the specified row of a pandas dataframe into a pyjapcscout-like dict
+    it converts a pandas dataframe into a pyjapcscout-like dict
+    (or list of dicts in case of many records)
     '''
-    return input_pandas.iloc[row_index].to_dict()
+    output = input_pandas.to_dict('records')
+    if len(output) == 1:
+        output = output[0]
+    return output
 
-def awkward_to_dict(input_awkward, row_index=0):
+def awkward_to_dict(input_awkward):
     '''
     it converts the specified row of an awkward array into a pyjapcscout-like dict
     '''
-    return _convert_parrow_data(ak.to_arrow(input_awkward)[row_index])
+    return _convert_parrow_data(ak.to_arrow(input_awkward))
 
 def pickle_to_dict(filename):
     with open(filename, 'rb') as handle:
@@ -332,7 +356,7 @@ def _find_lists(data, verbose = False):
 
 
 
-def _compare_data(data1, data2):
+def _compare_data(data1, data2, use_assert = False):
     '''
     Compares two dictionaries or lists and show the differences (of type or data type).
     For a full comparison, it is sometimes best to call this function also with inverted
@@ -343,23 +367,25 @@ def _compare_data(data1, data2):
         print(' NOT EQUAL ')
         print(str(b) + ' (' + str(type(b)) + ')')
         print(' ------ ')
+        if use_assert:
+            raise AssertionError('{} not equal to {}'.format(a, b))
 
     if (type(data1) != type(data2)) or (hasattr(data1, '__len__') and (len(data1) != len(data2))):
         not_equal(data1, data2)
     elif isinstance(data1, list):
         for i in range(len(data1)):
-            _compare_data(data1[i], data2[i])
+            _compare_data(data1[i], data2[i], use_assert)
     elif isinstance(data1, dict):
-        _compare_data(data1.keys(), data2.keys())
+        _compare_data(data1.keys(), data2.keys(), use_assert)
         for key in data1.keys():
-            _compare_data(data1[key], data2[key])
+            _compare_data(data1[key], data2[key], use_assert)
     elif isinstance(data1, np.ndarray):
         if data1.dtype != object:
             if not np.array_equal(data1, data2):
                 not_equal(data1, data2)
         elif data1.shape == data2.shape:
             for i in range(data1.size):
-                _compare_data(data1.flatten()[i], data2.flatten()[i])
+                _compare_data(data1.flatten()[i], data2.flatten()[i], use_assert)
         else:
             not_equal(data1, data2)
 
diff --git a/datascout/tests/test_dataconversion.py b/datascout/tests/test_dataconversion.py
index 7a97fd4bb4c811b78b59a8b62334063948344200..bb8b01fb1d7d1a8da84ddede0148d1816ec748c5 100644
--- a/datascout/tests/test_dataconversion.py
+++ b/datascout/tests/test_dataconversion.py
@@ -66,40 +66,35 @@ def generate_data_dict():
 
 
 
-def test_data_conversion():
-    # generate dataset
-    my_data_dict = generate_data_dict()
-
+def generic_test_data_conversion(my_data_dict):
     # make a reference copy of selected dict
     my_data_dict_ref = copy.deepcopy(my_data_dict)
 
     # go to panda and back without altering initial data
     my_pandas = datascout.dict_to_pandas(my_data_dict)
-    datascout._compare_data(my_data_dict, my_data_dict_ref)
+    datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True)
     my_data_back = datascout.pandas_to_dict(my_pandas)
-    datascout._compare_data(my_data_back, my_data_dict_ref)
+    datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True)
 
     # go to pyarrow and back without altering initial data
     my_pyarrow = datascout.dict_to_pyarrow(my_data_dict)
-    datascout._compare_data(my_data_dict, my_data_dict_ref)
+    datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True)
     my_data_back = datascout.pyarrow_to_dict(my_pyarrow)
-    datascout._compare_data(my_data_back, my_data_dict_ref)
+    datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True)
 
     # go to awkward and back without altering initial data
     my_ak = datascout.dict_to_awkward(my_data_dict)
-    datascout._compare_data(my_data_dict, my_data_dict_ref)
+    datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True)
     my_data_back = datascout.awkward_to_dict(my_ak)
-    datascout._compare_data(my_data_back, my_data_dict_ref)
+    datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True)
 
     # a long chain
     my_data_back = datascout.awkward_to_dict(datascout.pandas_to_awkward(datascout.pyarrow_to_pandas(datascout.dict_to_pyarrow(my_data_dict))))
-    datascout._compare_data(my_data_dict, my_data_dict_ref)
-    datascout._compare_data(my_data_back, my_data_dict_ref)
+    datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True)
+    datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True)
 
 
-def test_save_load(tmpdir):
-    # generate dataset
-    my_data_dict = generate_data_dict()
+def generic_test_save_load(tmpdir, my_data_dict):
     # make a reference copy of selected dict
     my_data_dict_ref = copy.deepcopy(my_data_dict)
 
@@ -109,18 +104,47 @@ def test_save_load(tmpdir):
 
     # go to parquet
     datascout.dict_to_parquet(my_data_dict, temp_filename_parquet)
-    datascout._compare_data(my_data_dict, my_data_dict_ref)
+    datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True)
     my_data_back = datascout.parquet_to_dict(temp_filename_parquet)
-    datascout._compare_data(my_data_back, my_data_dict_ref)
+    datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True)
 
     # go to pickle
     datascout.dict_to_pickle(my_data_dict, temp_filename_pickle)
-    datascout._compare_data(my_data_dict, my_data_dict_ref)
+    datascout._compare_data(my_data_dict, my_data_dict_ref, use_assert = True)
     my_data_back = datascout.pickle_to_dict(temp_filename_pickle)
-    datascout._compare_data(my_data_back, my_data_dict_ref)
+    datascout._compare_data(my_data_back, my_data_dict_ref, use_assert = True)
+
+
+def test_data_conversion():
+    # generate dataset
+    my_data_dict = generate_data_dict()
+    # test it
+    generic_test_data_conversion(my_data_dict)
+
+
+def test_data_array_conversion():
+    # generate dataset
+    my_data_dict = [generate_data_dict(), generate_data_dict(), generate_data_dict()]
+    # test it
+    generic_test_data_conversion(my_data_dict)
+
+def test_save_load(tmpdir):
+    # generate dataset
+    my_data_dict = generate_data_dict()
+    # test it
+    generic_test_save_load(tmpdir, my_data_dict)
+
+def test_save_load_array(tmpdir):
+    # generate dataset
+    my_data_dict = [generate_data_dict(), generate_data_dict(), generate_data_dict()]
+    # test it
+    generic_test_save_load(tmpdir, my_data_dict)
+
 
 
-return # Function above can be locally tests as:
+''' Functions above can be locally tested as:
 from pathlib import Path
 tmpdir=Path('.')
-test_save_load(tmpdir)
\ No newline at end of file
+test_save_load(tmpdir)
+test_save_load_array(tmpdir)
+'''
\ No newline at end of file
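
Usage sketch (not part of the patch, assuming the datascout API as modified above): dict_to_pyarrow now also accepts a list of pyjapcscout-like dicts, and pyarrow_to_dict returns a list of dicts whenever the converted Table holds more than one record. The device and field names below are placeholders.

    import numpy as np
    import datascout

    # two records of the same (hypothetical) device
    records = [
        {'device1': {'value': np.array([1.0, 2.0])}},
        {'device1': {'value': np.array([3.0, 4.0])}},
    ]

    table = datascout.dict_to_pyarrow(records)  # list input -> multi-row pyarrow Table
    back = datascout.pyarrow_to_dict(table)     # more than one record -> list of dicts
    assert isinstance(back, list) and len(back) == 2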