Skip to content
Snippets Groups Projects
Commit be8b62f3 authored by Guido Sterbini's avatar Guido Sterbini
Browse files

Update _datascout.py

parent 316f32e7
Branches
No related tags found
No related merge requests found
Checking pipeline status
...@@ -12,6 +12,7 @@ import datetime ...@@ -12,6 +12,7 @@ import datetime
import copy import copy
import os import os
import scipy import scipy
import glob
from pathlib import Path from pathlib import Path
...@@ -509,3 +510,48 @@ def _compare_data(data1, data2, use_assert=False): ...@@ -509,3 +510,48 @@ def _compare_data(data1, data2, use_assert=False):
_compare_data(data1.flatten()[i], data2.flatten()[i], use_assert) _compare_data(data1.flatten()[i], data2.flatten()[i], use_assert)
else: else:
not_equal(data1, data2) not_equal(data1, data2)
def _compare_schema(a, reference_file = None, verbose = False):
"""
Compare the schema of a list of file **a**.
The default reference file in the first one of the list **a**.
It is possible to defien a different referenve file usint the **reference_file**.
"""
my_list = []
if len(a) > 0:
if verbose: print(f'Reference schema: {a[0]}')
if reference_file == None:
reference_file = a[0]
ref_schema = pq.read_schema(reference_file)
for ii in a:
if not ref_schema==pq.read_schema(ii):
if verbose:
print (f'\n ****** {ii} has a different schema! ****** ')
print('This is the difference\n === Schema of the reference file ===')
print(set(ref_schema)-set(pq.read_schema(ii)))
print('This is the difference\n === Schema of the compared file ===')
print(set(pq.read_schema(ii))-set(ref_schema))
my_list.append(ii)
else:
print('No file in the folder.')
return my_list
def split_schemas(list_of_files):
    """
    Group the files of **list_of_files** by schema.

    At each step the files sharing the schema of the first remaining file are
    collected in a sorted group, and the files with a different schema (as
    returned by _compare_schema) are processed in the next step.

    Returns a list of sorted lists of files, one list per group.
    """
    groups = []
    remaining = list_of_files
    while True:
        mismatched = _compare_schema(remaining, reference_file = None, verbose = False)
        if not mismatched:
            # All the remaining files agree with the first one: last group.
            groups.append(sorted(remaining))
            return groups
        matching = set(remaining) - set(mismatched)
        groups.append(sorted(matching))
        remaining = mismatched
def compare_schema(folder, reference_file = None, verbose = False):
    """
    Given a **folder**, it compares the schema of all parquet files in it.

    The folder is searched recursively for ``*.parquet`` files; the sorted list
    of the files found is passed to _compare_schema together with
    **reference_file** and **verbose**.

    **folder** can be a string or a path-like object (e.g. a pathlib.Path).

    Returns the output of _compare_schema, i.e. the list of the files whose
    schema differs from the reference one.
    """
    # os.fspath accepts path-like objects and returns plain strings unchanged,
    # so the glob pattern is the same as before for string folders.
    pattern = os.fspath(folder) + '/**/*.parquet'
    return _compare_schema(sorted(glob.glob(pattern, recursive=True)),
                           reference_file, verbose)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment