Update _datascout.py

be8b62f3 · Guido Sterbini · 316f32e7 · be8b62f3
Commit be8b62f3 authored 3 years ago by Guido Sterbini
--- a/datascout/_datascout.py
+++ b/datascout/_datascout.py
@@ -12,6 +12,7 @@ import datetime
 import copy
 import os
 import scipy
+import glob
 from pathlib import Path
@@ -509,3 +510,48 @@ def _compare_data(data1, data2, use_assert=False):
                _compare_data(data1.flatten()[i], data2.flatten()[i], use_assert)
        else:
            not_equal(data1, data2)
+def _compare_schema(a, reference_file = None, verbose = False):
+    """
+    Compare the schema of each parquet file of the list **a** with a reference schema.
+
+    The default reference file is the first one of the list **a**.
+    It is possible to define a different reference file using **reference_file**.
+    If **verbose** is True, the reference file and, for each file with a
+    different schema, the fields present on one side only are printed.
+
+    Returns the list of the files of **a** whose schema differs from the
+    reference one, in the order of **a**. The list is empty if **a** is empty
+    (a message is printed in that case) or if all the schemas are equal.
+    """
+    if len(a) == 0:
+        print('No file in the folder.')
+        return []
+    if reference_file is None:
+        reference_file = a[0]
+    # Report the file actually used as reference (not unconditionally a[0]).
+    if verbose: print(f'Reference schema: {reference_file}')
+    ref_schema = pq.read_schema(reference_file)
+    my_list = []
+    for ii in a:
+        # Read the schema of the file only once and reuse it for the report.
+        schema = pq.read_schema(ii)
+        if not ref_schema==schema:
+            if verbose:
+                print (f'\n ****** {ii} has a different schema! ****** ')
+                print('This is the difference\n === Schema of the reference file ===')
+                print(set(ref_schema)-set(schema))
+                print('This is the difference\n === Schema of the compared file ===')
+                print(set(schema)-set(ref_schema))
+            my_list.append(ii)
+    return my_list
+def split_schemas(list_of_files):
+    """
+    Group the files of **list_of_files** by schema.
+
+    Returns a list of sorted lists. Each group contains the files whose schema
+    is equal to the one of the first file of the files still to be grouped
+    (the comparison is done by _compare_schema with its default reference).
+    """
+    groups = []
+    remaining = list_of_files
+    while True:
+        different = _compare_schema(remaining, reference_file = None, verbose = False)
+        if len(different)==0:
+            # All the remaining files share the same schema: last group.
+            groups.append(sorted(remaining))
+            return groups
+        groups.append(sorted(list(set(remaining)-set(different))))
+        # Continue with the files that did not match the current reference.
+        remaining = different
+def compare_schema(folder, reference_file = None, verbose = False):
+    """
+    Given a **folder**, it compares the schema of all the parquet files in it.
+
+    The files are searched recursively with the pattern ``<folder>/**/*.parquet``
+    and sorted by name. **folder** can be a string or a path-like object
+    (e.g. a ``pathlib.Path``). **reference_file** and **verbose** are passed
+    unchanged to _compare_schema, whose result is returned.
+    """
+    # os.fspath leaves a string untouched and converts path-like objects,
+    # which would otherwise fail on the string concatenation.
+    pattern = os.fspath(folder)+'/**/*.parquet'
+    return _compare_schema(sorted(glob.glob(pattern, recursive=True)),
+                           reference_file, verbose)