datascout / Commits / 1fb270e2

Commit 1fb270e2, authored 3 years ago by Davide Gamba

    starting putting code together

Parent: e39b66ab
No related branches, tags, or merge requests found.
Pipeline #2575528 failed (3 years ago), stage: test
Changes: 2 | Pipelines: 1
Showing 2 changed files with 340 additions and 1 deletion:
    datascout/__init__.py    +29  −1
    datascout/_datascout.py  +311 −0
datascout/__init__.py (+29, −1)
"""
"""
Documenta
tion for
the
data
s
co
ut package
list of sweet func
tion
s
for data
co
nversion and writing to disk
"""
"""
__version__
=
"
0.0.1.dev0
"
__version__
=
"
0.0.1.dev0
"
# for the user
from
._datascout
import
dict_to_pandas
from
._datascout
import
dict_to_awkward
from
._datascout
import
dict_to_parquet
from
._datascout
import
dict_to_pickle
from
._datascout
import
dict_to_json
# coming back
from
._datascout
import
pandas_to_dict
from
._datascout
import
awkward_to_dict
from
._datascout
import
pickle_to_dict
from
._datascout
import
parquet_to_dict
# between pandas and awkward
from
._datascout
import
pandas_to_awkward
from
._datascout
import
awkward_to_pandas
# reading from parquet to pandas/awkward without type loss
from
._datascout
import
parquet_to_pandas
from
._datascout
import
parquet_to_awkward
# to look at pyarrow, typically not used by a user
from
._datascout
import
dict_to_pyarrow
from
._datascout
import
pyarrow_to_parquet
from
._datascout
import
parquet_to_pyarrow
from
._datascout
import
pyarrow_to_dict
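The exports above support a simple round trip from a pyjapcscout-like dict to disk and back. A minimal usage sketch (hypothetical file name; note that dict_to_parquet appends the .parquet extension itself):

    import numpy as np
    import datascout as ds

    data = {'device1': {'value': np.array([[1, 2, 3], [4, 5, 6]])}}  # pyjapcscout-like dict
    ds.dict_to_parquet(data, 'scan')                 # writes scan.parquet
    data_back = ds.parquet_to_dict('scan.parquet')   # 2D array is reconstructed
    df = ds.parquet_to_pandas('scan.parquet')        # same content as a one-row DataFrame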
datascout/_datascout.py (new file, 0 → 100644; +311, −0)
"""
Implementation of sweet functions to convert data from one type to anothre
"""
import
numpy
as
np
import
pandas
as
pd
import
awkward
as
ak
import
pyarrow.parquet
as
pq
import
numpy
as
np
import
pyarrow
as
pa
import
pickle
import
datetime
import
copy
######
# Functions needed to split 2D arrays

def split_2D_array(val, in_memory=False, split_to_list=False, verbose=False):
    '''
    split_2D_array(val, in_memory=False, split_to_list=False, verbose=False)

    It converts numpy 2D arrays into either:
     - 1D "object" arrays containing 1D val.dtype arrays (split_to_list=False)
     - a list of 1D val.dtype arrays (split_to_list=True)
    By default, split_to_list=False.
    It returns the split value, or the original value if the input was not a 2D numpy array.
    If in_memory == True (default=False), data is not copied but just represented in a different form.
    '''
    if (type(val) == np.ndarray) and len(np.shape(val)) == 2:
        if not in_memory:
            val = copy.deepcopy(val)
            if verbose:
                print('made a copy of ' + str(val))
        if split_to_list:
            newVal = list(val)
        else:
            # (TODO: probably to be done better!!!)
            auxDim = np.shape(val)[0]
            # split val, without making a data copy
            auxData = np.split(np.ravel(val), auxDim)
            # put it in an object array
            newVal = np.empty((auxDim,), dtype=object)
            for i in range(auxDim):
                newVal[i] = auxData[i]
        if verbose:
            print('-----')
            print(str(val) + ' (' + str(type(val)) + ')')
            print(' -> converted to -> ')
            print(str(newVal) + ' (' + str(type(newVal)) + ')')
        return newVal
    else:
        return val
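# For instance, split_2D_array(np.arange(6.).reshape(2, 3)) should return a
# 1D object array holding two 1D float arrays:
#   array([array([0., 1., 2.]), array([3., 4., 5.])], dtype=object)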
def convert_dict_list(data, in_memory=False, split_to_list=False, verbose=False):
    '''
    Parse the input data, which should be a list or a dict, and convert all 2D arrays into either
     - a 1D object array of 1D arrays
     - a 1D list of 1D arrays
    If in_memory=True (default False), it changes the data in memory.
    In any case, the modified data is returned.

    NOTE: The conversion is done so as to reduce data copies to the minimum (I think): i.e. the
    actual data is not copied (or copies are reduced to the minimum).
    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
    '''
    if in_memory == False:
        data = copy.copy(data)
    if type(data) == list:
        # enumerate so that converted entries are written back into the list
        for i, entry in enumerate(data):
            if type(entry) == list or type(entry) == dict:
                data[i] = convert_dict_list(entry, in_memory=in_memory,
                                            split_to_list=split_to_list, verbose=verbose)
            elif type(entry) == np.ndarray:
                data[i] = split_2D_array(entry, in_memory=in_memory,
                                         split_to_list=split_to_list, verbose=verbose)
    elif type(data) == dict:
        for key in data.keys():
            if type(data[key]) == list or type(data[key]) == dict:
                data[key] = convert_dict_list(data[key], in_memory=in_memory,
                                              split_to_list=split_to_list, verbose=verbose)
            elif type(data[key]) == np.ndarray:
                data[key] = split_2D_array(data[key], in_memory=in_memory,
                                           split_to_list=split_to_list, verbose=verbose)
    return data
######
# Functions needed to re-merge 1D arrays of 1D arrays into 2D arrays

def merge_to_2D(val, string_as_obj=False, verbose=False):
    '''
    merge_to_2D(val, string_as_obj=False, verbose=False)

    It converts numpy arrays of "object" dtype back into 2D arrays.
    By construction, if conversion actually occurs, this operation makes a copy of
    the data (probably with some exceptions).

    string_as_obj=False (default):
    This option (if enabled) makes sure that the returned object is a 2D array of
    "object" data type in the case of string arrays. This is necessary in case you want to edit
    one string of the array without it being cut...
    '''
    if ((type(val) == np.ndarray) and val.dtype == object) or (type(val) == list):
        newVal = np.stack(val)
        # fix a subtle issue with strings, which I am assuming are arrays of objects
        if string_as_obj and (newVal.dtype.type is np.str_):
            newVal = newVal.astype(object)
        if verbose:
            print('-----')
            print(str(val) + ' (' + str(type(val)) + ')')
            print(' -> reverted to -> ')
            print(str(newVal) + ' (' + str(newVal.dtype) + ')')
        return newVal
    else:
        return val
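# merge_to_2D is meant as the inverse of split_2D_array: for instance,
# merge_to_2D(split_2D_array(np.arange(6.).reshape(2, 3))) should give back
# the original (2, 3) float array.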
def revert_dict_list(data, in_memory=False, string_as_obj=False, verbose=False):
    '''
    Parse the input data, which should be a list or a dict, and convert all 1D arrays of "object"
    type into 2D arrays of the proper data type.
    If string_as_obj=True (default=False), the obtained 2D arrays of strings are converted into
    2D arrays of "object" dtype.
    If in_memory=True (default False), it changes the data in memory.
    In any case, the modified data is returned.

    NOTE: The conversion is done so as to reduce data copies to the minimum (I think): i.e. the
    actual data is not copied (or copies are reduced to the minimum).
    It is up to the user to make a deepcopy, if desired, of the data before and/or after conversion.
    '''
    if in_memory == False:
        data = copy.copy(data)
    if type(data) == list:
        # enumerate so that merged entries are written back into the list
        for i, entry in enumerate(data):
            if type(entry) == dict:
                revert_dict_list(entry)
            elif type(entry) == list or type(entry) == np.ndarray:
                entry = merge_to_2D(entry, string_as_obj=string_as_obj, verbose=verbose)
                data[i] = entry
                if len(entry) > 0 and isinstance(entry.flatten()[0], dict):
                    for nested_data in entry.flatten():
                        revert_dict_list(nested_data)
    elif type(data) == dict:
        for key in data.keys():
            if type(data[key]) == dict:
                revert_dict_list(data[key])
            elif type(data[key]) == list or type(data[key]) == np.ndarray:
                data[key] = merge_to_2D(data[key], string_as_obj=string_as_obj, verbose=verbose)
                if len(data[key]) > 0 and isinstance(data[key].flatten()[0], dict):
                    for nested_data in data[key].flatten():
                        revert_dict_list(nested_data)
    return data
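# Round trip: for a pyjapcscout-like dict d containing 2D arrays,
# revert_dict_list(convert_dict_list(d)) should restore the 2D arrays
# that convert_dict_list had split into object arrays of 1D arrays.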
######
# CORE function of this project: it allows converting a pyarrow object into a dict
#

def convert_parrow_data(data, treat_str_arrays_as_str=True, use_list_for_2D_array=False):
    '''
    convert_parrow_data(data)

    It extracts data from a pyarrow object into a "standard" pyjapcscout-like dict dataset,
    i.e. a dictionary with only non-null numpy objects/arrays and no lists (unless you enable
    use_list_for_2D_array).
    If treat_str_arrays_as_str (default=True), it will try to preserve the str data type also for arrays.
    If use_list_for_2D_array (default=False), it will try to use lists of 1D arrays instead of 2D arrays.
    '''
    if isinstance(data, pa.lib.Table):
        output = dict()
        for column in data.column_names:
            # if len(data['device1']) -> probably to be done something like this...
            device_dict = dict()
            # those should be value, header, exception
            for item in data[column][0].items():
                # this can be iterated... I think
                device_dict[item[0]] = convert_parrow_data(item[1])
            output[column] = device_dict
        return output
    if isinstance(data, pa.StructScalar):
        output_dict = dict()
        for item in data.items():
            output_dict[item[0]] = convert_parrow_data(item[1])
        return output_dict
    elif isinstance(data, pa.ListScalar):
        if isinstance(data.type.value_type, pa.lib.ListType):
            aux_dtype = data.type.value_type.value_type.to_pandas_dtype()
            if treat_str_arrays_as_str and data.type.value_type.value_type.equals(pa.string()):
                # actually a string! not a generic object....
                aux_dtype = np.str_
            return np.array(data.as_py(), dtype=aux_dtype)
        elif isinstance(data.type.value_type, pa.lib.DataType):
            if isinstance(data.type.value_type, pa.lib.StructType):
                if use_list_for_2D_array:
                    auxOutput = []
                    for auxValue in data.values:
                        auxOutput.append(convert_parrow_data(auxValue))
                    return auxOutput
                else:
                    auxOutput = np.empty((len(data.values),), dtype=object)
                    for i, auxValue in enumerate(data.values):
                        auxOutput[i] = convert_parrow_data(auxValue)
                    return auxOutput
            else:
                # could be a 1D array of some data type
                aux_dtype = data.type.value_type.to_pandas_dtype()
                if treat_str_arrays_as_str and data.type.value_type.equals(pa.string()):
                    # actually a string! not a generic object....
                    aux_dtype = np.str_
                return np.array(data.as_py(), dtype=aux_dtype)
        else:
            print('Zzzuth...')
            return data
    elif issubclass(type(data), pa.lib.Scalar):
        # horrible casting!... did not find a better way....
        return data.type.to_pandas_dtype()(data.as_py())
    else:
        print('Sigh... unknown data type: ' + str(type(data)))
        return data
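# For instance, given the helpers defined below, something like
#   table = dict_to_pyarrow({'device1': {'value': np.array([1, 2, 3])}})
#   convert_parrow_data(table)
# should return {'device1': {'value': array([1, 2, 3])}} as a plain numpy-based dict.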
###### Some important functions, not so interesting for the standard user, but fundamental

def dict_to_pyarrow(input_dict):
    my_data_dict_converted = convert_dict_list(input_dict, in_memory=False,
                                               split_to_list=False, verbose=False)
    return pa.Table.from_pandas(pd.DataFrame([my_data_dict_converted]))

def pyarrow_to_parquet(input_pa, filename):
    pq.write_table(input_pa, filename)

def parquet_to_pyarrow(filename):
    return pq.read_table(filename)

def pyarrow_to_dict(input_pa):
    return convert_parrow_data(input_pa)
####### The functions interesting for the user

def dict_to_pandas(input_dict):
    if not isinstance(input_dict, list):
        input_dict = [input_dict]
    return pd.DataFrame(input_dict)
def dict_to_awkward(input_dict):
    return ak.from_arrow(dict_to_pyarrow(input_dict))

def dict_to_parquet(input_dict, filename):
    # we could also just go to pandas, and then to parquet:
    # dict_to_pandas(input_dict).to_parquet(filename)
    pyarrow_to_parquet(dict_to_pyarrow(input_dict), filename + '.parquet')
def dict_to_pickle(input_dict, filename):
    with open(filename + '.pkl', 'wb') as handle:
        pickle.dump(input_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

def dict_to_json(input_dict, filename):
    '''
    Function provided for convenience, but not of interest for the typical use case...
    '''
    dict_to_pandas(input_dict).to_json(filename + '.json')
def json_to_pandas(filename):
    '''
    Function provided for convenience, but not of interest for the typical use case...
    '''
    return pd.read_json(filename)
def pandas_to_dict(input_pandas, row_index=0):
    '''
    It converts the specified row of a pandas dataframe into a pyjapcscout-like dict.
    '''
    return input_pandas.iloc[row_index].to_dict()

def awkward_to_dict(input_awkward, row_index=0):
    '''
    It converts the specified row of an awkward array into a pyjapcscout-like dict.
    '''
    return convert_parrow_data(ak.to_arrow(input_awkward)[row_index])
def pickle_to_dict(filename):
    with open(filename, 'rb') as handle:
        load_dict = pickle.load(handle)
    return load_dict
def parquet_to_dict(filename):
    return pyarrow_to_dict(parquet_to_pyarrow(filename))
# between pandas and awkward

def pandas_to_awkward(input_pandas):
    print("TODO")
    # still to be implemented, e.g.:
    # input_pandas = input_pandas.copy()
    # I need to split its 2D arrays first...
    # return dict_to_awkward(pandas_to_dict(input_pandas))

def awkward_to_pandas(input_awkward):
    print("TODO")
# reading from parquet to pandas without type loss

def parquet_to_pandas(filename):
    '''
    It reads a **single** parquet file into a pandas dataframe with no data type loss
    '''
    return dict_to_pandas(parquet_to_dict(filename))

def parquet_to_awkward(filename):
    return ak.from_parquet(filename)
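Taken together, the user-facing functions also cover purely in-memory conversions. A minimal sketch of the dict/pandas/awkward round trips (both *_to_dict helpers return the row selected by row_index, default 0):

    import numpy as np
    import datascout as ds

    d = {'device1': {'value': np.array([1.0, 2.0, 3.0])}}

    df = ds.dict_to_pandas(d)     # one-row pandas DataFrame
    d2 = ds.pandas_to_dict(df)    # back to a dict (row 0)

    arr = ds.dict_to_awkward(d)   # awkward Array, via pyarrow
    d3 = ds.awkward_to_dict(arr)  # back to a dict (row 0)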