diff --git a/gemanalysis/cli.py b/gemanalysis/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..09ee53fbc5cc3822cccc45a2601276597c8d48ba
--- /dev/null
+++ b/gemanalysis/cli.py
@@ -0,0 +1,78 @@
+import argparse
+from gemanalysis.data_importer.data_importer import DataImporter
+import os
+from os.path import dirname
+
+
+def main():
+
+    importer = DataImporter()
+
+    # Top level parser
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers(required=True)
+
+    # parser for the import command
+    import_parser = subparsers.add_parser(
+        "import", help="import files into the analysis suite database"
+    )
+    import_parser.add_argument(
+        "-r", "--run", type=int, required=True, help="provide run number of the input file"
+    )
+    import_parser.add_argument(
+        "-p", "--path", type=str, nargs="+", required=True, help="provide path(s) to the raw file(s)"
+    )
+    import_parser.add_argument(
+        "-d",
+        "--description",
+        required=False,
+        default="",
+        help="provide description of the input file",
+    )
+    import_parser.set_defaults(
+        func=lambda args: importer.import_data(args.run, args.path, args.description)
+    )
+
+    # parser for getting files (get command)
+    get_file_parser = subparsers.add_parser(
+        "get", help="get files already existing in the analysis suite"
+    )
+    get_file_parser.add_argument(
+        "-f",
+        "--file",
+        type=str,
+        required=False,
+        help="name of the file you want to retrieve from the analysis suite database",
+    )
+    get_file_parser.add_argument(
+        "--run",
+        type=int,
+        required=False,
+        help="run number for which file names need to be retrieved",
+    )
+    get_file_parser.add_argument(
+        "--revision",
+        type=int,
+        required=False,
+        help="revision number for which file names need to be retrieved",
+    )
+    get_file_parser.set_defaults(
+        func=lambda args: importer.get_file(args.file, args.revision, args.run)
+    )
+
+    # Read arguments from the command line
+    args = parser.parse_args()
+
+    args.func(args)
+
+    # if not args.run:
+    #     raise ValueError("run number not provided. Please provide a run number")
+    # if not args.path:
+    #     raise ValueError("path to raw file not provided. Please provide a path.")
+
+    # load = DataImporter()
+    # load.loader(args.run, args.path, args.description)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/gemanalysis/config.yaml b/gemanalysis/config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..58b79450f4caff1756382b7e0f0f59147fdd604e
--- /dev/null
+++ b/gemanalysis/config.yaml
@@ -0,0 +1,8 @@
+data_path: !ENV "${TMPDIR}/gemdata/data"
+
+database:
+    host: localhost
+    port: 5432
+    password: !ENV ${USER}
+    user: !ENV ${USER}
+    name: !ENV ${USER}
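+
+# Note: values tagged with !ENV above are resolved from environment variables at load
+# time by pyaml_env.parse_config (used in data_importer.py and psycopg2_manager.py),
+# so data_path, for example, expands to "<value of $TMPDIR>/gemdata/data" at runtime.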
diff --git a/gemanalysis/data_importer/__init__.py b/gemanalysis/data_importer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/gemanalysis/data_importer/csv_importer.py b/gemanalysis/data_importer/csv_importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5183c93d1e9bbfead4569ab30a640b479044eb29
--- /dev/null
+++ b/gemanalysis/data_importer/csv_importer.py
@@ -0,0 +1,16 @@
+import os
+import errno
+import pandas as pd
+
+from gemanalysis.data_importer.importer_abstract import DataImporterAbstract
+
+
+class CsvLoader(DataImporterAbstract):
+    @staticmethod
+    def read_file(file_path):
+
+        if os.path.isfile(file_path):
+            return pd.read_csv(file_path)
+
+        else:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
diff --git a/gemanalysis/data_importer/data_importer.py b/gemanalysis/data_importer/data_importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..808069811c75a3e84cd201f9a3905926eace6a5e
--- /dev/null
+++ b/gemanalysis/data_importer/data_importer.py
@@ -0,0 +1,175 @@
+# import sys
+# paths = [ 'D:\\analysisSuite\\gem-online-analysis-v2\\cmsgemos-analysis\\gemanalysis\\data_importer']
+# sys.path += paths
+
+import argparse
+import datetime
+from gemanalysis.postgres.analysis_metadata import AnalysisMetadata, create_table_and_get_session
+from gemanalysis.data_importer.text_importer import TextImporter
+from gemanalysis.data_importer.image_importer import ImageImporter
+from gemanalysis.data_importer.json_importer import JsonLoader
+from gemanalysis.data_importer.csv_importer import CsvLoader
+import ntpath
+import os
+import uuid
+from os.path import dirname, isdir, isfile, splitext
+from pathlib import Path
+import logging
+
+
+from pyaml_env import parse_config
+
+
+class DataImporter:
+    def __init__(self):
+
+        self.logger = logging.getLogger("gemanalysis.data_importer.data_importer")
+        logging.basicConfig(level=logging.INFO)
+
+        # This is done so that the correct config file path is inferred no matter where the program is run from
+        config_file_path = os.path.abspath(
+            os.path.join(os.path.realpath(__file__), os.pardir, os.pardir, "config.yaml")
+        )
+        self.config = parse_config(config_file_path)
+
+        self.session = create_table_and_get_session()
+
+    ## temporary fix for storing file paths ##
+    ## TODO: check how to remove escaping '\' added by os.path.join ##
+    def replace_backslash_with_forward_slash(self, path):
+
+        temp = str.maketrans("\\", "/")
+        path = path.translate(temp)
+        return path
+
+    def import_data(self, run_number, raw_data_path, description=""):
+
+        raw_files = []
+        files_to_be_inserted_in_db = []
+        raw_files = self.get_list_of_raw_files(raw_data_path)
+
+        with self.session.begin():
+            for raw_file in raw_files:
+
+                self.logger.info("importing file: %s for run number: %d", raw_file, run_number)
+
+                destination_path_in_db = os.path.join(
+                    self.config["data_path"], ntpath.basename(raw_file)
+                )
+                destination_path_in_db = self.replace_backslash_with_forward_slash(
+                    destination_path_in_db
+                )
+                file_handler = None
+
+                with open(raw_file, "rb") as file_handler:
+
+                    # commenting out the portion below, which required a specialized importer class per file type; reading data as binary for now
+                    # if raw_file.endswith('.txt'):
+                    #     file_handler = TextImporter.read_file(raw_file)
+
+                    # elif raw_file.endswith('.csv'):
+                    #     file_handler = CsvLoader.read_file(raw_file)
+
+                    # elif raw_file.endswith('.json'):
+                    #     file_handler = JsonLoader.read_file(raw_file)
+
+                    # elif raw_file.endswith('.jpg'):
+                    #     file_handler = ImageImporter.read_file(raw_file)
+                    #     # f = ImageImporter.add_watermark(f, "sample watermark")
+
+                    if file_handler is not None:
+
+                        revision = 0
+
+                        ## check if file already exists
+                        existing_files = (
+                            self.session.query(AnalysisMetadata)
+                            .filter_by(file_path=destination_path_in_db, run_number=run_number)
+                            .order_by(AnalysisMetadata.revision.desc())
+                            .all()
+                        )
+                        if len(existing_files) > 0:
+                            self.logger.info(
+                                "File already exists in database with most recent revision: %s",
+                                existing_files[0].revision,
+                            )
+                            revision = existing_files[0].revision + 1
+
+                        # generate a unique uuid and append it to the file name for storage
+                        unique_id = uuid.uuid1()
+                        file_name, extension = os.path.splitext(destination_path_in_db)
+                        file_name_in_destination = file_name + "_" + str(unique_id) + extension
+
+                        new_file = AnalysisMetadata(
+                            revision=revision,
+                            created_on=datetime.datetime.utcnow(),
+                            parameters=None,
+                            file_path=destination_path_in_db,
+                            description=description,
+                            uuid=str(unique_id),
+                            run_number=run_number,
+                        )
+                        files_to_be_inserted_in_db.append(new_file)
+
+                        # create destination folder if it does not exist
+                        Path(self.config["data_path"]).mkdir(parents=True, exist_ok=True)
+
+                        with open(file_name_in_destination, "wb") as output_handler:
+                            output_handler.write(file_handler.read())
+                            self.logger.info(
+                                "File saved to destination path: %s", file_name_in_destination
+                            )
+
+            self.session.add_all(files_to_be_inserted_in_db)
+            self.session.commit()
+
+    def get_list_of_raw_files(self, raw_data_paths):
+
+        raw_data_files = []
+
+        for file_path in raw_data_paths:
+
+            if not os.path.exists(file_path):
+                raise FileNotFoundError(
+                    "File Path: "
+                    + file_path
+                    + " does not exist. Please check the file path and try again."
+                )
+
+            if os.path.isdir(file_path):
+                for root, directory, files in os.walk(file_path):
+                    for f in files:
+                        # print(root, directory, os.path.abspath(f))
+                        raw_data_files.append(os.path.join(root, f))
+
+            elif os.path.isfile(file_path):
+                raw_data_files.append(file_path)
+
+        return raw_data_files
+
+    def get_file(self, file_name=None, revision=None, run_number=None):
+
+        query = self.session.query(AnalysisMetadata)
+
+        if file_name is not None:
+            file_path = os.path.join(self.config["data_path"], file_name)
+            query = query.filter_by(file_path=file_path)
+
+        if revision is not None:
+            query = query.filter_by(revision=revision)
+
+        if run_number is not None:
+            query = query.filter_by(run_number=run_number)
+
+        files_retrieved = query.all()
+        retrieved_file_data = []
+
+        for file in files_retrieved:
+            file_name, extension = os.path.splitext(file.file_path)
+            absolute_file_path = file_name + "_" + file.uuid + extension
+            self.logger.info(absolute_file_path)
+
+            with open(absolute_file_path, "rb") as f:
+                retrieved_file_data.append(f.read())
+
+        return retrieved_file_data
diff --git a/gemanalysis/data_importer/image_importer.py b/gemanalysis/data_importer/image_importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e04b5d314a82489645cb8fd8659b314933e5943
--- /dev/null
+++ b/gemanalysis/data_importer/image_importer.py
@@ -0,0 +1,31 @@
+import os
+from PIL import Image, ImageDraw, ImageFont
+import errno
+
+from gemanalysis.data_importer.importer_abstract import DataImporterAbstract
+
+
+class ImageImporter(DataImporterAbstract):
+    @staticmethod
+    def read_file(file_path):
+
+        if os.path.isfile(file_path):
+            return Image.open(file_path)
+
+        else:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
+
+    @staticmethod
+    def add_watermark(img, text):
+        draw = ImageDraw.Draw(img)
+        width, height = img.size
+
+        font = ImageFont.load_default()
+        textwidth, textheight = draw.textsize(text, font)
+
+        margin = 10
+        x = width - textwidth - margin
+        y = height - textheight - margin
+
+        draw.text((x, y), text, font=font)
+
+        return img
diff --git a/gemanalysis/data_importer/importer_abstract.py b/gemanalysis/data_importer/importer_abstract.py
new file mode 100644
index 0000000000000000000000000000000000000000..3891a6f6e71328fd3e8d0b557c0c83f61884b34d
--- /dev/null
+++ b/gemanalysis/data_importer/importer_abstract.py
@@ -0,0 +1,7 @@
+from abc import ABC, abstractmethod
+
+
+class DataImporterAbstract(ABC):
+    @abstractmethod
+    def read_file(self, f):
+        pass
diff --git a/gemanalysis/data_importer/json_importer.py b/gemanalysis/data_importer/json_importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f0e6108939a70bcfe0797f6d9f9c0ccf32fd669
--- /dev/null
+++ b/gemanalysis/data_importer/json_importer.py
@@ -0,0 +1,17 @@
+import os
+import json
+import errno
+
+from gemanalysis.data_importer.importer_abstract import DataImporterAbstract
+
+
+class JsonLoader(DataImporterAbstract):
+    @staticmethod
+    def read_file(file_path):
+
+        if os.path.isfile(file_path):
+            with open(file_path, "r") as f:
+                return json.load(f)
+
+        else:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
diff --git a/gemanalysis/data_importer/text_importer.py b/gemanalysis/data_importer/text_importer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a596426d87b00d8e5a6f10286eab57f8a3f74a7a
--- /dev/null
+++ b/gemanalysis/data_importer/text_importer.py
@@ -0,0 +1,16 @@
+import os
+import errno
+
+from gemanalysis.data_importer.importer_abstract import DataImporterAbstract
+
+
+class TextImporter(DataImporterAbstract):
+    @staticmethod
+    def read_file(file_path):
+
+        if os.path.isfile(file_path):
+            with open(file_path, "r") as f:
+                return f.read()
+
+        else:
+            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), file_path)
diff --git a/gemanalysis/postgres/__init__.py b/gemanalysis/postgres/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/gemanalysis/postgres/analysis_metadata.py b/gemanalysis/postgres/analysis_metadata.py
new file mode 100644
index 0000000000000000000000000000000000000000..993214b5d47379bf0c86504cea55338366085fb8
--- /dev/null
+++ b/gemanalysis/postgres/analysis_metadata.py
@@ -0,0 +1,31 @@
+from gemanalysis.postgres.psycopg2_manager import Psycopg2Manager
+from sqlalchemy import Column, String, Integer, DateTime, TEXT, create_engine
+from sqlalchemy.ext.declarative import declarative_base
+from sqlalchemy.orm import sessionmaker
+
+connection_string = Psycopg2Manager.get_connection_string()
+db = create_engine(connection_string)
+base = declarative_base()
+
+
+class AnalysisMetadata(base):
+
+    __tablename__ = "analysis_metadata"
+
+    id = Column(Integer, autoincrement=True, primary_key=True)
+    file_path = Column(String, primary_key=True)
+    revision = Column(Integer, primary_key=True)
+    run_number = Column(Integer)
+    uuid = Column(String)
+    created_on = Column(DateTime)
+    parameters = Column(TEXT)
+    description = Column(String)
+
+
+def create_table_and_get_session():
+    Session = sessionmaker(db)
+    session = Session()
+
+    base.metadata.create_all(db)
+
+    return session
diff --git a/gemanalysis/postgres/psycopg2_manager.py b/gemanalysis/postgres/psycopg2_manager.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b185c98cccd41dca48bf6da3fe68cd1d639dff1
--- /dev/null
+++ b/gemanalysis/postgres/psycopg2_manager.py
@@ -0,0 +1,36 @@
+import psycopg2
+import os
+from pyaml_env import parse_config
+
+
+class Psycopg2Manager:
+    # This is done so that the correct config file path is inferred no matter where the program is run from
+    config_file_path = os.path.abspath(
+        os.path.join(os.path.realpath(__file__), os.pardir, os.pardir, "config.yaml")
+    )
+    config = parse_config(config_file_path)
+
+    @staticmethod
+    def get_connection_string():
+        db_config = Psycopg2Manager.config["database"]
+
+        return "postgresql+psycopg2://{}:{}@{}:{}/{}".format(
+            db_config["user"],
+            db_config["password"],
+            db_config["host"],
+            db_config["port"],
+            db_config["name"],
+        )
+
+    @staticmethod
+    def get_psycopg2_manager():
+        db_config = Psycopg2Manager.config["database"]
+        connection = psycopg2.connect(
+            database=db_config["name"],
+            user=db_config["user"],
+            password=db_config["password"],
+            host=db_config["host"],
+            port=db_config["port"],
+        )
+        cursor = connection.cursor()
+        return cursor
diff --git a/gemanalysis/readme.md b/gemanalysis/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..29962c4ffc3397d9578dfaa379125180cb2dc755
--- /dev/null
+++ b/gemanalysis/readme.md
@@ -0,0 +1,43 @@
+# Analysis Suite | GEM Online Analysis Setup
+Have you followed the instructions in DEVELOPERS.md in `cmsgemos-analysis`?
+If yes, that's all the setup you need ;)
+
+# Usage
+The Analysis Suite currently provides a command line interface with two commands:
+
+ 1. import
+    Params:
+    run (-r) - the run number of the file(s) you are importing
+    path (-p) - path to the file or directory you want to import. If a directory path is provided, everything in that directory is imported. Multiple paths can be given at once, separated by spaces on the command line.
+    description (-d) - optional, description of the files, if any
+
+ 2. get
+    Params:
+    file (-f) - optional, name of the file you want to retrieve
+    run (--run) - optional, run number of the file(s) you want to retrieve
+    revision (--revision) - optional, revision of the file(s) you want to retrieve
+
+    Note:
+    If no params are provided, everything is returned.
+    If a file name is provided, everything matching that file name is returned; the same applies to run and revision.
+    An illustrative example invocation is given at the end of this readme.
+
+### DB details
+The GEM database currently consists of just one table, `analysis_metadata`.
+It stores the metadata of the files imported into the analysis suite.
+
+The table has the following attributes:
+
+| Column      | Type                     | Nullable | Description                                                                                                |
+|-------------|--------------------------|----------|------------------------------------------------------------------------------------------------------------|
+| id          | integer                  | not null | row id                                                                                                     |
+| revision    | integer                  | not null | revision number; incremented when a file with the same path is re-imported for the same run               |
+| run_number  | integer                  | not null | GEM run which produced this file                                                                           |
+| file_path   | character varying(1024)  | not null | path where the file is stored                                                                              |
+| uuid        | character varying(512)   | not null | unique id appended to the file name when storing it in the file system                                    |
+| created_on  | timestamp with time zone | not null | creation timestamp                                                                                         |
+| parameters  | text                     |          | parameters used (if any); not used at the moment but can be helpful when importing files at a later stage |
+| description | character varying(128)   |          | description of the file (if any) provided by the user                                                     |
+
+Indexes:
+    "analysis_metadata_pkey" PRIMARY KEY, btree (file_path, revision)
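+
+### Example usage
+A minimal sketch of a session, assuming the `gemanalysis` package is importable (e.g. it is on your `PYTHONPATH`). The run number `123`, the path and the description below are placeholders, not values from this repository:
+
+    # import everything under a (placeholder) run directory, then look it up by run number
+    python -m gemanalysis.cli import -r 123 -p /path/to/run123/ -d "example scan"
+    python -m gemanalysis.cli get --run 123
+
+If the project installs a dedicated console script its name may differ; the `-m` form simply runs `gemanalysis/cli.py` directly.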