diff --git a/database/network-database/constants.py b/database/network-database/constants.py index 8e6b0754a..3c3b025b1 100644 --- a/database/network-database/constants.py +++ b/database/network-database/constants.py @@ -13,15 +13,4 @@ class Constants: PROTEIN_DATA_FILEPATH = DATA_DIRECTORY + "/protein_data.tsv" GENE_REGULATORY_NETWORK_DATA_FILEPATH = DATA_DIRECTORY + "/gene_regulatory_network_data.tsv" PROTEIN_PROTEIN_INTERACTIONS_DATA_FILEPATH = DATA_DIRECTORY + "/protein_protein_interactions_data.tsv" - SOURCE_DATA_FILEPATH = DATA_DIRECTORY + "/source_data.tsv" - - # missing and update file paths - MISSING_DATA_DIRECTORY = DATA_DIRECTORY + "/missing_data" - UPDATE_DATA_DIRECTORY = DATA_DIRECTORY + "/update_data" - MISSING_GRN_GENE_DATA_FILEPATH = MISSING_DATA_DIRECTORY + "/missing_grn_gene_data.tsv" - UPDATE_GRN_GENE_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_grn_gene_data.tsv" - MISSING_PPI_GENE_DATA_FILEPATH = MISSING_DATA_DIRECTORY + "/missing_ppi_gene_data.tsv" - UPDATE_PPI_GENE_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_ppi_gene_data.tsv" - MISSING_PROTEIN_DATA_FILEPATH = MISSING_DATA_DIRECTORY + "/missing_protein_data.tsv" - UPDATE_PROTEIN_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_protein_data.tsv" - UPDATE_PROTEIN_NAME_DATA_FILEPATH = UPDATE_DATA_DIRECTORY + "/update_protein_name_data.tsv" \ No newline at end of file + SOURCE_DATA_FILEPATH = DATA_DIRECTORY + "/source_data.tsv" \ No newline at end of file diff --git a/database/network-database/database_services/README.md b/database/network-database/database_services/README.md index 6a8d91f4d..af24ec240 100644 --- a/database/network-database/database_services/README.md +++ b/database/network-database/database_services/README.md @@ -4,6 +4,4 @@ This folder contains all services for database operations. **Note:** This folder ## File Descriptions -- **`filter.py`** – Retrieves data from the database and identifies new or updated records. -- **`populator.py`** – Handles inserting new data into the database. -- **`updater.py`** – Defines the process for updating existing records when necessary. +- **`populator.py`** – Handles inserting new data into the database. diff --git a/database/network-database/database_services/filter.py b/database/network-database/database_services/filter.py deleted file mode 100644 index 426a4f9b8..000000000 --- a/database/network-database/database_services/filter.py +++ /dev/null @@ -1,156 +0,0 @@ -import psycopg2 -import csv -import pandas as pd -from constants import Constants - -class Filter: - def __init__(self, db_url, save_service): - self.db_url = db_url - self.save_service = save_service - - def get_all_db_data(self, database_namespace, table_name, columns): - """ - Fetch all data from the specified table and return it as a list of dictionaries. - """ - conn = psycopg2.connect(self.db_url) - cursor = conn.cursor() - - query = f"SELECT {', '.join(columns)} FROM {database_namespace}.{table_name};" - cursor.execute(query) - - rows = cursor.fetchall() - column_names = [desc[0] for desc in cursor.description] - - result = [dict(zip(column_names, row)) for row in rows] - - cursor.close() - conn.close() - - return result - - def filter_data(self, data_filepath, db_data, key_columns, update_columns): - """ - Filter the data to return: - - Records that need to be inserted. - - Records that need to be updated. - """ - with open(data_filepath, 'r') as f: - reader = csv.DictReader(f, delimiter='\t') - data = list(reader) - - db_keys = {tuple(row[col] for col in key_columns): row for row in db_data} - - insert_data = [] - update_data = [] - update_data_names = [] - - for row in data: - key_tuple = tuple(row[col] for col in key_columns) - if key_tuple in db_keys: - db_record = db_keys[key_tuple] - changes_needed = False - - for col in update_columns: - if str(row[col]).lower() != str(db_record[col]).lower(): - # Special case for protein daat that ned to check if standard name is changed - if col == "standard_name" and data_filepath == Constants.PROTEIN_DATA_FILEPATH: - update_data_names.append({ - "old_standard_name": db_record[col], - "new_standard_name": row[col], - }) - - if col == "length" or col == "molecular_weight" or col == "pi": - if float(row[col]) == float(db_record[col]): - continue - - changes_needed = True - break - - if changes_needed: - update_data.append({ - **{col: row[col] for col in key_columns + update_columns}, - }) - else: - insert_data.append(row) - - insert_data_df = pd.DataFrame(insert_data) - update_data_df = pd.DataFrame(update_data) - - self.save_service.save(insert_data_df, Constants.MISSING_DATA_DIRECTORY, self.missing_filepath) - self.save_service.save(update_data_df, Constants.UPDATE_DATA_DIRECTORY, self.update_filepath) - - if data_filepath == Constants.PROTEIN_DATA_FILEPATH: - update_data_names_df = pd.DataFrame(update_data_names) - self.save_service.save(update_data_names_df, Constants.UPDATE_DATA_DIRECTORY, Constants.UPDATE_PROTEIN_NAME_DATA_FILEPATH) - - -class ProteinFilter(Filter): - def __init__(self, db_url, save_service): - super().__init__(db_url, save_service) - self.missing_filepath = Constants.MISSING_PROTEIN_DATA_FILEPATH - self.update_filepath = Constants.UPDATE_PROTEIN_DATA_FILEPATH - - def get_all_db_data(self): - """ - Fetch all protein data from the database. - """ - columns = ["standard_name", "gene_systematic_name", "length", "molecular_weight", "pi"] - return super().get_all_db_data(Constants.PPI_DATABASE_NAMESPACE, "protein", columns) - - def filter_data(self): - """ - Filter protein data that is missing or needs to be updated in the database. - """ - db_data = self.get_all_db_data() - - key_columns = ["gene_systematic_name"] - update_columns = ["standard_name", "length", "molecular_weight", "pi"] - - return super().filter_data(Constants.PROTEIN_DATA_FILEPATH, db_data, key_columns, update_columns) - -class GeneFilter(Filter): - def __init__(self, db_url, save_service, network_mode): - super().__init__(db_url, save_service) - self.network_mode = network_mode - if network_mode == Constants.GRN_NETWORK_MODE: - self.missing_filepath = Constants.MISSING_GRN_GENE_DATA_FILEPATH - self.update_filepath = Constants.UPDATE_GRN_GENE_DATA_FILEPATH - self.database_namespace = Constants.GRN_DATABASE_NAMESPACE - elif network_mode == Constants.PPI_NETWORK_MODE: - self.missing_filepath = Constants.MISSING_PPI_GENE_DATA_FILEPATH - self.update_filepath = Constants.UPDATE_PPI_GENE_DATA_FILEPATH - self.database_namespace = Constants.PPI_DATABASE_NAMESPACE - else: - raise ValueError("Unknown network type specified.") - - def get_all_db_data(self): - """ - Fetch all gene data from the database. - """ - if self.network_mode == Constants.GRN_NETWORK_MODE: - columns = ["gene_id", "display_gene_id", "regulator"] - elif self.network_mode == Constants.PPI_NETWORK_MODE: - - columns = ["gene_id", "display_gene_id"] - else: - raise ValueError("Unknown network type specified.") - - return super().get_all_db_data(self.database_namespace, "gene", columns) - - def filter_data(self): - """ - Filter gene data that is missing or needs to be updated in the database. - """ - - if self.network_mode == Constants.GRN_NETWORK_MODE: - update_columns = ["display_gene_id", "regulator"] - elif self.network_mode == Constants.PPI_NETWORK_MODE: - update_columns = ["display_gene_id"] - else: - raise ValueError("Unknown network type specified.") - - key_columns = ["gene_id"] - - db_data = self.get_all_db_data() - - return super().filter_data(Constants.GENE_DATA_FILEPATH, db_data, key_columns, update_columns) diff --git a/database/network-database/database_services/populator.py b/database/network-database/database_services/populator.py index 259f9a31e..5aa5ce779 100644 --- a/database/network-database/database_services/populator.py +++ b/database/network-database/database_services/populator.py @@ -32,7 +32,7 @@ def process_file(self, conn, cursor, data_filepath, copy_statement): """ # Determine if we need to drop the last column (PPI network type) - if self.network_mode == Constants.PPI_NETWORK_MODE and data_filepath == Constants.MISSING_PPI_GENE_DATA_FILEPATH: + if self.network_mode == Constants.PPI_NETWORK_MODE and data_filepath == Constants.GENE_DATA_FILEPATH: print("Dropping the regulator column from the input data...") processed_rows = [] @@ -72,12 +72,11 @@ class GeneDataPopulator(DataPopulator): def __init__(self, db_url, network_mode): super().__init__(db_url) self.network_mode = network_mode + self.filepath = Constants.GENE_DATA_FILEPATH if network_mode == Constants.GRN_NETWORK_MODE: self.database_namespace = Constants.GRN_DATABASE_NAMESPACE - self.filepath = Constants.MISSING_GRN_GENE_DATA_FILEPATH elif network_mode == Constants.PPI_NETWORK_MODE: self.database_namespace = Constants.PPI_DATABASE_NAMESPACE - self.filepath = Constants.MISSING_PPI_GENE_DATA_FILEPATH else: raise ValueError(f"Unknown network type: {network_mode}") @@ -92,7 +91,7 @@ def get_copy_statement(self): class ProteinDataPopulator(DataPopulator): def __init__(self, db_url): super().__init__(db_url) - self.filepath = Constants.MISSING_PROTEIN_DATA_FILEPATH + self.filepath = Constants.PROTEIN_DATA_FILEPATH def get_copy_statement(self): return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.protein (standard_name, gene_systematic_name, length, molecular_weight, PI, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;" diff --git a/database/network-database/database_services/updater.py b/database/network-database/database_services/updater.py deleted file mode 100644 index c4333aea8..000000000 --- a/database/network-database/database_services/updater.py +++ /dev/null @@ -1,157 +0,0 @@ -import psycopg2 -from abc import ABC, abstractmethod -import csv -from constants import Constants - -class Updater(ABC): - def __init__(self, db_url): - self.db_url = db_url - self.filepath = None - - @abstractmethod - def process_each_row(self, row): - """ - Process each row of data from the file. - """ - pass - - def update_data(self): - print(f"Updating data from {self.filepath}...") - conn, cursor = self._connect_to_db() - - rows = self._process_file() - - # SQL Update query for protein data - for row in rows: - update_query, params = self.process_each_row(row) - - self._execute_update(cursor, update_query, params) - - self._commit_and_close(conn, cursor) - - print("Data update complete!") - print("====================================================================") - - def _process_file(self): - """ - Helper function to process the file, which will be used by subclasses to process data rows. - """ - with open(self.filepath, 'r') as file: - reader = csv.reader(file, delimiter='\t') - next(reader) - return list(reader) - - def _execute_update(self, cursor, update_query, params): - """ - Executes the update query with provided parameters. - """ - try: - cursor.execute(update_query, params) - except Exception as e: - print(f"Error executing query: {e}") - cursor.connection.rollback() - else: - print(f"Update successful!") - - def _connect_to_db(self): - """ - Establish connection to the database and return cursor. - """ - try: - conn = psycopg2.connect(self.db_url) - cursor = conn.cursor() - return conn, cursor - except Exception as e: - print(f"Error connecting to the database: {e}") - raise - - def _commit_and_close(self, conn, cursor): - """ - Commit the transaction and close the database connection. - """ - conn.commit() - cursor.close() - conn.close() - -class GeneUpdater(Updater): - def __init__(self, db_url, network_mode): - super().__init__(db_url) - self.network_mode = network_mode - self.filepath = Constants.UPDATE_PPI_GENE_DATA_FILEPATH if network_mode == Constants.PPI_NETWORK_MODE else Constants.UPDATE_GRN_GENE_DATA_FILEPATH - - def process_each_row(self, row): - gene_id = row[0] - display_gene_id = row[1] - - # Construct query based on network type (GRN vs PPI) - if self.network_mode == Constants.GRN_NETWORK_MODE: - regulator = row[2] - update_query = """ - UPDATE "{}".gene - SET display_gene_id = %s, regulator = %s - WHERE gene_id = %s; - """.format(Constants.GRN_DATABASE_NAMESPACE) # Directly format the schema name here - params = (display_gene_id, regulator, gene_id) - elif self.network_mode == Constants.PPI_NETWORK_MODE: - update_query = """ - UPDATE "{}".gene - SET display_gene_id = %s - WHERE gene_id = %s; - """.format(Constants.PPI_DATABASE_NAMESPACE) - params = (display_gene_id, gene_id) - else: - raise ValueError(f"Unknown network type '{self.network_mode}' specified. Expected 'grn' or 'ppi'.") - - return update_query, params - - -class ProteinUpdater(Updater): - def __init__(self, db_url): - super().__init__(db_url) - self.filepath = Constants.UPDATE_PROTEIN_DATA_FILEPATH - - def process_each_row(self, row): - gene_systematic_name = row[0] - standard_name = row[1] - length = row[2] if row[2] != "None" else 0 - molecular_weight = row[3] - pi = row[4] if row[4] != "None" else 0 - - update_query = """ - UPDATE {}.protein - SET standard_name = %s, length = %s, molecular_weight = %s, pi = %s - WHERE gene_systematic_name = %s; - """.format(Constants.PPI_DATABASE_NAMESPACE) - params = (standard_name, length, molecular_weight, pi, gene_systematic_name) - - return update_query, params - -class ProteinProteinInteractionsUpdater(Updater): - def __init__(self, db_url): - super().__init__(db_url) - self.filepath = Constants.UPDATE_PROTEIN_NAME_DATA_FILEPATH - - def process_each_row(self, row): - old_standard_name = row[0] - new_standard_name = row[1] - - # Use SQL CASE statement to update either protein1 or protein2 - update_query = """ - UPDATE {}.physical_interactions - SET - protein1 = CASE - WHEN protein1 = %s THEN %s - ELSE protein1 - END, - protein2 = CASE - WHEN protein2 = %s THEN %s - ELSE protein2 - END - WHERE protein1 = %s OR protein2 = %s; - """.format(Constants.PPI_DATABASE_NAMESPACE) - - # Parameters for the query - params = (old_standard_name, new_standard_name, old_standard_name, new_standard_name, old_standard_name, old_standard_name) - - return update_query, params - diff --git a/database/network-database/main.py b/database/network-database/main.py index d89d00c84..a644fba79 100644 --- a/database/network-database/main.py +++ b/database/network-database/main.py @@ -2,7 +2,6 @@ from data_services.data_generator import * from data_services.save_service import * from database_services.filter import * -from database_services.updater import * from database_services.populator import * import argparse from datetime import datetime, timezone, timedelta @@ -27,22 +26,12 @@ def load_data(network_option): SourceDataGenerator(SourceProcessor(formatted_time_stamp), save_service) -def filter_data(network_option, db_url): - print("Filtering data.................................................") - if network_option in ['all', Constants.GRN_NETWORK_MODE]: - GeneFilter(db_url, save_service, network_mode="grn").filter_data() - - if network_option in ['all', Constants.PPI_NETWORK_MODE]: - GeneFilter(db_url, save_service, network_mode="ppi").filter_data() - ProteinFilter(db_url, save_service).filter_data() - def adding_data_to_databse(network_option, db_url): print("Adding data to database.................................................") if network_option in ['all', Constants.GRN_NETWORK_MODE]: network_mode = Constants.GRN_NETWORK_MODE SourceDataPopulator(db_url, network_mode).populate_data() GeneDataPopulator(db_url, network_mode).populate_data() - GeneUpdater(db_url, network_mode).update_data() GeneRegulatoryNetworkDataPopulator(db_url).populate_data() if network_option in ['all', Constants.PPI_NETWORK_MODE]: @@ -50,17 +39,13 @@ def adding_data_to_databse(network_option, db_url): SourceDataPopulator(db_url, network_mode).populate_data() GeneDataPopulator(db_url, network_mode).populate_data() - GeneUpdater(db_url, network_mode).update_data() ProteinDataPopulator(db_url).populate_data() - ProteinProteinInteractionsUpdater(db_url).update_data() - ProteinUpdater(db_url).update_data() ProteinProteinInteractionsDataPopulator(db_url).populate_data() def main(network_option, db_url): load_data(network_option) - filter_data(network_option, db_url) adding_data_to_databse(network_option, db_url) if __name__ == "__main__": diff --git a/database/schema/gene_regulatory_network_schema.sql b/database/schema/gene_regulatory_network_schema.sql index 5a97ffb27..e2eb66c4a 100644 --- a/database/schema/gene_regulatory_network_schema.sql +++ b/database/schema/gene_regulatory_network_schema.sql @@ -15,7 +15,7 @@ CREATE TABLE gene_regulatory_network_with_timestamp.gene ( regulator BOOLEAN, time_stamp TIMESTAMP WITH TIME ZONE, source VARCHAR, - PRIMARY KEY(gene_id, taxon_id), + PRIMARY KEY(gene_id, taxon_id, time_stamp, source), FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network_with_timestamp.source(time_stamp, source) ); @@ -26,8 +26,7 @@ CREATE TABLE gene_regulatory_network_with_timestamp.network ( annotation_type VARCHAR, time_stamp TIMESTAMP WITH TIME ZONE, source VARCHAR, - FOREIGN KEY (regulator_gene_id, taxon_id) REFERENCES gene_regulatory_network_with_timestamp.gene(gene_id, taxon_id), - FOREIGN KEY (target_gene_id, taxon_id) REFERENCES gene_regulatory_network_with_timestamp.gene(gene_id, taxon_id), - FOREIGN KEY (time_stamp, source) REFERENCES gene_regulatory_network_with_timestamp.source(time_stamp, source), + FOREIGN KEY (regulator_gene_id, taxon_id, time_stamp, source) REFERENCES gene_regulatory_network_with_timestamp.gene(gene_id, taxon_id, time_stamp, source), + FOREIGN KEY (target_gene_id, taxon_id, time_stamp, source) REFERENCES gene_regulatory_network_with_timestamp.gene(gene_id, taxon_id, time_stamp, source), CONSTRAINT unique_network UNIQUE (regulator_gene_id, target_gene_id, taxon_id, time_stamp, source, annotation_type) ); \ No newline at end of file diff --git a/database/schema/protein_protein_interactions_schema.sql b/database/schema/protein_protein_interactions_schema.sql index f77b8cbc7..11f262d73 100644 --- a/database/schema/protein_protein_interactions_schema.sql +++ b/database/schema/protein_protein_interactions_schema.sql @@ -14,12 +14,12 @@ CREATE TABLE protein_protein_interactions_with_timestamp.gene ( taxon_id VARCHAR, time_stamp TIMESTAMP WITH TIME ZONE, source VARCHAR, - PRIMARY KEY(gene_id, taxon_id), + PRIMARY KEY(gene_id, taxon_id, time_stamp, source), FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions_with_timestamp.source(time_stamp, source) ); CREATE TABLE protein_protein_interactions_with_timestamp.protein ( - standard_name VARCHAR PRIMARY KEY, + standard_name VARCHAR, gene_systematic_name VARCHAR, length FLOAT, molecular_weight FLOAT, @@ -27,22 +27,19 @@ CREATE TABLE protein_protein_interactions_with_timestamp.protein ( taxon_id VARCHAR, time_stamp TIMESTAMP WITH TIME ZONE, source VARCHAR, - FOREIGN KEY (gene_systematic_name, taxon_id) REFERENCES protein_protein_interactions_with_timestamp.gene(gene_id, taxon_id), - FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions_with_timestamp.source(time_stamp, source) + PRIMARY KEY(standard_name, time_stamp, source), + FOREIGN KEY (gene_systematic_name, taxon_id, time_stamp, source) REFERENCES protein_protein_interactions_with_timestamp.gene(gene_id, taxon_id, time_stamp, source) ); CREATE TABLE protein_protein_interactions_with_timestamp.physical_interactions ( protein1 VARCHAR, protein2 VARCHAR, - gene_systematic_name1 VARCHAR, - gene_systematic_name2 VARCHAR, interaction_detection_methods_identifier VARCHAR, annotation_type VARCHAR, experiment_name VARCHAR, time_stamp TIMESTAMP WITH TIME ZONE, source VARCHAR, - FOREIGN KEY (protein1) REFERENCES protein_protein_interactions_with_timestamp.protein(standard_name), - FOREIGN KEY (protein2) REFERENCES protein_protein_interactions_with_timestamp.protein(standard_name), - FOREIGN KEY (time_stamp, source) REFERENCES protein_protein_interactions_with_timestamp.source(time_stamp, source), + FOREIGN KEY (protein1, time_stamp, source) REFERENCES protein_protein_interactions_with_timestamp.protein(standard_name, time_stamp, source), + FOREIGN KEY (protein2, time_stamp, source) REFERENCES protein_protein_interactions_with_timestamp.protein(standard_name, time_stamp, source), CONSTRAINT unique_physical_interaction UNIQUE (protein1, protein2, interaction_detection_methods_identifier, annotation_type, experiment_name, time_stamp, source) );