Skip to content

Commit

Permalink
BROKEN: Add initial first-class-files attempt
Browse files Browse the repository at this point in the history
# Conflicts:
#	db/project.xml
  • Loading branch information
illusional committed Jun 26, 2023
1 parent 3c902cd commit 86fc7ad
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 1 deletion.
39 changes: 38 additions & 1 deletion db/project.xml
Original file line number Diff line number Diff line change
Expand Up @@ -695,5 +695,42 @@
INSERT INTO analysis_type (id, name) VALUES ('analysis-runner', 'analysis-runner');
</sql>
</changeSet>

<changeSet author="michael.franklin" id="2022-10-18_first-class-files">
    <sql>SET @@system_versioning_alter_history = 1;</sql>

    <!-- First-class "file" records (files and directories, keyed by path). -->
    <createTable tableName="file">
        <!-- Maybe should consider an "in-progress" flag to show that it's expecting an output to exist -->
        <column name="id" type="INT" autoIncrement="true">
            <constraints primaryKey="true" nullable="false" />
        </column>
        <column name="type" type="ENUM('file', 'directory')" />
        <!-- NOTE(review): a UNIQUE constraint over a TEXT column needs an
             index prefix length in MariaDB/MySQL - confirm the generated DDL
             is accepted before shipping. -->
        <column name="path" type="TEXT">
            <constraints unique="true" nullable="false" />
        </column>
        <!-- size in bytes; nullable (e.g. unknown, or a directory) -->
        <column name="size" type="INT" />
        <column name="checksum" type="TEXT" />
    </createTable>

    <!-- attach files to analysis -->
    <createTable tableName="analysis_files">
        <column name="analysis_id" type="INT">
            <constraints nullable="false" foreignKeyName="fk_analysis_files_analysis" references="analysis(id)" />
        </column>
        <column name="file_id" type="INT">
            <constraints nullable="false" foreignKeyName="fk_analysis_files_file" references="file(id)" />
        </column>
        <column name="pattern" type="TEXT"/>
    </createTable>

    <!-- attach files to sequences -->
    <createTable tableName="sequence_files">
        <column name="sequence_id" type="INT">
            <!-- fixed: FK names must be unique per schema; these previously
                 duplicated the fk_analysis_files_* names above, which would
                 make the migration fail. -->
            <constraints nullable="false" foreignKeyName="fk_sequence_files_sequence" references="sample_sequencing(id)" />
        </column>
        <column name="file_id" type="INT">
            <constraints nullable="false" foreignKeyName="fk_sequence_files_file" references="file(id)" />
        </column>
        <column name="pattern" type="TEXT"/>
    </createTable>
</changeSet>
</databaseChangeLog>
38 changes: 38 additions & 0 deletions db/python/tables/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from db.python.connect import DbBase
from models.models.file import File


class FileTable(DbBase):
    """
    Capture File table operations and queries
    """

    table_name = 'file'

    # region READ

    async def get_file_by_id(self, file_id: int) -> File:
        """Fetch a single file row by its primary key."""
        # Not implemented yet; raising beats the original `pass`, which
        # silently returned None despite the `-> File` annotation.
        raise NotImplementedError

    async def get_files_by_ids(self, file_ids: list[int]) -> list[File]:
        """Fetch multiple file rows by primary key."""
        raise NotImplementedError

    async def get_file_by_path(self, path: str) -> File:
        """Fetch a single file row by its (unique) path."""
        raise NotImplementedError

    async def get_files_by_paths(self, paths: list[str]) -> list[File]:
        """Fetch multiple file rows by their (unique) paths."""
        raise NotImplementedError

    # endregion READ

    # region CREATE

    async def create_files(self, files: list[File]):
        """
        Insert file rows.

        Can insert and get by paths, will need to consider what
        should happen if files exist, eg should it:
        - Archive all old analysis / sequences at this location
          + notify user of this
        - Fail, and make this a precondition
        """
        raise NotImplementedError

    # endregion CREATE
19 changes: 19 additions & 0 deletions models/models/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from enum import Enum
from typing import Optional

from pydantic import BaseModel

class FileType(Enum):
    """Kind of filesystem entry; values match the DB ENUM('file', 'directory')."""

    FILE = 'file'
    DIRECTORY = 'directory'


class File(BaseModel):
    """Model to represent File"""

    # primary key of the `file` table row
    id: int
    # 'file' or 'directory'
    type: FileType
    # path to the object (unique in the DB); presumably a gs:// URL — confirm
    path: str
    # in bytes
    size: int
    # content checksum; Optional (the DB column is nullable) with a None
    # default so rows without a checksum can be constructed. Also uses the
    # `Optional` import that was previously unused in favour of `str | None`.
    checksum: Optional[str] = None
36 changes: 36 additions & 0 deletions scripts/migrate_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
We need to apply this migration in three steps:
- Add schema for files
- migrate all the data to new schema
- Remove old columns
+ MariaDB will probably complain about deleting columns in archived data
This is also the time we should consider archiving analyses with duplicate
output paths. (TODO: this note was left unfinished — complete it.)
"""
from collections import defaultdict

from models.models.file import File

connection = None


def main():
    """Entry point for the migration; not yet implemented."""
    pass


async def migrate_analysis():
    """
    Move gs:// `analysis.output` values into first-class `file` rows.

    Groups analysis ids by their output path, inserts one file row per
    distinct path, then (TODO) links each analysis back to its file row.
    """
    # collect all analysis-entries that are gs:// files
    get_query = 'SELECT * FROM analysis WHERE output LIKE "gs://%"'
    rows = await connection.fetch_all(get_query)

    # group analysis ids by output path so each path is inserted only once
    # (fixed: the original line began with a stray backslash, a syntax error)
    mapped_files = defaultdict(list)
    for r in rows:
        mapped_files[r['output']].append(r['id'])

    # NOTE(review): insert_files is not defined in this file — confirm where
    # it is meant to come from.
    inserted_files = insert_files(list(mapped_files.keys()))

    for f in inserted_files:
        # TODO: the original code here was incomplete (`File(path=)`);
        # presumably each inserted path becomes a File model and is linked
        # back to its analysis ids — confirm intent before running.
        raise NotImplementedError(
            'linking inserted files to analyses is not implemented'
        )


0 comments on commit 86fc7ad

Please sign in to comment.