Skip to content

Commit

Permalink
BROKEN: Add initial first-class-files attempt
Browse files Browse the repository at this point in the history
# Conflicts:
#	db/project.xml
  • Loading branch information
illusional committed Jun 26, 2023
1 parent 3c902cd commit 86fc7ad
Show file tree
Hide file tree
Showing 4 changed files with 131 additions and 1 deletion.
39 changes: 38 additions & 1 deletion db/project.xml
Original file line number Diff line number Diff line change
Expand Up @@ -695,5 +695,42 @@
INSERT INTO analysis_type (id, name) VALUES ('analysis-runner', 'analysis-runner');
</sql>
</changeSet>

<changeSet author="michael.franklin" id="2022-10-18_first-class-files">
    <sql>SET @@system_versioning_alter_history = 1;</sql>

    <!-- First-class "file" records (files and directories, keyed by path). -->
    <createTable tableName="file">
        <!-- Maybe should consider an "in-progress" flag to show that it's expecting an output to exist -->
        <column name="id" type="INT" autoIncrement="true">
            <constraints primaryKey="true" nullable="false" />
        </column>
        <column name="type" type="ENUM('file', 'directory')" />
        <!-- NOTE(review): a UNIQUE constraint over a TEXT column needs an
             index prefix length in MariaDB/MySQL - confirm the generated DDL
             is accepted before shipping. -->
        <column name="path" type="TEXT">
            <constraints unique="true" nullable="false" />
        </column>
        <!-- size in bytes; nullable (e.g. unknown, or a directory) -->
        <column name="size" type="INT" />
        <column name="checksum" type="TEXT" />
    </createTable>

    <!-- attach files to analysis -->
    <createTable tableName="analysis_files">
        <column name="analysis_id" type="INT">
            <constraints nullable="false" foreignKeyName="fk_analysis_files_analysis" references="analysis(id)" />
        </column>
        <column name="file_id" type="INT">
            <constraints nullable="false" foreignKeyName="fk_analysis_files_file" references="file(id)" />
        </column>
        <column name="pattern" type="TEXT"/>
    </createTable>

    <!-- attach files to sequences -->
    <createTable tableName="sequence_files">
        <column name="sequence_id" type="INT">
            <!-- fixed: FK names must be unique per schema; these previously
                 duplicated the fk_analysis_files_* names above, which would
                 make the migration fail. -->
            <constraints nullable="false" foreignKeyName="fk_sequence_files_sequence" references="sample_sequencing(id)" />
        </column>
        <column name="file_id" type="INT">
            <constraints nullable="false" foreignKeyName="fk_sequence_files_file" references="file(id)" />
        </column>
        <column name="pattern" type="TEXT"/>
    </createTable>
</changeSet>
</databaseChangeLog>
38 changes: 38 additions & 0 deletions db/python/tables/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from db.python.connect import DbBase
from models.models.file import File


class FileTable(DbBase):
    """
    Capture File table operations and queries
    """

    table_name = 'file'

    # region READ

    async def get_file_by_id(self, file_id: int) -> File:
        """Fetch a single file row by its primary key."""
        # Not implemented yet; raising beats the original `pass`, which
        # silently returned None despite the `-> File` annotation.
        raise NotImplementedError

    async def get_files_by_ids(self, file_ids: list[int]) -> list[File]:
        """Fetch multiple file rows by primary key."""
        raise NotImplementedError

    async def get_file_by_path(self, path: str) -> File:
        """Fetch a single file row by its (unique) path."""
        raise NotImplementedError

    async def get_files_by_paths(self, paths: list[str]) -> list[File]:
        """Fetch multiple file rows by their (unique) paths."""
        raise NotImplementedError

    # endregion READ

    # region CREATE

    async def create_files(self, files: list[File]):
        """
        Insert file rows.

        Can insert and get by paths, will need to consider what
        should happen if files exist, eg should it:
        - Archive all old analysis / sequences at this location
          + notify user of this
        - Fail, and make this a precondition
        """
        raise NotImplementedError

    # endregion CREATE
19 changes: 19 additions & 0 deletions models/models/file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from enum import Enum
from typing import Optional

from pydantic import BaseModel

class FileType(Enum):
    """Kind of filesystem entry; values match the DB ENUM('file', 'directory')."""

    FILE = 'file'
    DIRECTORY = 'directory'


class File(BaseModel):
    """Model to represent File"""

    # primary key of the `file` table row
    id: int
    # 'file' or 'directory'
    type: FileType
    # path to the object (unique in the DB); presumably a gs:// URL — confirm
    path: str
    # in bytes
    size: int
    # content checksum; Optional (the DB column is nullable) with a None
    # default so rows without a checksum can be constructed. Also uses the
    # `Optional` import that was previously unused in favour of `str | None`.
    checksum: Optional[str] = None
36 changes: 36 additions & 0 deletions scripts/migrate_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""
We need to apply this migration in three steps:
- Add schema for files
- migrate all the data to new schema
- Remove old columns
+ MariaDB will probably complain about deleting columns in archived data
This is also the time we should consider archiving analyses with duplicate
output paths. (TODO: this note was left unfinished — complete it.)
"""
from collections import defaultdict

from models.models.file import File

connection = None


def main():
    """Entry point for the migration; not yet implemented."""
    pass


async def migrate_analysis():
    """
    Move gs:// `analysis.output` values into first-class `file` rows.

    Groups analysis ids by their output path, inserts one file row per
    distinct path, then (TODO) links each analysis back to its file row.
    """
    # collect all analysis-entries that are gs:// files
    get_query = 'SELECT * FROM analysis WHERE output LIKE "gs://%"'
    rows = await connection.fetch_all(get_query)

    # group analysis ids by output path so each path is inserted only once
    # (fixed: the original line began with a stray backslash, a syntax error)
    mapped_files = defaultdict(list)
    for r in rows:
        mapped_files[r['output']].append(r['id'])

    # NOTE(review): insert_files is not defined in this file — confirm where
    # it is meant to come from.
    inserted_files = insert_files(list(mapped_files.keys()))

    for f in inserted_files:
        # TODO: the original code here was incomplete (`File(path=)`);
        # presumably each inserted path becomes a File model and is linked
        # back to its analysis ids — confirm intent before running.
        raise NotImplementedError(
            'linking inserted files to analyses is not implemented'
        )


0 comments on commit 86fc7ad

Please sign in to comment.