diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6a92e7c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +target/ +.DS_Store +data/*.tsv diff --git a/Cargo.lock b/Cargo.lock index f508675..c22aa2a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -81,6 +81,7 @@ dependencies = [ "fuser", "libc", "log", + "serde_json", ] [[package]] @@ -127,6 +128,12 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + [[package]] name = "jiff" version = "0.2.24" @@ -253,6 +260,15 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -273,6 +289,19 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + [[package]] name = "smallvec" version = "1.15.1" @@ -359,3 +388,9 @@ dependencies = [ "quote", "syn", ] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml index 8971171..16dd66d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,3 +8,4 @@ fuser = "0.14" libc = "0.2" env_logger = "0.11" log = "0.4" +serde_json = "1" diff --git a/README.md b/README.md index adb27a3..519ceeb 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,83 @@ # CS-FileSystem -User Space File System to populate Cybershuttle Data Sources +This is a user-space filesystem for exposing Cybershuttle data sources. -sudo apt install cargo +This version loads ATLAS metadata from a TSV file and exposes each protein entry +as a directory containing a `metadata.json` file. + +```text +/tmp/atlas_mount/ + atlas/ + 1r6w_A/ + metadata.json + 2y44_A/ + metadata.json +``` + +## Requirements +### Linux + +```bash +sudo apt install cargo sudo apt install -y libfuse3-dev libfuse-dev pkg-config +``` + +### macOS + +```bash +brew install pkgconf +brew install --cask macfuse +``` + +macFUSE may require approval in `System Settings -> Privacy & Security`. + +## ATLAS TSV + +Place the ATLAS metadata TSV somewhere local. The examples below assume the TSV is at: + +```text +data/2024_11_18_ATLAS_info.tsv +``` + +The TSV is not committed to the repository. Create the `data` directory and copy +or download the file there: + +```bash +mkdir -p data +cp /path/to/2024_11_18_ATLAS_info.tsv data/ +``` + +## Run + +Build and mount the filesystem: + +```bash +mkdir -p /tmp/atlas_mount +cargo run --release -- data/2024_11_18_ATLAS_info.tsv /tmp/atlas_mount +``` + +Leave that command running while the filesystem is mounted. + +In another terminal: -cargo build +```bash +ls /tmp/atlas_mount +ls /tmp/atlas_mount/atlas | head +ls /tmp/atlas_mount/atlas/1r6w_A +cat /tmp/atlas_mount/atlas/1r6w_A/metadata.json +``` -mkdir /tmp/myfs -cargo run --release -- /tmp/myfs +## Unmount +Linux: -In a different terminal -ls /tmp/myfs +```bash +fusermount -u /tmp/atlas_mount +``` +macOS: -To unmount -fusermount -u /tmp/myfs +```bash +diskutil unmount /tmp/atlas_mount +``` diff --git a/src/atlas.rs b/src/atlas.rs new file mode 100644 index 0000000..6e685c9 --- /dev/null +++ b/src/atlas.rs @@ -0,0 +1,37 @@ +use std::fs; +use std::io::{BufRead, BufReader}; + +pub struct AtlasEntry { + pub id: String, + pub metadata_json: String, // pre-serialized JSON for this entry +} + +pub fn load_atlas(tsv_path: &str) -> Vec { + let file = fs::File::open(tsv_path).expect("Failed to open TSV"); + let reader = BufReader::new(file); + let mut lines = reader.lines(); + + let header_line = lines.next().unwrap().unwrap(); + let headers: Vec<&str> = header_line.split('\t').collect(); + let mut entries = Vec::new(); + + for line in lines { + let line = line.unwrap(); + let fields: Vec<&str> = line.split('\t').collect(); + + let mut map = serde_json::Map::new(); + + for (i, header) in headers.iter().enumerate() { + if let Some(value) = fields.get(i) { + map.insert( + header.to_string(), + serde_json::Value::String(value.to_string()), + ); + } + } + let id = fields.get(0).unwrap_or(&"unknown").to_string(); + let metadata_json = serde_json::to_string_pretty(&serde_json::Value::Object(map)).unwrap(); + entries.push(AtlasEntry { id, metadata_json }); + } + entries +} diff --git a/src/main.rs b/src/main.rs index 10ae177..b5bb0b6 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,6 +9,9 @@ // cat /tmp/myfs/hello.txt // fusermount -u /tmp/myfs (Ctrl-C also unmounts) +mod atlas; +use atlas::{load_atlas, AtlasEntry}; +use std::collections::HashMap; use std::ffi::OsStr; use std::time::{Duration, UNIX_EPOCH}; @@ -39,51 +42,97 @@ struct DataSource { files: Vec, } -fn lookup_inode_in_directory(dir: &CSDirectory, inode_no: u64) -> Option { +fn directory_attr(ino: u64) -> FileAttr { + FileAttr { + ino, + size: 0, + blocks: 0, + atime: UNIX_EPOCH, + mtime: UNIX_EPOCH, + ctime: UNIX_EPOCH, + crtime: UNIX_EPOCH, + kind: FileType::Directory, + perm: 0o755, + nlink: 2, + uid: unsafe { libc::getuid() }, + gid: unsafe { libc::getgid() }, + rdev: 0, + flags: 0, + blksize: 512, + } +} + +fn regular_file_attr(ino: u64, size: u64) -> FileAttr { + FileAttr { + ino, + size, + blocks: 1, + atime: UNIX_EPOCH, + mtime: UNIX_EPOCH, + ctime: UNIX_EPOCH, + crtime: UNIX_EPOCH, + kind: FileType::RegularFile, + perm: 0o644, + nlink: 1, + uid: unsafe { libc::getuid() }, + gid: unsafe { libc::getgid() }, + rdev: 0, + flags: 0, + blksize: 512, + } +} + +fn build_atlas_datasource( + entries: &[AtlasEntry], + inode_gen: &mut InodeGenerator, + file_contents: &mut HashMap, +) -> DataSource { + let mut directories = Vec::new(); + + for entry in entries { + let file_inode = inode_gen.next(); + + file_contents.insert(file_inode, entry.metadata_json.clone()); + + let metadata_file = CSFile { + inode_no: file_inode, + cs_data_id: format!("{}_metadata", entry.id), + name: "metadata.json".to_string(), + }; + + let dir = CSDirectory { + inode_no: inode_gen.next(), + name: entry.id.clone(), + cs_data_id: entry.id.clone(), + files: vec![metadata_file], + directories: vec![], + }; + directories.push(dir); + } + + DataSource::new(inode_gen.next(), "atlas".to_string(), directories, vec![]) +} + +fn lookup_inode_in_directory( + dir: &CSDirectory, + inode_no: u64, + file_contents: &HashMap, +) -> Option { if inode_no == dir.inode_no { - return Some(FileAttr { - ino: dir.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::Directory, - perm: 0o755, - nlink: 2, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - }); + return Some(directory_attr(dir.inode_no)); } for file in &dir.files { if inode_no == file.inode_no { - return Some(FileAttr { - ino: file.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::RegularFile, - perm: 0o644, - nlink: 1, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - }); + let size = file_contents + .get(&file.inode_no) + .map_or(0, |content| content.len() as u64); + return Some(regular_file_attr(file.inode_no, size)); } } for subdir in &dir.directories { - if let Some(attr) = lookup_inode_in_directory(subdir, inode_no) { + if let Some(attr) = lookup_inode_in_directory(subdir, inode_no, file_contents) { return Some(attr); } } @@ -91,163 +140,128 @@ fn lookup_inode_in_directory(dir: &CSDirectory, inode_no: u64) -> Option Option { - +fn lookup_attr_in_directory( + dir: &CSDirectory, + name: &OsStr, + parent_inode: u64, + file_contents: &HashMap, +) -> Option { for file in &dir.files { if parent_inode == dir.inode_no && name.to_str() == Some(file.name.as_str()) { - return Some(FileAttr { - ino: file.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::RegularFile, - perm: 0o644, - nlink: 1, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - }); + let size = file_contents + .get(&file.inode_no) + .map_or(0, |content| content.len() as u64); + return Some(regular_file_attr(file.inode_no, size)); } } for subdir in &dir.directories { if parent_inode == dir.inode_no && name.to_str() == Some(subdir.name.as_str()) { - return Some(FileAttr { - ino: subdir.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::Directory, - perm: 0o755, - nlink: 2, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - }); + return Some(directory_attr(subdir.inode_no)); } } for subdir in &dir.directories { - if let Some(attr) = lookup_attr_in_directory(subdir, name, parent_inode) { + if let Some(attr) = lookup_attr_in_directory(subdir, name, parent_inode, file_contents) { return Some(attr); } } - None } +fn find_directory_listing<'a>( + dir: &'a CSDirectory, + inode_no: u64, + parent_inode: u64, +) -> Option<(u64, &'a [CSFile], &'a [CSDirectory])> { + if inode_no == dir.inode_no { + return Some((parent_inode, &dir.files, &dir.directories)); + } + + for subdir in &dir.directories { + if let Some(listing) = find_directory_listing(subdir, inode_no, dir.inode_no) { + return Some(listing); + } + } -impl DataSource { + None +} +impl DataSource { fn new(inode_no: u64, name: String, directories: Vec, files: Vec) -> Self { - DataSource { inode_no, name, directories, files } + DataSource { + inode_no, + name, + directories, + files, + } } - fn get_attr(&self, ) -> FileAttr { - FileAttr { - ino: self.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::Directory, - perm: 0o755, - nlink: 2, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - } + fn get_attr(&self) -> FileAttr { + directory_attr(self.inode_no) } - fn lookup_by_inode(&self, inode_no: u64) -> Option { + fn lookup_by_inode( + &self, + inode_no: u64, + file_contents: &HashMap, + ) -> Option { if inode_no == self.inode_no { return Some(self.get_attr()); } for file in &self.files { if inode_no == file.inode_no { - return Some(FileAttr { - ino: file.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::RegularFile, - perm: 0o644, - nlink: 1, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - }); + let size = file_contents + .get(&file.inode_no) + .map_or(0, |content| content.len() as u64); + return Some(regular_file_attr(file.inode_no, size)); } } for dir in &self.directories { - if let Some(attr) = lookup_inode_in_directory(dir, inode_no) { + if let Some(attr) = lookup_inode_in_directory(dir, inode_no, file_contents) { return Some(attr); } } None } - - fn lookup_by_name(&self, parent_inode: u64, name: &OsStr) -> Option { + fn lookup_by_name( + &self, + parent_inode: u64, + name: &OsStr, + file_contents: &HashMap, + ) -> Option { for file in &self.files { if parent_inode == self.inode_no && name.to_str() == Some(file.name.as_str()) { - return Some(FileAttr { - ino: file.inode_no, - size: 0, - blocks: 0, - atime: UNIX_EPOCH, - mtime: UNIX_EPOCH, - ctime: UNIX_EPOCH, - crtime: UNIX_EPOCH, - kind: FileType::RegularFile, - perm: 0o644, - nlink: 1, - uid: unsafe { libc::getuid() }, - gid: unsafe { libc::getgid() }, - rdev: 0, - flags: 0, - blksize: 512, - }); + let size = file_contents + .get(&file.inode_no) + .map_or(0, |content| content.len() as u64); + return Some(regular_file_attr(file.inode_no, size)); + } + } + + for dir in &self.directories { + if parent_inode == self.inode_no && name.to_str() == Some(dir.name.as_str()) { + return Some(directory_attr(dir.inode_no)); } } for dir in &self.directories { - if let Some(attr) = lookup_attr_in_directory(dir, name, parent_inode) { + if let Some(attr) = lookup_attr_in_directory(dir, name, parent_inode, file_contents) { return Some(attr); } } None - } - + } } - // implement an incrementing inode number generator struct InodeGenerator { current: u64, -} +} impl InodeGenerator { fn new() -> Self { @@ -261,7 +275,6 @@ impl InodeGenerator { } } - const TTL: Duration = Duration::from_secs(1); const HELLO_CONTENT: &str = "Hello, FUSE from Rust!\n"; @@ -309,19 +322,24 @@ fn hello_attr() -> FileAttr { } } struct CybershuttleFS { - data_sources: Vec, + data_sources: Vec, + file_contents: HashMap, //inode -> content } impl Filesystem for CybershuttleFS { fn lookup(&mut self, _req: &Request<'_>, parent: u64, name: &OsStr, reply: ReplyEntry) { - for ds in &self.data_sources { if parent == ROOT_INO && name.to_str() == Some(ds.name.as_str()) { reply.entry(&TTL, &ds.get_attr(), 0); return; } - if let Some(attr) = ds.lookup_by_name(parent, name) { - reply.entry(&TTL, &attr, 0); + if let Some(attr) = ds.lookup_by_name(parent, name, &self.file_contents) { + if let Some(content) = self.file_contents.get(&attr.ino) { + let file_attr = regular_file_attr(attr.ino, content.len() as u64); + reply.entry(&TTL, &file_attr, 0); + } else { + reply.entry(&TTL, &attr, 0); + } return; } } @@ -334,12 +352,17 @@ impl Filesystem for CybershuttleFS { } fn getattr(&mut self, _req: &Request<'_>, ino: u64, reply: ReplyAttr) { + if let Some(content) = self.file_contents.get(&ino) { + reply.attr(&TTL, ®ular_file_attr(ino, content.len() as u64)); + return; + } + match ino { ROOT_INO => reply.attr(&TTL, &root_attr()), HELLO_INO => reply.attr(&TTL, &hello_attr()), _ => { for ds in &self.data_sources { - if let Some(attr) = ds.lookup_by_inode(ino) { + if let Some(attr) = ds.lookup_by_inode(ino, &self.file_contents) { reply.attr(&TTL, &attr); return; } @@ -360,14 +383,19 @@ impl Filesystem for CybershuttleFS { _lock_owner: Option, reply: ReplyData, ) { - if ino != HELLO_INO { + if let Some(content) = self.file_contents.get(&ino) { + let data = content.as_bytes(); + let start = (offset as usize).min(data.len()); + let end = (start + size as usize).min(data.len()); + reply.data(&data[start..end]); + } else if ino == HELLO_INO { + let data = HELLO_CONTENT.as_bytes(); + let start = (offset as usize).min(data.len()); + let end = (start + size as usize).min(data.len()); + reply.data(&data[start..end]); + } else { reply.error(ENOENT); - return; } - let data = HELLO_CONTENT.as_bytes(); - let start = (offset as usize).min(data.len()); - let end = (start + size as usize).min(data.len()); - reply.data(&data[start..end]); } fn readdir( @@ -378,14 +406,13 @@ impl Filesystem for CybershuttleFS { offset: i64, mut reply: ReplyDirectory, ) { - if ino == ROOT_INO { // Add the "hello.txt" entry to the root directory let mut entries = vec![ (ROOT_INO, FileType::Directory, "."), (ROOT_INO, FileType::Directory, ".."), ]; - + for ds in self.data_sources.iter() { entries.push((ds.inode_no, FileType::Directory, ds.name.as_str())); } @@ -399,32 +426,37 @@ impl Filesystem for CybershuttleFS { reply.ok(); return; } else { - for ds in &self.data_sources { - if let Some(attr) = ds.lookup_by_inode(ino) { - if attr.kind == FileType::Directory { - let mut entries = vec![ - (attr.ino, FileType::Directory, "."), - (ROOT_INO, FileType::Directory, ".."), - ]; - - for file in &ds.files { - entries.push((file.inode_no, FileType::RegularFile, file.name.as_str())); - } + let listing = if ino == ds.inode_no { + Some((ROOT_INO, ds.files.as_slice(), ds.directories.as_slice())) + } else { + ds.directories + .iter() + .find_map(|dir| find_directory_listing(dir, ino, ds.inode_no)) + }; + + if let Some((parent_inode, files, directories)) = listing { + let mut entries = vec![ + (ino, FileType::Directory, "."), + (parent_inode, FileType::Directory, ".."), + ]; + + for file in files { + entries.push((file.inode_no, FileType::RegularFile, file.name.as_str())); + } - for dir in &ds.directories { - entries.push((dir.inode_no, FileType::Directory, dir.name.as_str())); - } + for dir in directories { + entries.push((dir.inode_no, FileType::Directory, dir.name.as_str())); + } - for (i, entry) in entries.iter().enumerate().skip(offset as usize) { - // i + 1 is the next offset to resume from. - if reply.add(entry.0, (i + 1) as i64, entry.1, entry.2) { - break; - } + for (i, entry) in entries.iter().enumerate().skip(offset as usize) { + // i + 1 is the next offset to resume from. + if reply.add(entry.0, (i + 1) as i64, entry.1, entry.2) { + break; } - reply.ok(); - return; } + reply.ok(); + return; } } } @@ -435,68 +467,35 @@ impl Filesystem for CybershuttleFS { fn main() { env_logger::init(); - let mountpoint = std::env::args_os().nth(1).unwrap_or_else(|| { - eprintln!("Usage: cs-filesystem "); + let args: Vec = std::env::args().collect(); + if args.len() < 3 { + eprintln!("Usage: cs-filesystem "); std::process::exit(1); - }); + } - let options = vec![ - MountOption::RO, - MountOption::FSName("cybershuttlefs".to_string()), - MountOption::AutoUnmount, - MountOption::AllowOther, - ]; + let tsv_path = &args[1]; + let mountpoint = &args[2]; + let entries = load_atlas(tsv_path); + println!("Loaded {} ATLAS entries", entries.len()); let mut inode_gen = InodeGenerator::new(); + let mut file_contents = HashMap::new(); + let atlas_ds = build_atlas_datasource(&entries, &mut inode_gen, &mut file_contents); - let alp_dirs = vec![ - CSDirectory { - inode_no: inode_gen.next(), - name: "pdb".to_string(), - cs_data_id: "alphafold_pdb".to_string(), - files: vec![], - directories: vec![], - }, - CSDirectory { - inode_no: inode_gen.next(), - name: "fasta".to_string(), - cs_data_id: "alphafold_fasta".to_string(), - files: vec![], - directories: vec![], - }, - ]; - - - let alp_files = vec![ - CSFile { - inode_no: inode_gen.next(), - cs_data_id: "alphafold_summary".to_string(), - name: "summary.txt".to_string(), - }, - ]; - - - let protein_data_dirs = vec![ - CSDirectory { - inode_no: inode_gen.next(), - name: "uniprot".to_string(), - cs_data_id: "protein_data_uniprot".to_string(), - files: vec![], - directories: vec![], - }, - ]; + let data_sources = vec![atlas_ds]; + let fs = CybershuttleFS { + data_sources, + file_contents, + }; - - let data_sources = vec![ - DataSource::new(inode_gen.next(), "alphafold".to_string(), alp_dirs, alp_files), - DataSource::new(inode_gen.next(), "protein_data".to_string(), protein_data_dirs, vec![]), + let options = vec![ + MountOption::RO, + MountOption::FSName("cybershuttlefs".to_string()), + MountOption::AutoUnmount, ]; - - let fs = CybershuttleFS { data_sources }; - - if let Err(e) = fuser::mount2(fs, &mountpoint, &options) { + if let Err(e) = fuser::mount2(fs, mountpoint, &options) { eprintln!("Failed to mount filesystem: {e}"); std::process::exit(1); }