Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 123 additions & 6 deletions packages/package-crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,27 @@
const crypto = require('crypto');
const fs = require('fs');
const path = require('path');
const dns = require('dns');
const http = require('http');
const https = require('https');
const ipaddr = require('ipaddr.js');
const {debugLog} = require("../tx/operation-context");

/**
 * Decide whether an IP literal must be treated as off-limits for outbound requests.
 *
 * Anything that ipaddr.js does not classify as plain 'unicast' counts as non-public:
 * loopback, private, link-local (incl. 169.254.169.254 cloud metadata), unique-local,
 * CGNAT, multicast, reserved, etc. An IPv4-mapped IPv6 address is judged by the IPv4
 * address it wraps.
 *
 * @param {string} ip - textual IP address
 * @returns {boolean} true when the address is non-public or cannot be evaluated
 */
function isNonPublicAddress(ip) {
  try {
    const parsed = ipaddr.parse(ip);
    const isMapped = parsed.kind() === 'ipv6' && parsed.isIPv4MappedAddress();
    const effective = isMapped ? parsed.toIPv4Address() : parsed;
    if (effective.range() === 'unicast') {
      return false;
    }
    return true;
  } catch (parseError) {
    // Fail closed: an address we cannot parse or classify is treated as unsafe.
    return true;
  }
}

class PackageCrawler {
log;
packages = new Set();
Expand Down Expand Up @@ -139,18 +158,106 @@
return url.replace(/^http:/, 'https:');
}

// Roots under which local-file feed reads are permitted. Reading local files supports
// locally-configured feeds (e.g. for testing); allowed roots come from
// config.localFeedDirs, plus the directory of a local master feed url. Anything else
// is rejected so a third-party feed can't point a read at an arbitrary server file.
allowedLocalRoots() {
const roots = [];
const cfg = this.config && this.config.localFeedDirs;
if (Array.isArray(cfg)) {
roots.push(...cfg);
} else if (typeof cfg === 'string' && cfg.length > 0) {
roots.push(cfg);
}
if (this.config && typeof this.config.masterUrl === 'string' && this.config.masterUrl.startsWith('/')) {
roots.push(path.dirname(this.config.masterUrl));
}
return roots.map((r) => path.resolve(r));
}

// A DNS lookup wrapper that rejects any host resolving to a non-public address.
// Enforced at connection time (so it also covers redirect targets and defeats
// DNS-rebinding), this is the SSRF guard for all outbound http(s) fetches.
// Set config.allowPrivateAddresses = true to disable (e.g. for local test registries).
ssrfLookup() {
const allowPrivate = !!(this.config && this.config.allowPrivateAddresses);
return (hostname, options, callback) => {
if (typeof options === 'function') {
callback = options;
options = {};
}
const wantAll = !!(options && options.all);
dns.lookup(hostname, Object.assign({}, options, { all: true }), (err, addresses) => {
if (err) {
callback(err);
return;
}
if (!allowPrivate) {
for (const a of addresses) {
if (isNonPublicAddress(a.address)) {
callback(new Error('Blocked request to non-public address ' + a.address + ' (host ' + hostname + ')'));
return;
}
}
}
if (wantAll) {
callback(null, addresses);
} else {
callback(null, addresses[0].address, addresses[0].family);
}
});
};
}

// http/https agents that route every connection through the SSRF lookup. Cached so
// connections can be pooled across requests.
guardedAgents() {
if (!this._guardedAgents) {
const lookup = this.ssrfLookup();
class GuardedHttpAgent extends http.Agent {
createConnection(options, cb) {
return super.createConnection(Object.assign({}, options, { lookup }), cb);
}
}
class GuardedHttpsAgent extends https.Agent {
createConnection(options, cb) {
return super.createConnection(Object.assign({}, options, { lookup }), cb);
}
}
this._guardedAgents = {
httpAgent: new GuardedHttpAgent({ keepAlive: true }),
httpsAgent: new GuardedHttpsAgent({ keepAlive: true })
};
}
return this._guardedAgents;
}

// Resolve a local feed path and confine it to an allowed root (path-injection guard).
resolveLocalReadPath(url) {
const resolved = path.resolve(url);
const roots = this.allowedLocalRoots();
const allowed = roots.some((root) => resolved === root || resolved.startsWith(root + path.sep));
if (!allowed) {
throw new Error('Refusing to read local file outside the allowed feed directories: ' + url);
}
return resolved;
}

async fetchJson(url) {
try {
if (url.startsWith("/")) {
const content = await fs.promises.readFile(url, "utf8");
const content = await fs.promises.readFile(this.resolveLocalReadPath(url), "utf8");
return JSON.parse(content);
} else {
const response = await axios.get(url, {
timeout: 30000,
signal: this.abortController?.signal,
headers: {
'User-Agent': 'FHIR Package Crawler/1.0'
}
},
httpAgent: this.guardedAgents().httpAgent,
httpsAgent: this.guardedAgents().httpsAgent
});
return response.data;
}
Expand All @@ -166,7 +273,7 @@
async fetchXml(url) {
try {
if (url.startsWith("/")) {
const content = await fs.promises.readFile(url, 'utf8');
const content = await fs.promises.readFile(this.resolveLocalReadPath(url), 'utf8');
const parser = new XMLParser({
ignoreAttributes: false,
attributeNamePrefix: '@_',
Expand All @@ -180,7 +287,9 @@
signal: this.abortController?.signal,
headers: {
'User-Agent': 'FHIR Package Crawler/1.0'
}
},
httpAgent: this.guardedAgents().httpAgent,
httpsAgent: this.guardedAgents().httpsAgent
});

const parser = new XMLParser({
Expand All @@ -203,18 +312,20 @@
async fetchUrl(url) {
try {
if (url.startsWith("/")) {
const buffer = await fs.promises.readFile(url);
const buffer = await fs.promises.readFile(this.resolveLocalReadPath(url));

Check failure

Code scanning / CodeQL

Uncontrolled data used in path expression High

This path depends on a
user-provided value
.
This path depends on a
user-provided value
.
This path depends on a
user-provided value
.
This path depends on a
user-provided value
.
Comment thread
grahamegrieve marked this conversation as resolved.
Dismissed
this.totalBytes += buffer.byteLength;
return buffer;
} else {
const response = await axios.get(url, {
timeout: 60000,
responseType: 'arraybuffer',
signal: this.abortController?.signal,
headers: {
'User-Agent': 'FHIR Package Crawler/1.0'
}
},
httpAgent: this.guardedAgents().httpAgent,
httpsAgent: this.guardedAgents().httpsAgent
});

Check failure

Code scanning / CodeQL

Server-side request forgery Critical

The
URL
of this request depends on a
user-provided value
.
The
URL
of this request depends on a
user-provided value
.
The
URL
of this request depends on a
user-provided value
.
The
URL
of this request depends on a
user-provided value
.

this.totalBytes += response.data.byteLength;
return Buffer.from(response.data);
Expand Down Expand Up @@ -558,6 +669,12 @@
if (npmPackage.hasJavaScript && !isTemplate && id !== 'hl7.fhir.pubpack') {
throw new Error(`Package ${idver} rejected: contains JavaScript files but is not a template package`);
}
// The feed gate (item.notForPublication) only sees the RSS entry. A package whose
// feed entry is clean can still carry notForPublication inside the tarball - that is
// a draft build that must never enter the registry. Reject it here too.
if (npmPackage.notForPublication) {
throw new Error(`Package ${idver} rejected: tarball is flagged notForPublication (draft build, not suitable for publication)`);
}

// Extract URLs from package
const urls = this.processPackageUrls(npmPackage);
Expand Down
104 changes: 104 additions & 0 deletions packages/packages.js
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,64 @@ class PackagesModule {
}
}

// Re-fetch a single package tarball and replace whatever is stored for it, bypassing
// the crawler's GUID dedup and notForPublication feed gate. Used to push out a
// corrected package that was already (mis)published.
async forceUpdatePackage(link) {
// Only allow fetching over http(s). This endpoint must never be usable to read
// local server files (path injection) - tarballs are always published web URLs.
if (!link || !/^https?:\/\//i.test(link)) {
throw new Error('Invalid package link (must be an http(s) URL): ' + link);
}
if (!this.crawler) {
this.crawler = new PackageCrawler(this.config, this.db, this.stats);
}

// The feed uses the versioned package.tgz url as the (permalink) GUID, so reusing
// the link as the GUID keeps a later crawl from inserting a duplicate.
const guid = link;

const buffer = await this.crawler.fetchUrl(link);
const npm = await this.crawler.extractNpmPackage(buffer, link);

// Refuse to re-store a still-broken package - that would just re-publish the bug.
if (npm.notForPublication) {
throw new Error('Refusing to store ' + npm.id + '#' + npm.version + ': fetched tarball is still flagged notForPublication');
}

const idver = npm.id + '#' + npm.version;
const replaced = await this.deleteVersionsByGuid(guid);

const itemLog = { status: '??' };
await this.crawler.store(link, link, guid, new Date(), buffer, idver, itemLog);

pckLog.info('Force-updated ' + idver + ' from ' + link + ' (replaced ' + replaced + ' existing row(s))');
return { status: 'updated', id: npm.id, version: npm.version, replaced };
}

// Delete a stored package version and all of its child rows, by GUID.
deleteVersionsByGuid(guid) {
return new Promise((resolve, reject) => {
this.db.all('SELECT PackageVersionKey FROM PackageVersions WHERE GUID = ?', [guid], (err, rows) => {
if (err) return reject(err);
const keys = (rows || []).map(r => r.PackageVersionKey);
if (keys.length === 0) return resolve(0);
const ph = keys.map(() => '?').join(',');
const stmts = [
'DELETE FROM PackageFHIRVersions WHERE PackageVersionKey IN (' + ph + ')',
'DELETE FROM PackageDependencies WHERE PackageVersionKey IN (' + ph + ')',
'DELETE FROM PackageURLs WHERE PackageVersionKey IN (' + ph + ')',
'DELETE FROM PackageVersions WHERE PackageVersionKey IN (' + ph + ')'
];
const runNext = (i) => {
if (i >= stmts.length) return resolve(keys.length);
this.db.run(stmts[i], keys, (e) => e ? reject(e) : runNext(i + 1));
};
runNext(0);
});
});
}

async initializeDatabase() {
return new Promise((resolve, reject) => {
// Use config path if absolute, otherwise resolve relative to data dir
Expand Down Expand Up @@ -1223,6 +1281,52 @@ class PackagesModule {
}
});

// Force-refresh specific packages, bypassing the feed. The crawler only fetches a
// package once (it dedupes on the feed GUID) and skips anything flagged
// notForPublication, so there is normally no way to make it re-pick-up a package
// that was published incorrectly and later corrected. This endpoint re-fetches the
// tarball(s) directly and replaces whatever is stored.
//
// POST /update-package { "links": ["http://hl7.org/fhir/uv/ips/2.0.1/package.tgz", ...] }
//
// The body may also use "packages" (array), or a single "url" / "link" string.
// The link is the versioned package.tgz url, which is exactly the GUID the feed uses,
// so the replacement keeps the same GUID and a later crawl won't create a duplicate.
//
// If config.updateToken is set, the request must carry it in the x-update-token header.
// The token is compared in constant time (over SHA-256 digests, so lengths always
// match) to avoid leaking it through response timing.
// NOTE(review): when config.updateToken is not set the endpoint is unauthenticated.
//
// Responses: 403 bad/missing token, 400 no links supplied, 500 when every link failed,
// otherwise 200 with a per-link result list.
this.router.post('/update-package', async (req, res) => {
  const start = Date.now();
  try {
    const expectedToken = this.config.updateToken;
    if (expectedToken) {
      const { createHash, timingSafeEqual } = require('crypto');
      const digest = (value) => createHash('sha256').update(value).digest();
      const suppliedToken = req.headers['x-update-token'];
      const tokenOk = typeof suppliedToken === 'string' &&
        typeof expectedToken === 'string' &&
        timingSafeEqual(digest(suppliedToken), digest(expectedToken));
      if (!tokenOk) {
        res.status(403).json({ error: 'forbidden: missing or invalid x-update-token' });
        return;
      }
    }

    let links = req.body && (req.body.links || req.body.packages || (req.body.url ? [req.body.url] : (req.body.link ? [req.body.link] : null)));
    if (typeof links === 'string') links = [links];
    if (!Array.isArray(links) || links.length === 0) {
      res.status(400).json({ error: 'Provide a JSON body like {"links": ["<package.tgz url>", ...]}' });
      return;
    }

    // Links are processed one at a time; a failure is recorded and does not stop the rest.
    const results = [];
    for (const link of links) {
      try {
        results.push(Object.assign({ link }, await this.forceUpdatePackage(link)));
      } catch (e) {
        pckLog.error('Force update failed for ' + link + ': ' + e.message);
        results.push({ link, status: 'error', error: e.message });
      }
    }

    const failed = results.filter((r) => r.status === 'error').length;
    res.status(failed === results.length ? 500 : 200).json({
      message: 'Processed ' + results.length + ' package(s), ' + failed + ' failed',
      results
    });
  } catch (error) {
    pckLog.error('update-package endpoint failed:', error);
    res.status(500).json({ error: 'update-package failed', message: error.message });
  } finally {
    this.stats.countRequest('update-package', Date.now() - start);
  }
});

// Crawler statistics endpoint (existing)
this.router.get('/stats', async (req, res) => {
const start = Date.now();
Expand Down
Loading
Loading