/**
* Parse BED-family files, which have 3-12 columns
* https://genome.ucsc.edu/FAQ/FAQformat.html#format1
*/
import {normalizeChr} from './utils';
/**
 * Translate the BED "no value" marker into `null`.
 *
 * @private
 * @param {*} value A raw column token (or an absent column)
 * @return {*} `null` when the value is `null`, `undefined`, or the single character `.`; otherwise the value unchanged
 */
function _bedMissing(value) {
    // `switch` compares with strict equality, so only these three exact values count as missing
    switch (value) {
    case null:
    case undefined:
    case '.': // BED files use . as the missing / null value character
        return null;
    default:
        return value;
    }
}
/**
 * Coerce an optional BED column to a number.
 *
 * @private
 * @param {*} value A raw column token (or an absent column)
 * @return {Number|null} The numeric value, or `null` if the column is absent, blank, or marked as missing (`.`)
 */
function _hasNum(value) {
    const present = _bedMissing(value);
    // Test for the explicit "no data" cases instead of relying on truthiness: a truthiness check would also
    //  discard a genuine numeric 0. Blank strings (empty columns) are still reported as missing.
    if (present === null || present === '') {
        return null;
    }
    return +present;
}
/**
 * Build a line parser for BED files that follow the widely used UCSC (quasi-)specification.
 *
 * NOTE: This was written with tabix region queries in mind, so every line handed to the parser is assumed to be a
 *  data row. Header rows and metadata fields are neither detected nor handled.
 *
 * @function
 * @alias module:ext/lz-parsers~makeBed12Parser
 * @param {object} options
 * @param {Boolean} options.normalize Whether to convert the output to the form LocusZoom expects (eg numeric type
 *  coercion, removing chr chromosome prefixes, and using 1-based and inclusive coordinates instead of 0-based
 *  disjoint intervals). Defaults to true.
 * @return {function} A configured parser function that runs on one line of text from an input file
 */
function makeBed12Parser({normalize = true} = {}) {
    // Convert a comma separated column (a single trailing comma is tolerated) into an array via `convert`.
    //  Missing (`.`), absent, and blank columns all become null.
    const parseList = (raw, convert) => {
        const text = _bedMissing(raw);
        if (!text) {
            return null;
        }
        return text.replace(/,$/, '').split(',').map((item) => convert(item));
    };

    /*
     * @param {String} line The line of text to be parsed
     */
    return (line) => {
        // BED defines 12 standard columns: 3 required, 9 optional. Extra tokens are ignored, and no attempt is made
        //  to interpret nonstandard files that reuse a column with a different meaning.
        //  https://en.wikipedia.org/wiki/BED_(file_format)
        const columns = line.trim().split('\t');
        const [rawChrom, rawStart, rawEnd] = columns;
        if (!rawChrom || !rawStart || !rawEnd) {
            throw new Error('Sample data must provide all required BED columns');
        }

        const [
            , , ,
            name,
            rawScore,
            rawStrand,
            rawThickStart,
            rawThickEnd,
            rawItemRgb,
            rawBlockCount,
            rawBlockSizes,
            rawBlockStarts,
        ] = columns;

        // Strand is the only optional field that is cleaned up regardless of the `normalize` setting
        const strand = _bedMissing(rawStrand);

        if (!normalize) {
            // Hand back the tokens as read: strings, or undefined for columns not present on the line
            return {
                chrom: rawChrom,
                chromStart: rawStart,
                chromEnd: rawEnd,
                name,
                score: rawScore,
                strand,
                thickStart: rawThickStart,
                thickEnd: rawThickEnd,
                itemRgb: rawItemRgb,
                blockCount: rawBlockCount,
                blockSizes: rawBlockSizes,
                blockStarts: rawBlockStarts,
            };
        }

        // Required fields
        const chrom = normalizeChr(rawChrom);
        const chromStart = +rawStart + 1; // BED starts are 0-based, but LZ plots start at 1
        const chromEnd = +rawEnd; // BED intervals exclude the end position, so the value is used unchanged

        // Optional fields: blanks and `.` become null
        const score = _hasNum(rawScore);
        const thickStart = _hasNum(rawThickStart);
        const thickEnd = _hasNum(rawThickEnd);
        const itemRgb = _bedMissing(rawItemRgb);

        // LocusZoom does not render the block fields; handling below is theoretical / best-effort.
        const blockCount = _hasNum(rawBlockCount);
        const blockSizes = parseList(rawBlockSizes, (item) => +item); // list of sizes -> array of numbers
        // NOTE(review): every start is shifted by +1, mirroring chromStart. Confirm this is intended if block
        //  starts are offsets relative to chromStart rather than absolute positions.
        const blockStarts = parseList(rawBlockStarts, (item) => +item + 1); // list of starts -> array of numbers

        // The consistency check only runs when all three block fields are present (and blockCount is non-zero)
        if (blockSizes && blockStarts && blockCount) {
            const countsAgree = blockSizes.length === blockCount && blockStarts.length === blockCount;
            if (!countsAgree) {
                throw new Error('Block size and start information should provide the same number of items as in blockCount');
            }
        }

        return {
            chrom,
            chromStart,
            chromEnd,
            name,
            score,
            strand,
            thickStart,
            thickEnd,
            itemRgb,
            blockCount,
            blockSizes,
            blockStarts,
        };
    };
}
// Only the parser factory is exported here; the underscore-prefixed helpers are not part of this export
export { makeBed12Parser };