@@ -262,6 +262,8 @@ def parse_mrna(raw_mrna, biotypes_list):
262262def build_structures (structures_by_id ):
263263 biotypes_list = list (biotypes .keys ())
264264
265+ prev_gene = ''
266+
265267 structures = []
266268 for id in structures_by_id :
267269 structure_lists = structures_by_id [id ]
@@ -273,8 +275,19 @@ def build_structures(structures_by_id):
273275
274276 for structure_list in structure_lists [1 :]:
275277 subpart = parse_transcript_subpart (structure_list , mrna_start )
276- structure += [";" .join (subpart ) ]
277-
278+ structure += [";" .join (subpart )]
279+
280+ # Set transcript start coordinate relative to most-upstream transcript
281+ # This enables projecting genomic features (e.g. variants) onto
282+ # transcript coordinates. It also enables viewing multiple transcripts
283+ # in genomic coordinates, like typical genome browsers (Ensembl, IGV).
284+ gene_name = structure [1 ].split ('-' )[0 ]
285+ if gene_name != prev_gene :
286+ gene_start = int (mrna_start ) # Start of 1st transcript is gene start
287+ prev_gene = gene_name
288+ mrna_start_offset = str (int (mrna_start ) - gene_start )
289+
290+ structure .insert (2 , mrna_start_offset )
278291 structures .append (structure )
279292
280293 return structures
@@ -288,8 +301,8 @@ def parse_structures(canonical_ids, gff_path, gff_url):
288301
289302 Parts of a transcript that comprise "gene structure" here:
290303 * Exons: regions of gene not removed by RNA splicing
291- * 3 '-UTR: Three prime untranslated region; start region
292- * 5 '-UTR: Fix prime untranslated region; end region
304+ * 5'-UTR: Five prime untranslated region; start region (for +, end for -)
305+ * 3'-UTR: Three prime untranslated region; end region (for +, start for -)
293306
294307 (Introns are the regions between 3'- and 5'-UTRs that are not exons.
295308 These are implied in the structure, and not modeled explicitly.)
0 commit comments