#!/usr/bin/awk -f
# Normalise the attributes column (field 9) of a tab-separated GTF stream so
# that every feature record carries gene_name, transcript_id and
# transcript_name:
#   gene_name       defaults to gene_id
#   transcript_id   defaults to gene_id
#   transcript_name defaults to transcript_id
# Existing attributes keep their original order; filled-in attributes are
# appended.  Attributes are written back as "key value" joined with "; "
# (the trailing `;' of the input is not re-added).
# Limitation: a key repeated within one record keeps only its last value.
BEGIN {
  FS = OFS = "\t"
}

# Store `value' under `key' in `attr', recording first-seen order in `order'
# so the field can be rebuilt deterministically.
function set_attr(key, value) {
  if (!(key in attr))
    order[++n_attr] = key
  attr[key] = value
}

# Give `dst' the value of `src' when `dst' is missing and `src' is present.
# The explicit `in' test avoids creating an empty `src' entry as a side
# effect of merely referencing it.
function fill_attr(dst, src) {
  if (!(dst in attr) && (src in attr))
    set_attr(dst, attr[src])
}

# Comment/header lines, blank lines and short records have no attributes
# column; pass them through unchanged instead of padding them to 9 fields.
/^#/ || NF < 9 {
  print
  next
}

{
  # Reset per-record state; without this, attributes of earlier records
  # would leak into later ones.  split("", a) is the portable way to
  # empty an array.
  split("", attr)
  split("", order)
  n_attr = 0

  raw = $9
  sub(/[ ;]*$/, "", raw)                  # trim trailing `;' and spaces
  n_pairs = split(raw, pairs, / *; */)    # split attributes into pairs

  # Walk the pairs by index: `for (i in pairs)' has unspecified order.
  for (i = 1; i <= n_pairs; i++) {
    pair = pairs[i]
    sub(/^ +/, "", pair)
    if (pair == "")
      continue
    # The key is everything before the first space; the value is the rest,
    # so values containing spaces (e.g. quoted strings) stay intact.
    sp = index(pair, " ")
    if (sp == 0) {
      set_attr(pair, "")
    } else {
      value = substr(pair, sp + 1)
      sub(/^ +/, "", value)
      set_attr(substr(pair, 1, sp - 1), value)
    }
  }

  # fill missing fields (order matters: transcript_name uses transcript_id)
  fill_attr("gene_name", "gene_id")
  fill_attr("transcript_id", "gene_id")
  fill_attr("transcript_name", "transcript_id")

  # recreate the attributes field in first-seen order
  attr_all = sep = ""
  for (i = 1; i <= n_attr; i++) {
    k = order[i]
    attr_all = attr_all sep k " " attr[k]
    sep = "; "
  }

  # update the record with new attributes and print it
  $9 = attr_all
  print
}
IyEvdXNyL2Jpbi9hd2sgLWYKQkVHSU4gewogIEZTPU9GUz0iXHQiCn0KewogIGdzdWIoLzsgKiQvLCAiIiwgJDkpICAgICAgICAjIHRyaW0gdHJhaWxpbmcgYDsnCiAgc3BsaXQoJDksIHBhaXJzLCAvICo7ICovKSAgICMgc3BsaXQgYXR0cmlidXRlcyBpbnRvIHBhaXJzCiAgZm9yIChpIGluIHBhaXJzKSB7CiAgICBzcGxpdChwYWlyc1tpXSwga3YsIC8gKi8pICMgc3BsaXQgcGFpciBpbnRvIGtleSBhbmQgdmFsdWUKICAgIGF0dHJba3ZbMV1dID0ga3ZbMl0gICAgICAgIyBhZGQgaXQgdG8gYGF0dHInCiAgfQogICMgZmlsbCBtaXNzaW5nIGZpZWxkcwogIGlmICghKCJnZW5lX25hbWUiIGluIGF0dHIpKQogICAgYXR0clsiZ2VuZV9uYW1lIl0gPSBhdHRyWyJnZW5lX2lkIl0KICBpZiAoISgidHJhbnNjcmlwdF9pZCIgaW4gYXR0cikpCiAgICBhdHRyWyJ0cmFuc2NyaXB0X2lkIl0gPSBhdHRyWyJnZW5lX2lkIl0KICBpZiAoISgidHJhbnNjcmlwdF9uYW1lIiBpbiBhdHRyKSkKICAgIGF0dHJbInRyYW5zY3JpcHRfbmFtZSJdID0gYXR0clsidHJhbnNjcmlwdF9pZCJdOwogICMgcmVjcmVhdGUgdGhlIGF0dHJpYnV0ZXMgZmllbGQKICBhdHRyX2FsbCA9IHNlcCA9ICIiCiAgZm9yIChrIGluIGF0dHIpIHsKICAgIGF0dHJfYWxsID0gYXR0cl9hbGwgc2VwIGsgIiAiIGF0dHJba10KICAgIHNlcCA9ICI7ICIKICB9CiAgIyB1cGRhdGUgdGhlIHJlY29yZCB3aXRoIG5ldyBhdHRyaWJ1dGVzCiAgJDkgPSBhdHRyX2FsbCAKfQoxICMgcHJpbnQgcmVjb3Jk
MQllbnNlbWJsCWdlbmUJNTI3MwkxMDA2MQkuCS0JLglnZW5lX2lkIEVOU0dBTEcwMDAwMDA1NDgxODsgZ2VuZV92ZXJzaW9uIDE7IGdlbmVfc291cmNlIGVuc2VtYmw7IGdlbmVfYmlvdHlwZSBwcm90ZWluX2NvZGluZzsKMQllbnNlbWJsCXRyYW5zY3JpcHQJNTI3MwkxMDA2MQkuCS0JLglnZW5lX2lkIEVOU0dBTEcwMDAwMDA1NDgxODsgZ2VuZV92ZXJzaW9uIDE7IHRyYW5zY3JpcHRfaWQgRU5TR0FMVDAwMDAwMDk4OTg0OyB0cmFuc2NyaXB0X3ZlcnNpb24gMTsgZ2VuZV9zb3VyY2UgZW5zZW1ibDsgZ2VuZV9iaW90eXBlIHByb3RlaW5fY29kaW5nOyB0cmFuc2NyaXB0X3NvdXJjZSBlbnNlbWJsOyB0cmFuc2NyaXB0X2Jpb3R5cGUgcHJvdGVpbl9jb2Rpbmc7CjEJZW5zZW1ibAlnZW5lCTU4NDI3CTU4NjE3CS4JKwkuCWdlbmVfaWQgRU5TR0FMRzAwMDAwMDQ3NTk0OyBnZW5lX3ZlcnNpb24gMTsgZ2VuZV9uYW1lIFJGMDAwMDQ7IGdlbmVfc291cmNlIGVuc2VtYmw7IGdlbmVfYmlvdHlwZSBzblJOQTsKMQllbnNlbWJsCXRyYW5zY3JpcHQJNTg0MjcJNTg2MTcJLgkrCS4JZ2VuZV9pZCBFTlNHQUxHMDAwMDAwNDc1OTQ7IGdlbmVfdmVyc2lvbiAxOyB0cmFuc2NyaXB0X2lkIEVOU0dBTFQwMDAwMDA5NDM4MjsgdHJhbnNjcmlwdF92ZXJzaW9uIDE7IGdlbmVfbmFtZSBSRjAwMDA0OyBnZW5lX3NvdXJjZSBlbnNlbWJsOyBnZW5lX2Jpb3R5cGUgc25STkE7IHRyYW5zY3JpcHRfbmFtZSBSRjAwMDA0LTIwMTsgdHJhbnNjcmlwdF9zb3VyY2UgZW5zZW1ibDsgdHJhbnNjcmlwdF9iaW90eXBlIHNuUk5BOwoxCWVuc2VtYmwJZXhvbgk1ODQyNwk1ODYxNwkuCSsJLglnZW5lX2lkIEVOU0dBTEcwMDAwMDA0NzU5NDsgZ2VuZV92ZXJzaW9uIDE7IHRyYW5zY3JpcHRfaWQgRU5TR0FMVDAwMDAwMDk0MzgyOyB0cmFuc2NyaXB0X3ZlcnNpb24gMTsgZXhvbl9udW1iZXIgMTsgZ2VuZV9uYW1lIFJGMDAwMDQ7IGdlbmVfc291cmNlIGVuc2VtYmw7IGdlbmVfYmlvdHlwZSBzblJOQTsgdHJhbnNjcmlwdF9uYW1lIFJGMDAwMDQtMjAxOyB0cmFuc2NyaXB0X3NvdXJjZSBlbnNlbWJsOyB0cmFuc2NyaXB0X2Jpb3R5cGUgc25STkE7IGV4b25faWQgRU5TR0FMRTAwMDAwNDYwMTI1OyBleG9uX3ZlcnNpb24gMTsKMQllbnNlbWJsCWdlbmUJNjMyNjQJNjM0NTQJLgkrCS4JZ2VuZV9pZCBFTlNHQUxHMDAwMDAwNDkyMDY7IGdlbmVfdmVyc2lvbiAxOyBnZW5lX25hbWUgUkYwMDAwNDsgZ2VuZV9zb3VyY2UgZW5zZW1ibDsgZ2VuZV9iaW90eXBlIHNuUk5BOwoxCWVuc2VtYmwJdHJhbnNjcmlwdAk2MzI2NAk2MzQ1NAkuCSsJLglnZW5lX2lkIEVOU0dBTEcwMDAwMDA0OTIwNjsgZ2VuZV92ZXJzaW9uIDE7IHRyYW5zY3JpcHRfaWQgRU5TR0FMVDAwMDAwMDkyNzgwOyB0cmFuc2NyaXB0X3ZlcnNpb24gMTsgZ2VuZV9uYW1lIFJGMDAwMDQ7IGdlbmVfc291cmNlIGVuc2VtYmw7IGdlbmVfYmlvdHlwZSBzblJOQTsgdHJhbnNjcmlwdF9uYW1lIFJGMDAwMDQtMjAxOyB0cmFuc2Ny
aXB0X3NvdXJjZSBlbnNlbWJsOyB0cmFuc2NyaXB0X2Jpb3R5cGUgc25STkE7CjEJZW5zZW1ibAlleG9uCTYzMjY0CTYzNDU0CS4JKwkuCWdlbmVfaWQgRU5TR0FMRzAwMDAwMDQ5MjA2OyBnZW5lX3ZlcnNpb24gMTsgdHJhbnNjcmlwdF9pZCBFTlNHQUxUMDAwMDAwOTI3ODA7IHRyYW5zY3JpcHRfdmVyc2lvbiAxOyBleG9uX251bWJlciAxOyBnZW5lX25hbWUgUkYwMDAwNDsgZ2VuZV9zb3VyY2UgZW5zZW1ibDsgZ2VuZV9iaW90eXBlIHNuUk5BOyB0cmFuc2NyaXB0X25hbWUgUkYwMDAwNC0yMDE7IHRyYW5zY3JpcHRfc291cmNlIGVuc2VtYmw7IHRyYW5zY3JpcHRfYmlvdHlwZSBzblJOQTsgZXhvbl9pZCBFTlNHQUxFMDAwMDA1MDE5NDE7IGV4b25fdmVyc2lvbiAxOw==
1 ensembl gene 5273 10061 . - . gene_id ENSGALG00000054818; gene_version 1; gene_source ensembl; gene_biotype protein_coding;
1 ensembl transcript 5273 10061 . - . gene_id ENSGALG00000054818; gene_version 1; transcript_id ENSGALT00000098984; transcript_version 1; gene_source ensembl; gene_biotype protein_coding; transcript_source ensembl; transcript_biotype protein_coding;
1 ensembl gene 58427 58617 . + . gene_id ENSGALG00000047594; gene_version 1; gene_name RF00004; gene_source ensembl; gene_biotype snRNA;
1 ensembl transcript 58427 58617 . + . gene_id ENSGALG00000047594; gene_version 1; transcript_id ENSGALT00000094382; transcript_version 1; gene_name RF00004; gene_source ensembl; gene_biotype snRNA; transcript_name RF00004-201; transcript_source ensembl; transcript_biotype snRNA;
1 ensembl exon 58427 58617 . + . gene_id ENSGALG00000047594; gene_version 1; transcript_id ENSGALT00000094382; transcript_version 1; exon_number 1; gene_name RF00004; gene_source ensembl; gene_biotype snRNA; transcript_name RF00004-201; transcript_source ensembl; transcript_biotype snRNA; exon_id ENSGALE00000460125; exon_version 1;
1 ensembl gene 63264 63454 . + . gene_id ENSGALG00000049206; gene_version 1; gene_name RF00004; gene_source ensembl; gene_biotype snRNA;
1 ensembl transcript 63264 63454 . + . gene_id ENSGALG00000049206; gene_version 1; transcript_id ENSGALT00000092780; transcript_version 1; gene_name RF00004; gene_source ensembl; gene_biotype snRNA; transcript_name RF00004-201; transcript_source ensembl; transcript_biotype snRNA;
1 ensembl exon 63264 63454 . + . gene_id ENSGALG00000049206; gene_version 1; transcript_id ENSGALT00000092780; transcript_version 1; exon_number 1; gene_name RF00004; gene_source ensembl; gene_biotype snRNA; transcript_name RF00004-201; transcript_source ensembl; transcript_biotype snRNA; exon_id ENSGALE00000501941; exon_version 1;