I have a file.txt, and I am trying to extract the rows when the last column ($9) contains the string gene_type=transcribed_unprocessed
I tried awk '{ if [[$9 == *"gene_type=transcribed_unprocessed"*]] { print }}' file.txt > output.txt
and also awk -F, '$9 ~ /gene_type=transcribed_unprocessed/' file.txt > output.txt
but they both gave me empty output files...
The ideal output would give me rows 1, 3, 4, 6 as they contain the string"gene_type=transcribed_unprocessed"
Any suggestions would be greatly appreciated, thank you.
file.txt:
chr1 HAVANA gene 29554 31109 . . ID=ENSG00000243485.5;gene_id=ENSG00000243485.5;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;level=2;hgnc_id=HGNC:52482;tag=ncRNA_host;havana_gene=OTTHUMG00000000959.2
chr1 HAVANA transcript 29554 31097 . . ID=ENST00000473358.1;Parent=ENSG00000243485.5;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=lncRNA;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 29554 30039 . . ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=1;exon_id=ENSE00001947070.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 30564 30667 . . ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=2;exon_id=ENSE00001922571.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 30976 31097 . . ID=exon:ENST00000473358.1:3;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=lncRNA;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=3;exon_id=ENSE00001827679.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA transcript 30267 31109 . . ID=ENST00000469289.1;Parent=ENSG00000243485.5;gene_id=ENSG00000243485.5;transcript_id=ENST00000469289.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-201;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,basic;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002841.2
chr1 HAVANA exon 30267 30667 . . ID=exon:ENST00000469289.1:1;Parent=ENST00000469289.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000469289.1;gene_type=lncRNA;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-201;exon_number=1;exon_id=ENSE00001841699.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,basic;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002841.2
CodePudding user response:
[[$9 == *"gene_type=transcribed_unproc>"*]]
-- this is (almost but not quite) bash syntax. Awk syntax is more C-like.
Next, judging by the sample data, the lines end with the given string.
awk '/gene_type=transcribed_unproc>$/' file.txt > output.txt
See https://stackoverflow.com/tags/awk/info for more into about awk.
OK, I see the updated question.
grep -F 'gene_type=transcribed_unprocessed' file.txt
awk '/gene_type=transcribed_unprocessed/' file.txt
awk 'index($0, "gene_type=transcribed_unprocessed") != 0' file.txt
CodePudding user response:
and also
awk -F, '$9 ~ /gene_type=transcribed_unprocessed/' file.txt > output.txt
but they both gave me empty output files...
Your file.txt
chr1 HAVANA gene 29554 31109 . . ID=ENSG00000243485.5;gene_id=ENSG00000243485.5;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;level=2;hgnc_id=HGNC:52482;tag=ncRNA_host;havana_gene=OTTHUMG00000000959.2
chr1 HAVANA transcript 29554 31097 . . ID=ENST00000473358.1;Parent=ENSG00000243485.5;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=lncRNA;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 29554 30039 . . ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=1;exon_id=ENSE00001947070.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 30564 30667 . . ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=2;exon_id=ENSE00001922571.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 30976 31097 . . ID=exon:ENST00000473358.1:3;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=lncRNA;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=3;exon_id=ENSE00001827679.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA transcript 30267 31109 . . ID=ENST00000469289.1;Parent=ENSG00000243485.5;gene_id=ENSG00000243485.5;transcript_id=ENST00000469289.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-201;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,basic;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002841.2
chr1 HAVANA exon 30267 30667 . . ID=exon:ENST00000469289.1:1;Parent=ENST00000469289.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000469289.1;gene_type=lncRNA;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-201;exon_number=1;exon_id=ENSE00001841699.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,basic;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002841.2
is not comma (,
) separated, it does contain some ,
as part of data, but at most 3 per line, so $9
is always empty and thus condition never holds, your data is apparently 1-or-more-whitespaces separated, so GNU AWK
by default would parse it in expected way - there is no need to set field separator, that is
awk '$9 ~ /gene_type=transcribed_unprocessed/' file.txt
gives output
chr1 HAVANA gene 29554 31109 . . ID=ENSG00000243485.5;gene_id=ENSG00000243485.5;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;level=2;hgnc_id=HGNC:52482;tag=ncRNA_host;havana_gene=OTTHUMG00000000959.2
chr1 HAVANA exon 29554 30039 . . ID=exon:ENST00000473358.1:1;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=1;exon_id=ENSE00001947070.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA exon 30564 30667 . . ID=exon:ENST00000473358.1:2;Parent=ENST00000473358.1;gene_id=ENSG00000243485.5;transcript_id=ENST00000473358.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-202;exon_number=2;exon_id=ENSE00001922571.1;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,dotter_confirmed,basic,Ensembl_canonical;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002840.1
chr1 HAVANA transcript 30267 31109 . . ID=ENST00000469289.1;Parent=ENSG00000243485.5;gene_id=ENSG00000243485.5;transcript_id=ENST00000469289.1;gene_type=transcribed_unprocessed;gene_name=MIR1302-2HG;transcript_type=lncRNA;transcript_name=MIR1302-2HG-201;level=2;transcript_support_level=5;hgnc_id=HGNC:52482;tag=not_best_in_genome_evidence,basic;havana_gene=OTTHUMG00000000959.2;havana_transcript=OTTHUMT00000002841.2
Be warned that this does always check against 9th column, if you want to check against last field then you should do
awk '$NF ~ /gene_type=transcribed_unprocessed/' file.txt
(tested in gawk 4.2.1)