-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathgetCodingDensity.pl
More file actions
executable file
·107 lines (74 loc) · 2.58 KB
/
Copy pathgetCodingDensity.pl
File metadata and controls
executable file
·107 lines (74 loc) · 2.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/perl -w
=head1 NAME
getCodingDensity.pl - estimates coding density of a sequence using its genbank file
=head1 USAGE
getCodingDensity.pl -g|gbk [.gbk] -f|features [comma separated features] > [report.out]
=head1 DESCRIPTION
Estimates the coding density of a sequence using its genbank file.
Works also when there are multiple contigs within the genbank file.
Incorporates also ori spanning features.
=head1 EXAMPLE
getCodingDensity.pl -g Holospora_undulata_HU1.gbk -f CDS,rRNA,tRNA > Holospora_codingDensity.report
=head1 TODO
Script is still quite ineffecient.
Will load regions that code for multiple coding features (for example overlapping CDS) multiple times in the hash.
This should be in principle unnecessary.
=head1 AUTHOR
Joran Martijn (joran.martijn@icm.uu.se)
=cut
use strict;
use Bio::SeqIO;
use Getopt::Long;
use Pod::Usage;
pod2usage (-msg => "Not enough arguments") unless @ARGV == 4;
my ($gbk,$fts);
GetOptions('g|gbk=s' => \$gbk,
'f|features=s' => \$fts);
my $in = Bio::SeqIO->new(-file => $gbk,
-format => 'genbank');
my @fts = split ',', $fts;
my $total_coding = 0;
my $total_genome_size = 0;
my $contig_count = 0;
while (my $seq = $in->next_seq) {
# declare data structures
my %coding_sites;
my $contig_size;
# add 1 to contig count
$contig_count++;
# loop over features of first contig
for my $ft ($seq->get_SeqFeatures) {
# get contig size
$contig_size = $ft->end() if ($ft->primary_tag eq 'source');
# skip all non-coding features
next unless ($ft->primary_tag ~~ @fts);
## count coding sites
# if CDS spans ori
if ( $ft->location->isa('Bio::Location::SplitLocationI') ) {
for my $loc ( $ft->location->sub_Location ) {
# print $loc->start . ".." . $loc->end . "\n";
my ($start, $end) = ($loc->start, $loc->end);
for (my $i = $start; $i <= $end; $i++) {
$coding_sites{$i}++;
}
}
}
# all other CDSs
else {
# print $start, "\t", $end, "\n";
my ($start, $end) = ($ft->start, $ft->end);
for (my $i = $start; $i <= $end; $i++) {
$coding_sites{$i}++;
}
}
}
# add number of coding sites of first contig to total count
my $coding = scalar keys %coding_sites;
$total_coding += $coding;
# add contig size to total assembly/genome size
$total_genome_size += $contig_size;
}
print "NUMBER OF CONTIGS: ", "\t", $contig_count, "\n";
print "CODING: ", "\t", $total_coding, "\n";
print "GENOME SIZE: ", "\t", $total_genome_size, "\n";
print "CODING DENSITY: ", "\t"; printf("%.3f", $total_coding / $total_genome_size); print "\n";