ensembl-hive  2.8.1
karyotype_rank_assigner.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 use strict;
19 use warnings;
20 
22 use Bio::EnsEMBL::Utils::Net qw/do_FTP/;
23 use Bio::EnsEMBL::Utils::Exception qw/throw/;
24 use Getopt::Long;
25 
26 my ($db_name, $db_host, $db_user, $db_pass, $db_port, $help, $species, $group, $no_interactive);
27 my @user_karyotype;
28 
29 my $NCBI_BASE = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All';
30 
31 $db_port = 3306;
32 $species = 'human';
33 $group = 'core';
34 
35 GetOptions(
36  "db_name|dbname|database=s" => \$db_name,
37  "db_host|dbhost|host=s" => \$db_host,
38  "db_user|dbuser|user|username=s" => \$db_user,
39  "db_pass|dbpass|pass|password=s" => \$db_pass,
40  "db_port|dbport|port=s" => \$db_port,
41  "species=s" => \$species,
42  'group=s' => \$group,
43  'karyotype=s@' => \@user_karyotype,
44  'no_interactive!' => \$no_interactive,
45  "h!" => \$help,
46  "help!" => \$help,
47 );
48 
49 if ($help) { &usage; exit 0; }
50 unless ($db_name and $db_host) { print "Insufficient arguments\n"; &usage; exit 1; }
51 
52 if(@user_karyotype) {
53  @user_karyotype = strings_to_array(@user_karyotype);
54 }
55 
56 sub get_adaptor {
58  -species => $species,
59  -group => $group,
60  -dbname => $db_name,
61  -host => $db_host,
62  -user => $db_user,
63  -port => $db_port
64  );
65  $dba->dbc->password($db_pass) if $db_pass;
66  return $dba;
67 }
68 
69 sub run {
70  my $dba = get_adaptor();
71 
72  my @karyotype;
73  if(@user_karyotype) {
74  @karyotype = @user_karyotype;
75  }
76  else {
77  @karyotype = @{ncbi_karyotype($dba)};
78  }
79 
80  print_karyotype(@karyotype);
81 
82  my $ok_to_write = confirm('Is this correct?');
83  if(!$ok_to_write) {
84  my $new_karyotype = capture_user_input("Please give the correct karyotype (comma separated)");
85  @karyotype = strings_to_array($new_karyotype);
86  print_karyotype(@karyotype);
87  }
88 
89  my $slices = karyotype_to_Slices($dba, @karyotype);
90  my $has_karyotype = has_karyotype($slices);
91  my $overwrite = 0;
92  if($has_karyotype) {
93  $overwrite = confirm('Karyotypes have already been assigned. Are you sure you want to overwrite');
94  if(!$overwrite) {
95  die "Cannot continue. Karyotypes have already been assigned to these Slices";
96  }
97  }
98 
99  write_ranks($dba, $slices);
100 
101  return;
102 }
103 
104 sub accession {
105  my ($dba) = @_;
106  my $mc = $dba->get_MetaContainer();
107  my $acc = $mc->single_value_by_key('assembly.accession', 1);
108  if(!$acc) {
109  print "Cannot continue. Species does not have a valid Genome Collections accession in its meta table\n";
110  exit 1;
111  }
112  return $acc;
113 }
114 
115 sub ncbi_karyotype {
116  my ($dba) = @_;
117  my $accession = accession($dba);
118  print STDOUT "Fetching karyotype from NCBI using accession '$accession'\n";
119  my $url = "$NCBI_BASE/${accession}.assembly.txt";
120  my $content = do_FTP($url, 5, 2);
121  my @lines = split(/\n/, $content);
122  my @karyotype;
123  foreach my $line (@lines) {
124  next if $line =~ /^#/;
125  chomp $line;
126  my ($name, $role) = split(/\t/, $line);
127  if($role eq 'assembled-molecule') {
128  push(@karyotype, $name);
129  }
130  }
131  return \@karyotype;
132 }
133 
135  my ($dba, @karyotype) = @_;
136  my $sa = $dba->get_SliceAdaptor();
137  my @slices;
138  foreach my $name (@karyotype) {
139  my $slice = $sa->fetch_by_region('toplevel', $name);
140  push(@slices, $slice);
141  }
142  return \@slices;
143 }
144 
145 sub confirm {
146  my ($question) = @_;
147  if($no_interactive) {
148  return 1;
149  }
150  my $userinput = capture_user_input("$question [y(es)/N(o)]");
151  return ($userinput =~ /^y(?:es)?$/xmsi) ? 1 : 0;
152 }
153 
154 sub capture_user_input {
155  my ($msg) = @_;
156  die "Cannot continue; asking for user input but -no_interactive is on" if $no_interactive;
157  print STDOUT "$msg: ";
158  my $userinput = <STDIN>;
159  chomp ($userinput);
160  return $userinput;
161 }
162 
163 sub print_karyotype {
164  my (@karyotype) = @_;
165  print STDOUT "Karyotype to be used: ";
166  print STDOUT join(q{,},@karyotype);
167  print "\n";
168  return;
169 }
170 
171 sub strings_to_array {
172  return split(q{,}, join(q{,}, @_));
173 }
174 
175 sub has_karyotype {
176  my ($slices) = @_;
177  my $has_karyotype = 0;
178  foreach my $slice (@{$slices}) {
179  if($slice->has_karyotype()) {
180  $has_karyotype = 1;
181  last;
182  }
183  }
184  return $has_karyotype;
185 }
186 
187 sub write_ranks {
188  my ($dba, $slices) = @_;
189  my $aa = $dba->get_AttributeAdaptor();
190  my $code = 'karyotype_rank';
191  my $rank = 1;
192  foreach my $slice (@{$slices}) {
193  printf STDOUT '%s has been assigned rank %d', $slice->seq_region_name(), $rank;
194  print "\n";
195  $aa->remove_from_Slice($slice, $code);
196  $aa->store_on_Slice($slice, $code, $rank);
197  $rank++;
198  }
199  return;
200 }
201 
202 sub usage {
203  print "Description:
204 
205 A program for writing karyotype ranks into a core-like database. If one is
206 not specified then we query NCBI's assembly report resource for a candidate
207 list. The program also allows you to specify a custom rank if required.
208 
209 This module uses LWP to communicate with NCBI's FTP site. Should you have
210 any issues please consult its documentation on how to debug the issue. Also
211 make sure you can access $NCBI_BASE
212 
213 Synopsis:
214 
215  perl karyotype_rank_assigner.pl -db_name NAME -db_host HOST -db_user USR -db_pass PASS -species human -group GRP
216 
217 Options:
218 
219  -db_name The DB to add the rank to
220  -database
221  -dbname
222 
223  -db_host Hostname for the DB
224  -host
225  -dbhost
226 
227  -db_user Username for the DB
228  -user
229  -username
230  -dbuser
231 
232  -db_pass Password for the DB
233  -pass
234  -password
235  -dbpass
236 
237  -db_port Port for the DB
238  -dbport
239  -port
240 
241  -species Name of the species; defaults to human
242  -group Name of the DB group; defaults to core
243 
244  -karyotype Comma separated list of karyotypes to write. Also
245  supports specifying this option multiple times
246 
247  -no_interactive Do not prompt for confirmation
248 
249  -help Print this message
250 ";
251 }
252 
253 run();
Bio::EnsEMBL::Utils::Net
Definition: Net.pm:25
confirm
public confirm()
run
public run()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
print_karyotype
public print_karyotype()
karyotype_to_Slices
public karyotype_to_Slices()
Bio::EnsEMBL::DBSQL::DBAdaptor::dbc
public Bio::EnsEMBL::DBSQL::DBConnection dbc()
accession
public accession()
ncbi_karyotype
public ncbi_karyotype()
capture_user_input
public capture_user_input()
strings_to_array
public strings_to_array()
Bio::EnsEMBL::DBSQL::DBConnection::password
public String password()
has_karyotype
public has_karyotype()
usage
public usage()
get_adaptor
public get_adaptor()
Bio::EnsEMBL::DBSQL::DBAdaptor::new
public Bio::EnsEMBL::DBSQL::DBAdaptor new()
write_ranks
public write_ranks()
Bio::EnsEMBL::Utils::Exception
Definition: Exception.pm:68