ensembl-hive  2.6
import_bed_simple_feature.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # Copyright [1999-2015] Wellcome Trust Sanger Institute and the EMBL-European Bioinformatics Institute
3 # Copyright [2016-2024] EMBL-European Bioinformatics Institute
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 # Designed to work on BED data such as that available from UCSC or 1KG:
19 # https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/supporting/accessible_genome_masks
20 #
21 
22 use strict;
23 use warnings;
24 
29 use Bio::EnsEMBL::Utils::IO qw/iterate_file/;
30 use POSIX qw/strftime/;
31 use Getopt::Long;
32 
33 my ($file,$db_name,$db_host,$db_user,$db_pass,$db_port,$help,$species,$group);
34 my ($dna_db_name,$dna_db_host,$dna_db_user,$dna_db_pass,$dna_db_port, $dna_group);
35 my ($logic_name, $description, $display_label);
36 my $write_every = -1;
37 $species = "human";
38 $group = 'core';
39 
40 GetOptions ("file=s" => \$file,
41  "db_name|dbname|database=s" => \$db_name,
42  "db_host|dbhost|host=s" => \$db_host,
43  "db_user|dbuser|user|username=s" => \$db_user,
44  "db_pass|dbpass|pass|password=s" => \$db_pass,
45  "db_port|dbport|port=s" => \$db_port,
46  "dna_db_name|dna_dbname|dna_database=s" => \$dna_db_name,
47  "dna_db_host|dna_dbhost|dna_host=s" => \$dna_db_host,
48  "dna_db_user|dna_dbuser|dna_user|dna_username=s" => \$dna_db_user,
49  "dna_db_pass|dna_dbpass|dna_pass|dna_password=s" => \$dna_db_pass,
50  "dna_db_port|dna_dbport|dna_port=s" => \$dna_db_port,
51  "dna_group=s" => \$dna_group,
52  "species=s" => \$species,
53  'group=s' => \$group,
54  'logic_name=s' => \$logic_name,
55  'description=s' => \$description,
56  'display_label=s' => \$display_label,
57  'write_every=i' => \$write_every,
58  "h!" => \$help,
59  "help!" => \$help,
60 );
61 
62 if ($help) {&usage; exit 0;}
63 unless ($file and $db_name and $db_host) {print "Insufficient arguments\n"; &usage; exit 1;}
64 unless ($logic_name) { print "No logic name given\n"; usage(); exit 1; }
65 
66 sub get_adaptor {
68  -species => $species,
69  -group => $group,
70  -dbname => $db_name,
71  -host => $db_host,
72  -user => $db_user,
73  -port => $db_port
74  );
75  $dba->dbc->password($db_pass) if $db_pass;
76 
77  if($dna_db_name) {
78  $dna_group ||= 'core';
79  $dna_db_host ||= $db_host;
80  $dna_db_port ||= $db_port;
81  $dna_db_user ||= $db_user;
82  my $dna_dba = Bio::EnsEMBL::DBSQL::DBAdaptor->new(
83  -species => $species,
84  -group => $dna_group,
85  -dbname => $dna_db_name,
86  -host => $dna_db_host,
87  -user => $dna_db_user,
88  -port => $dna_db_port
89  );
90  $dna_dba->dbc->password($dna_db_pass) if $dna_db_pass;
91  $dba->dnadb($dna_dba);
92  }
93  return $dba;
94 }
95 
96 # see bottom of file for this method call
97 sub run {
98  my $dba = get_adaptor();
99  if(! -f $file) {
100  die "No file found at $file";
101  }
102  process_file($file, $dba);
103  return;
104 }
105 
106 sub process_file {
107  my ($f, $dba) = @_;
108  my $analysis = get_Analysis($dba);
109  my @features;
110  my $count = 0;
111  my $commit_count = 0;
112  iterate_file($f, sub {
113  my ($line) = @_;
114  if($count != 0 && $count % 2000 == 0) {
115  info("Processed %s records", $count);
116  }
117  chomp $line;
118  my $sf = line_to_SimpleFeature($line, $analysis, $dba);
119  push(@features, $sf);
120  $count++;
121  $commit_count++;
122 
123  if($commit_count == $write_every) {
124  _store(\@features, $dba);
125  @features = ();
126  $commit_count = 0;
127  }
128  });
129  _store(\@features, $dba);
130  return;
131 }
132 
133 sub _store {
134  my ($features, $dba) = @_;
135  my $sfa = $dba->get_SimpleFeatureAdaptor();
136  my $count = scalar(@{$features});
137  if($count > 0) {
138  info("Writing %d feature(s)", $count);
139  $sfa->store(@{$features});
140  info("Done");
141  }
142  return;
143 }
144 
146  my ($line, $analysis, $dba) = @_;
147  my ($chr, $start, $end, $label, $score, $ucsc_strand) = split(/\t/, $line);
148  $start++; # UCSC is 0 idx start
149  #if was defined & -ve then set as so. +ve is default
150  my $strand = (defined $ucsc_strand && $strand eq '-') ? -1 : 1;
151  my $slice = get_Slice($chr, $dba);
152  my %args = (
153  -start => $start,
154  -end => $end,
155  -analysis => $analysis,
156  -slice => $slice,
157  -strand => $strand,
158  -display_label => $label,
159  );
160  $args{-SCORE} = $score if defined $score; # only add score if it was there
161  my $sf = Bio::EnsEMBL::SimpleFeature->new(%args);
162  return $sf;
163 }
164 
165 my %slices;
166 sub get_Slice {
167  my ($original, $dba) = @_;
168  my $name = $original;
169  $name =~ s/^chr//;
170  return $slices{$name} if exists $slices{name};
171  my $slice = $dba->get_SliceAdaptor()->fetch_by_region('toplevel', $name);
172  if(!$slice) {
173  die "Could not get a Slice from the Ensembl database for the given region '$original' or '$name' and coorindate system 'toplevel'. Check your core database";
174  }
175  $slices{$name} = $slice;
176  return $slice;
177 }
178 
179 sub get_Analysis {
180  my ($dba) = @_;
181  my $aa = $dba->get_AnalysisAdaptor();
182  my $analysis = $aa->fetch_by_logic_name($logic_name);
183  if(!$analysis) {
184  $analysis = Bio::EnsEMBL::Analysis->new(
185  -displayable => 1,
186  -logic_name => $logic_name,
187  );
188  $analysis->description($description) if $description;
189  $analysis->display_label($display_label) if $display_label;
190  $aa->store($analysis);
191  }
192  return $analysis;
193 }
194 
195 sub info {
196  my ($msg, @args) = @_;
197  my $m = sprintf $msg, @args;
198  my $time = strftime('%c',localtime());
199  printf STDERR '[%s] %s', $time, $m;
200  print STDERR "\n";
201  return;
202 }
203 
204 sub usage {
205  print "Launching instructions:
206  Run from a folder you are happy to have filled with files.
207 
208 Description:
209 
210 Import data from a BED file into the simple_feature table. Only supports
211 6 column BED files (location, name and score).
212 
213 Synopsis:
214 
215  perl import_bed_simple_feature.pl -file [PATH} -db_name NAME
216 
217 Options:
218 
219  -file Supply the file path
220  -logic_name Analysis logic name import data against
221 
222  -db_name The DB to add these features to
223  -database
224  -dbname
225 
226  -db_host Hostname for the DB
227  -host
228  -dbhost
229 
230  -db_user Username for the DB
231  -user
232  -username
233  -dbuser
234 
235  -db_pass Password for the DB
236  -pass
237  -password
238  -dbpass
239 
240  -db_port Port for the DB
241  -dbport
242  -port
243 
244  -dna_db_name The DNA DB to use if DB does not contain coordinate systems and DNA
245  -dna_database
246  -dna_dbname
247 
248  -dna_db_host Hostname for the DNA DB. Defaults to -host
249  -dna_host
250  -dna_dbhost
251 
252  -dna_db_user Username for the DNA DB. Defaults to -user
253  -dna_user
254  -dna_username
255  -dna_dbuser
256 
257  -dna_db_pass Password for the DNA DB. Defaults to -pass
258  -dna_pass
259  -dna_password
260  -dna_dbpass
261 
262  -dna_db_port Port for the DNA DB. Defaults to -port
263  -dna_dbport
264  -dna_port
265 
266  -dna_group
267 
268  -species Name of the species; defaults to human
269  -group Name of the DB group; defaults to core
270  -description Analysis description; only needed if analysis is not already in the DB
271  -display_label Analysis display label for the website; only needed if analysis is not already in the DB
272 
273  -write_every Write features once every N lines. Defaults to -1 (write once all records are parsed)
274  -help
275 ";
276 }
277 
278 run();
get_Analysis
public get_Analysis()
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
_store
protected _store()
Bio::EnsEMBL::DBSQL::DBAdaptor::dbc
public Bio::EnsEMBL::DBSQL::DBConnection dbc()
run
public run()
line_to_SimpleFeature
public line_to_SimpleFeature()
Bio::EnsEMBL::Utils::IO
Definition: IO.pm:80
Bio::EnsEMBL::DBSQL::SimpleFeatureAdaptor
Definition: SimpleFeatureAdaptor.pm:31
Bio::EnsEMBL::DBSQL::DBConnection::password
public String password()
Bio::EnsEMBL::Analysis
Definition: PairAlign.pm:3
Bio::EnsEMBL::DBSQL::DBAdaptor::new
public Bio::EnsEMBL::DBSQL::DBAdaptor new()
get_Slice
public get_Slice()
Bio::EnsEMBL::SimpleFeature
Definition: SimpleFeature.pm:31
info
public info()
get_adaptor
public get_adaptor()
usage
public usage()
process_file
public process_file()
Bio::EnsEMBL::SimpleFeature::new
public Bio::EnsEMBL::Feature new()