ensembl-hive  2.7.0
xref_parser.pl
Go to the documentation of this file.
1 #!/usr/bin/env perl
2 # See the NOTICE file distributed with this work for additional information
3 # regarding copyright ownership.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 use strict;
18 use warnings;
19 
20 use Getopt::Long qw(:config pass_through);
23 use File::Basename;
24 
# Command-line settings, populated by GetOptions below.
my ( $host,    $port,    $user,
     $pass,    $dbname,  $release,
     $species, $taxon,   $division_id,
     $parser,  $source,  $file,
     $db,      $keep_db, $help );

# Snapshot of the raw command line, taken before GetOptions consumes @ARGV.
my $options = join(" ", @ARGV);

# NOTE(review): this echoes -pass/-dbpass to STDOUT in clear text - confirm
# that is acceptable wherever the output of this script is logged.
print "Options: $options\n";

# Getopt::Long is configured with pass_through, so anything it does not
# recognise (or cannot parse) is left in @ARGV instead of raising an error.
GetOptions(
  'dbuser|user=s' => \$user,
  'dbpass|pass=s' => \$pass,
  'dbhost|host=s' => \$host,
  'dbport|port=i' => \$port,
  'dbname=s'      => \$dbname,
  'release=s'     => \$release,
  'species=s'     => \$species,
  'taxon=s'       => \$taxon,
  'division_id=s' => \$division_id,
  'parser=s'      => \$parser,
  'source=s'      => \$source,
  'file=s'        => \$file,
  'db=s'          => \$db,
  'keep_db=s'     => \$keep_db,
  'help'          => sub { usage(); exit(0); } );

# Whatever is still in @ARGV was not understood. Test the array itself rather
# than $ARGV[0], so that a leftover "0" or empty string is reported as well.
if (@ARGV) {
  print STDERR "Unknown command line arguments:-\n";
  foreach my $unknown_arg (@ARGV) {
    print STDERR "\t".$unknown_arg."\n";
  }
  print STDERR "Stopping script. Please fix the command line.\n";
  print STDERR "use -help for full list of command line options.\n";
  exit(1);
}

# -host, -species and -parser are mandatory.
if ( !$host || !$species || !$parser ) {
  usage();
  exit(1);
}
66 
# NOTE(review): Bio::EnsEMBL::Registry and XrefParser::Database are used below
# as class names; this assumes both modules are loaded at the top of the
# file - TODO confirm.

# Register the databases found on the given server for the requested release.
my $registry = 'Bio::EnsEMBL::Registry';
$registry->load_registry_from_multiple_dbs(
  {
    -host       => $host,
    -port       => $port,
    -user       => $user,
    -pass       => $pass,
    -db_version => $release
  });

# Fill in taxon and/or division id from the core database when they were not
# given on the command line. The adaptor is fetched once and shared.
if ( !defined $taxon || !defined $division_id ) {
  my $meta_container = $registry->get_adaptor($species, 'core', 'MetaContainer');
  die "Could not get a core MetaContainer adaptor for species '$species'\n"
    unless defined $meta_container;

  if ( !defined $taxon ) {
    $taxon = $meta_container->get_taxonomy_id();
  }

  if ( !defined $division_id ) {
    my $division = $meta_container->get_division();
    # Division name => id passed to get_source_id() as $division_id.
    my %division_taxon = (
      'Ensembl'            => 7742,
      'EnsemblVertebrates' => 7742,
      'Vertebrates'        => 7742,
      'EnsemblMetazoa'     => 33208,
      'Metazoa'            => 33208
    );
    # Unknown or missing divisions leave $division_id undefined.
    $division_id = defined $division ? $division_taxon{$division} : undef;
  }
}

# Directory of this script, handed to XrefParser::Database::create().
my $sql_dir = dirname($0);

my $xref_dbc = XrefParser::Database->new({
  host   => $host,
  dbname => $dbname,
  port   => $port,
  user   => $user,
  pass   => $pass });
# Unless -keep_db is set, (re)create the xref database before parsing.
$xref_dbc->create($sql_dir, 1, 1) unless $keep_db;
my $xref_dbi = $xref_dbc->dbi();

# Load the requested parser class. $parser comes from the command line and
# ends up in a string eval, so only accept a plain package-name suffix.
die "Invalid parser name '$parser'\n"
  unless $parser =~ /\A\w+(?:::\w+)*\z/;
my $module = "XrefParser::$parser";
eval "require $module; 1"
  or die "Could not load parser module '$module': $@";
my $xref_run = $module->new($xref_dbc);

my $source_id = get_source_id($xref_dbi, $parser, $taxon, $source, $division_id);

if (defined $db) {
  # Parsers that need a species database are driven through run_script().
  my $dba = $registry->get_DBAdaptor($species, $db);
  die "Could not get a '$db' database adaptor for species '$species'\n"
    unless defined $dba;
  $dba->dbc()->disconnect_if_idle();
  $xref_run->run_script( { source_id  => $source_id,
                           species_id => $taxon,
                           dba        => $dba,
                           dbi        => $xref_dbi,
                           species    => $species,
                           file       => $file } );
} else {
  # File-only parsers are driven through run() with a list of files.
  $xref_run->run( { source_id  => $source_id,
                    species_id => $taxon,
                    species    => $species,
                    dbi        => $xref_dbi,
                    files      => [ $file ] } );
}
130 
# Work out which source_id the parsed data should be attached to.
#
# Arguments:
#   $dbi         - database handle; only prepare() is called on it
#   $parser      - value compared with the "parser" column
#   $species_id  - value compared with the "species_id" column
#   $name        - source name fragment, matched as a substring (LIKE %name%);
#                  undef is treated as an empty fragment, i.e. matches any name
#   $division_id - used in place of $species_id for the fallback lookup
#
# Lookup order:
#   1. parser + species_id, accepted only when exactly one row matches;
#   2. parser + species_id + name, accepted only when exactly one row matches
#      (this overrides the result of step 1);
#   3. if nothing was accepted: parser + division_id + name, first row wins.
#
# Returns the source_id, or undef when no lookup produced one.
sub get_source_id {
  my ($dbi, $parser, $species_id, $name, $division_id) = @_;

  # Build the LIKE pattern without tripping over an undefined source name.
  my $name_pattern = '%' . (defined $name ? $name : '') . '%';

  my $by_parser_sql = "SELECT u.source_id FROM source_url u, source s WHERE s.source_id = u.source_id AND parser = ? AND species_id = ?";
  my $by_parser_sth = $dbi->prepare($by_parser_sql);
  my $by_name_sth   = $dbi->prepare("$by_parser_sql AND name LIKE ?");

  my $source_id;

  # Counting the returned rows replaces the separate count(*) queries.
  my @matches = _fetch_source_ids($by_parser_sth, $parser, $species_id);
  $source_id = $matches[0] if @matches == 1;

  @matches = _fetch_source_ids($by_name_sth, $parser, $species_id, $name_pattern);
  $source_id = $matches[0] if @matches == 1;

  # If no species-specific source, look for common sources
  if (!defined $source_id) {
    @matches = _fetch_source_ids($by_name_sth, $parser, $division_id, $name_pattern);
    $source_id = $matches[0];
  }

  $by_parser_sth->finish();
  $by_name_sth->finish();
  return $source_id;
}

# Execute a prepared single-column lookup and return all values it yields.
sub _fetch_source_ids {
  my ($sth, @bind_values) = @_;
  $sth->execute(@bind_values);
  my @source_ids;
  # List assignment keeps fetchrow_array in list context on every row.
  while ( my ($id) = $sth->fetchrow_array() ) {
    push @source_ids, $id;
  }
  return @source_ids;
}
160 
161 
162 
163 # --------------------------------------------------------------------------------
164 
# Print the command-line help to STDOUT.
# Takes no arguments; callers decide the exit status afterwards.
# Option names below are kept in step with the GetOptions specification.
sub usage {

  print <<"EOF";

  xref_parser.pl -host {host} -port {port} -user {user} -pass {pass} -dbname {dbname} -release {release} \\
                 -species {species} -taxon {taxon} -division_id {division_id} \\
                 -parser {parser} -source {source} -file {file} \\
                 -db {db} -keep_db {keep_db} \\
                 -help

  -host, -species and -parser are mandatory.

  -user         User name to access database. Must allow writing.

  -pass         Password for user.

  -host         Database host.

  -port         Database port.

  -dbname       Name of xref database to use/create.

  -release      Release version of the species to parse
                Used to find the right database on the server specified in the arguments

  -species      Which species to import.
                Species may be referred to by genus/species
                (e.g. homo_sapiens) or common aliases (e.g. human).
                Specifying an unknown species will cause a list
                of valid species to be printed.

  -taxon        Taxonomy id of the species to import.
                If omitted, it is looked up in the core database of the species.

  -division_id  Id of the division the species belongs to
                (7742 for vertebrates, 33208 for metazoa).
                If omitted, it is derived from the division of the core database.
                This defines which sources will be parsed and does
                not necessarily imply taxonomic relationship
                (e.g. ciona intestinalis is a vertebrate in this context)

  -parser       Which parser to run

  -source       Name of the source to extract data for (should match equivalent parser)

  -file         Location and name of the file to be parsed
                Path should be absolute

  -db           If the parser requires connection to a database, specify here
                For example, specify otherfeatures when running RefSeqCoordinateParser

  -keep_db      Set to a true value (e.g. 1) to re-use an existing xref database
                By default, deletes any existing one and creates a new one

EOF

}
218 
219 #--------------------------------------------------------------------------------
get_source_id
public get_source_id()
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
XrefParser::Database::new
public new()
run
public run()
usage
public usage()
XrefParser::Database
Definition: Database.pm:8