ensembl-hive  2.8.1
RFAMParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::RFAMParser;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use DBI;
26 
27 use base qw( XrefParser::BaseParser );
29 
30 
31 sub run_script {
32 
33  my ($self, $ref_arg) = @_;
34  my $source_id = $ref_arg->{source_id};
35  my $species_id = $ref_arg->{species_id};
36  my $species_name = $ref_arg->{species};
37  my $file = $ref_arg->{file};
38  my $verbose = $ref_arg->{verbose};
39  my $core_db = $ref_arg->{dba};
40  my $dbi = $ref_arg->{dbi};
41  $dbi = $self->dbi unless defined $dbi;
42 
43  if((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
44  croak "Need to pass source_id, species_id and file as pairs";
45  }
46  $verbose |=0;
47 
48  my $wget;
49  my $user = "ensro";
50  my $host;
51  my $port = 3306;
52  my $dbname;
53  my $pass;
54 
55  if($file =~ /wget[=][>](\S+?)[,]/){
56  $wget = $1;
57  }
58  if($file =~ /host[=][>](\S+?)[,]/){
59  $host = $1;
60  }
61  if($file =~ /port[=][>](\S+?)[,]/){
62  $port = $1;
63  }
64  if($file =~ /dbname[=][>](\S+?)[,]/){
65  $dbname = $1;
66  }
67  if($file =~ /pass[=][>](\S+?)[,]/){
68  $pass = $1;
69  }
70  if($file =~ /user[=][>](\S+?)[,]/){
71  $user = $1;
72  }
73 
74 
75  #get direct RFAM xrefs from core
76  my $registry = "Bio::EnsEMBL::Registry";
77  my $dba;
78 
79  #get the species name
80  my %id2name = $self->species_id2name($dbi);
81  if (defined $species_name) { push @{$id2name{$species_id}}, $species_name; }
82  if (!defined $id2name{$species_id}) { next; }
83  $species_name = $id2name{$species_id}[0];
84 
85  if ($host) {
87  '-host' => $host,
88  '-user' => $user,
89  '-pass' => $pass,
90  '-dbname' => $dbname,
91  '-port' => $port,
92  '-species' => $species_name,
93  '-group' => 'core',
94  );
95  } elsif (defined $core_db) {
96  $dba = $core_db;
97  } else {
98  $registry->load_registry_from_multiple_dbs(
99  {
100  '-host' => 'mysql-ens-sta-1',
101  '-port' => 4519,
102  '-user' => 'ensro',
103  },
104  );
105  $dba = $registry->get_DBAdaptor($species_name, 'core');
106  }
107 
108  my $rfam_sql = "select distinct t.stable_id, hit_name from analysis a join transcript t on (a.analysis_id = t.analysis_id and a.logic_name like 'ncrna%' and t.biotype != 'miRNA') join exon_transcript et on (t.transcript_id = et.transcript_id) join supporting_feature sf on (et.exon_id = sf.exon_id and sf.feature_type = 'dna_align_feature' ) join dna_align_feature df on (sf.feature_id = df.dna_align_feature_id) order by hit_name";
109 
110  my $sth = $dba->dbc->prepare($rfam_sql);
111  $sth->execute();
112 
113  #hash keyed on RFAM accessions, value is an array of ensembl transcript stable_ids
114  my %rfam_transcript_stable_ids;
115 
116  while (my ($stable_id, $hit_name) = $sth->fetchrow_array ) {
117 
118  my $rfam_id;
119  if ($hit_name =~ /^(RF\d+)/) {
120  $rfam_id = $1;
121  }
122  if ($rfam_id) {
123  push @{$rfam_transcript_stable_ids{$rfam_id}}, $stable_id;
124  }
125  }
126  $sth->finish;
127 
128  my @lines;
129  if (defined $wget) {
130  my $ua = LWP::UserAgent->new();
131  $ua->timeout(10);
132  $ua->env_proxy();
133  my $request = HTTP::Request->new(GET => $wget);
134  my $response = $ua->request($request);
135 
136  if ( !$response->is_success() ) {
137  warn($response->status_line);
138  return 1;
139  }
140  @lines = split(/\n\n\n/, $response->decoded_content);
141  } else {
142  my $file_io = $self->get_filehandle($file);
143  if ( !defined $file_io ) {
144  print "ERROR: Can't open RFAM file $file\n";
145  return 1;
146  }
147  my $temp_line;
148  while (my $line = $file_io->getline()) {
149  if ($line =~ /\/\//) {
150  push @lines, $temp_line;
151  $temp_line = '';
152  } else {
153  $temp_line .= $line . "\n";
154  }
155  }
156  }
157 
158  my @xrefs;
159  my $xref_count = 0;
160  my $direct_count = 0;
161 
162  while (my $entry = shift @lines) {
163 
164  my $xref;
165  chomp $entry;
166  next if (!$entry);
167 
168  my ($accession) = $entry =~ /#=GF\sAC\s+(\w+)/ ;
169  my ($label) = $entry =~ /#=GF\sID\s+([^\n]+)/;
170  my ($description) = $entry =~ /#=GF\sDE\s+([^\n]+)/;
171  if ($accession) {
172  if (exists($rfam_transcript_stable_ids{$accession})){
173  #add xref
174  my $xref_id = $self->add_xref({ acc => $accession,
175  version => 0,
176  label => $label || $accession ,
177  desc => $description,
178  source_id => $source_id,
179  species_id => $species_id,
180  dbi => $dbi,
181  info_type => "DIRECT"} );
182 
183  my @transcript_stable_ids = @{$rfam_transcript_stable_ids{$accession}};
184  foreach my $stable_id (@transcript_stable_ids){
185  $self->add_direct_xref($xref_id, $stable_id, "Transcript", "", $dbi);
186  $direct_count++;
187  }
188  $xref_count++;
189  }
190  }
191  }
192 
193  print "Added $xref_count RFAM xrefs and $direct_count direct xrefs\n" if($verbose);
194 
195  return 0; # successfull
196 
197 
198 }
199 
200 1;
Bio::EnsEMBL::DBSQL::DBAdaptor
Definition: DBAdaptor.pm:40
Bio::EnsEMBL::DBSQL::DBAdaptor::dbc
public Bio::EnsEMBL::DBSQL::DBConnection dbc()
XrefParser::BaseParser
Definition: BaseParser.pm:8
Bio::EnsEMBL::Registry
Definition: Registry.pm:113
Bio::EnsEMBL::DBSQL::DBConnection::prepare
public DBI prepare()
Bio::EnsEMBL::DBSQL::DBAdaptor::new
public Bio::EnsEMBL::DBSQL::DBAdaptor new()