ensembl-hive  2.8.1
MGI_CCDS_Parser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefParser::MGI_CCDS_Parser;
21 
22 use strict;
23 use warnings;
24 use Carp;
25 use DBI;
26 
27 use base qw( XrefParser::BaseParser );
28 
29 
# Attach MGI xrefs as dependent xrefs of CCDS xrefs, driven by a tab-separated
# CCDS listing file.
#
# Takes a single hashref argument with the keys:
#   source_id  - source id the new dependent xrefs are stored under (required)
#   species_id - species id passed on to add_dependent_xref (required)
#   files      - array ref of file names; only the first one is read (required)
#   verbose    - when true, a one-line summary is printed to STDOUT
#   dbi        - database handle; $self->dbi is used when this is not defined
#
# Returns 0 on success, or 1 when get_filehandle() yields no handle for the
# input file. Croaks when source_id, species_id or files is missing.
sub run {
    my ($self, $ref_arg) = @_;

    my $source_id  = $ref_arg->{source_id};
    my $species_id = $ref_arg->{species_id};
    my $files      = $ref_arg->{files};
    my $verbose    = $ref_arg->{verbose};
    my $dbi        = $ref_arg->{dbi};
    $dbi = $self->dbi unless defined $dbi;

    if ( !defined $source_id or !defined $species_id or !defined $files ) {
        croak "Need to pass source_id, species_id and file as pairs";
    }
    $verbose |= 0;

    my $file = $files->[0];

    #
    # Collect the ids of every source whose name matches "MGI".
    #
    my @mgi_source_ids;
    my $sth = $dbi->prepare('select source_id, priority_description from source where name like "MGI"');
    $sth->execute();
    my ($mgi_source_id, $priority_desc);
    $sth->bind_columns(\$mgi_source_id, \$priority_desc);
    while ( $sth->fetch() ) {
        push @mgi_source_ids, $mgi_source_id;
    }
    $sth->finish;

    #
    # Index the MGI xrefs already in the database: label -> accession, and
    # accession -> label / version / description.
    #
    my (%accession, %label, %version, %description);

    # An empty id list would produce "in ()", which is not valid SQL, so the
    # lookup is skipped entirely when no MGI source was found.
    if (@mgi_source_ids) {
        my $sql = "select accession, label, version, description from xref where source_id in ("
                . join(", ", @mgi_source_ids) . ")";
        $sth = $dbi->prepare($sql);
        $sth->execute();
        my ($acc, $lab, $ver, $desc);
        $sth->bind_columns(\$acc, \$lab, \$ver, \$desc);
        while ( $sth->fetch() ) {
            # Only xrefs that carry a description are used.
            next unless defined $desc;
            $accession{$lab}   = $acc;
            $label{$acc}       = $lab;
            $version{$acc}     = $ver;
            $description{$acc} = $desc;
        }
        $sth->finish;
    }

    #
    # Get master xref ids via the ccds label.
    #
    my %ccds_label_to_xref_id;
    $sth = $dbi->prepare('select x.label, x.xref_id from xref x, source s where x.source_id = s.source_id and s.name ="CCDS"');
    $sth->execute();
    my ($ccds_label, $xref_id);
    $sth->bind_columns(\$ccds_label, \$xref_id);
    while ( $sth->fetch() ) {
        $ccds_label_to_xref_id{$ccds_label} = $xref_id;
    }
    $sth->finish;

    my $count        = 0;
    my $ccds_missing = 0;
    my $mgi_missing  = 0;

    my $mgi_io = $self->get_filehandle($file);
    if ( !defined $mgi_io ) {
        print STDERR "ERROR: Could not open $file\n";
        return 1;    # 1 is an error
    }

    #
    # Input is tab separated, with '#'-prefixed header lines, e.g.
    #
    ##chromosome g_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type
    #1 NC_000067.5 Xkr4 497097 CCDS14803.1 Public - 3206102 3661428 [3206102-3207048, 3411782-3411981, 3660632-3661428] Identical
    #1 NC_000067.5 Rp1h 19888 CCDS14804.1 Public - 4334680 4342905 [4334680-4340171, 4341990-4342161, 4342282-4342905] Identical
    #
    # Only the gene name (3rd column) and the CCDS id (5th column) are used.
    #
    # defined() is explicit because the implicit definedness test of
    # "while (my $x = <$fh>)" does not apply to a method call.
    while ( defined( my $line = $mgi_io->getline() ) ) {
        # Without the chomp a CCDS id in the last column keeps its newline
        # and can never match a label from the database.
        chomp $line;

        # Header and blank lines are not records and must not be counted.
        next if $line =~ /^#/ or $line !~ /\S/;

        my ($gene_name, $ccds) = ( split /\t/, $line )[ 2, 4 ];

        if ( !defined $ccds or !defined $ccds_label_to_xref_id{$ccds} ) {
            $ccds_missing++;
            next;
        }

        my $acc = defined $gene_name ? $accession{$gene_name} : undef;
        if ( !defined $acc or !defined $label{$acc} ) {
            $mgi_missing++;
            next;
        }

        $self->add_dependent_xref({ master_xref_id => $ccds_label_to_xref_id{$ccds},
                                    acc            => $acc,
                                    version        => $version{$acc},
                                    label          => $label{$acc},
                                    desc           => $description{$acc},
                                    source_id      => $source_id,
                                    dbi            => $dbi,
                                    species_id     => $species_id });
        $count++;
    }
    $mgi_io->close();

    print "$ccds_missing ccds not resolved, $mgi_missing mgi not found. Added $count MGI xrefs via CCDS\n" if($verbose);
    return 0;
}
148 
149 1;
150 
accession
public accession()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()