ensembl-hive  2.8.1
MGI_Desc_Parser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 =head1 NAME
21 
22 XrefParser::MGI_Desc_Parser
23 
24 =head1 DESCRIPTION
25 
26 A parser class to parse the MGI (descriptions) source. Creates a 'MISC' xref for each MGI accession, with its description, and
27 also creates the synonyms extracted from the pipe-separated synonym_field
28 
29 -species = mus_musculus
30 -species_id = 10090
31 -data_uri = http://www.informatics.jax.org/downloads/reports/MRK_List2.rpt
32 -file_format = TSV
33 -columns = [accession chromosome position start end strand label status marker marker_type feature_type synonym_field]
34 
35 =head1 SYNOPSIS
36 
37  my $parser = XrefParser::MGI_Desc_Parser->new($db->dbh);
38  $parser->run({
39  source_id => 58,
40  species_id => 10090,
41  files => ["MRK_List2.rpt"],
42  });
43 
44 =cut
45 
46 package XrefParser::MGI_Desc_Parser;
47 
48 use strict;
49 use warnings;
50 use Carp;
51 use Text::CSV;
52 
53 use parent qw( XrefParser::BaseParser );
54 
55 my $EXPECTED_NUMBER_OF_COLUMNS = 12;
56 
57 
58 
59 =head2 run
60 
61 The run method does the actual parsing and creation of xrefs and synonyms.
62 Parser gets initialized as noted above and run is called from
63 Bio::EnsEMBL::Production::Pipeline::Xrefs::ParseSource
64 
65 =cut
66 
sub run {
  # Parse an MGI description report and store one xref per row, plus any
  # synonyms listed for that row.
  #
  # Arg [1]    : hashref with the keys
  #                source_id  - required, passed through to add_xref
  #                species_id - required, passed through to add_xref
  #                files      - required arrayref; only its first entry is read
  #                verbose    - optional, defaults to 0; prints progress lines
  #                dbi        - optional, defaults to $self->dbi
  # Return     : 0 on success
  # Exceptions : confesses on missing arguments, an empty file list, a file
  #              that cannot be opened, an unexpected header or a parse error

  my ( $self, $ref_arg ) = @_;
  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->dbi;

  if ( ( !defined $source_id ) or
       ( !defined $species_id ) or
       ( !defined $files ) ) {
    confess 'Need to pass source_id, species_id and files as pairs';
  }

  # An empty list would otherwise hand undef to get_filehandle and produce
  # an "uninitialized value" warning inside the error message below.
  my $file = $files->[0];
  if ( !defined $file ) {
    confess 'Need to pass at least one file name in files';
  }

  my $mgi_io = $self->get_filehandle($file);
  if ( !defined $mgi_io ) {
    confess "Could not open $file\n";
  }

  # Tab-separated input with quoting and escaping switched off, so quote
  # characters inside a field are taken literally.
  my $input_file = Text::CSV->new({
    sep_char    => "\t",
    quote_char  => undef,
    escape_char => undef,
    strict      => 1,
    binary      => 1
  }) or confess "Cannot use file $file: " . Text::CSV->error_diag();

  my $xref_count = 0;
  my $syn_count  = 0;

  # read and validate header
  if ( ! is_file_header_valid( $input_file->header( $mgi_io ) ) ) {
    confess "Malformed or unexpected header in MGI_Desc file '${file}'";
  }

  while ( my $data = $input_file->getline($mgi_io) ) {
    # Column positions follow the header checked above:
    # 0 = accession, 6 = marker symbol, 8 = marker name, 11 = synonyms.
    my $accession     = $data->[0];
    my $marker        = $data->[8];
    my $synonym_field = $data->[11];

    # Only the id of the current row is needed, so it is kept in a scalar
    # rather than accumulated for every accession in the file.
    my $xref_id = $self->add_xref({
      acc        => $accession,
      label      => $data->[6],
      desc       => $marker,
      source_id  => $source_id,
      species_id => $species_id,
      dbi        => $dbi,
      info_type  => 'MISC',
    });
    if ( $verbose && !$marker ) {
      print "$accession has no description\n";
    }
    $xref_count += 1;

    # Synonyms can only be attached when add_xref returned an id and the
    # synonym column is non-empty.
    if ( defined $xref_id && $synonym_field ) {
      foreach my $syn ( split qr{ [|] }msx, $synonym_field ) {
        $self->add_synonym( $xref_id, $syn, $dbi );
        $syn_count += 1;
      }
    }

  } ## end while ( my $data = $input_file...)

  # getline also returns false when a row fails to parse, so leaving the
  # loop before end-of-file means the parser stopped on an error.
  $mgi_io->eof
    || confess "Error parsing file $file: " . $input_file->error_diag();
  $mgi_io->close();

  if ($verbose) {
    print "$xref_count MGI Description Xrefs added\n";
    print "$syn_count synonyms added\n";
  }

  return 0; #successful
} ## end sub run
149 
150 
151 =head2 is_file_header_valid
152 
153  Arg [1..N] : list of column names provided by Text::CSV::header()
154  Example : if ( ! is_file_header_valid( $csv->header( $fh ) ) ) {
155  confess 'Bad header';
156  }
157  Description: Verifies if the header of a MGI_Desc file follows
158  expected syntax.
159  Return type: boolean
160  Exceptions : none
161  Caller : internal
162  Status : Stable
163 
164 =cut
165 
sub is_file_header_valid {
  # Compare the given column names, position by position, against the
  # header expected for an MGI_Desc file. Returns 1 on an exact match and
  # 0 otherwise.
  my ( @header ) = @_;

  # A wrong column count can never match, so skip the name comparison.
  return 0 if scalar @header != $EXPECTED_NUMBER_OF_COLUMNS;

  # Expected names in file order. They are lower-case, presumably because
  # Text::CSV's header() lower-cases column names by default - confirm
  # against the Text::CSV documentation before changing the case here.
  my @expected_names = (
    'mgi accession id',
    'chr',
    'cm position',
    'genome coordinate start',
    'genome coordinate end',
    'strand',
    'marker symbol',
    'status',
    'marker name',
    'marker type',
    'feature type',
    'marker synonyms (pipe-separated)',
  );

  # Names are matched as exact strings; the first difference rejects the header.
  for my $idx ( 0 .. $#expected_names ) {
    return 0 if $header[$idx] ne $expected_names[$idx];
  }

  # Every position matched.
  return 1;
}
202 
203 
204 1;
accession
public accession()
XrefParser::BaseParser
Definition: BaseParser.pm:8
run
public run()
XrefParser::MGI_Desc_Parser::run
public run()
XrefParser::MGI_Desc_Parser
Definition: MGI_Desc_Parser.pm:27