ensembl-hive  2.7.0
UCSCParser.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =head1 CONTACT
19 
20  Please email comments or questions to the public Ensembl
21  developers list at <http://lists.ensembl.org/mailman/listinfo/dev>.
22 
23  Questions may also be sent to the Ensembl help desk at
24  <http://www.ensembl.org/Help/Contact>.
25 
26 =head1 NAME
27 
28 XrefParser::UCSCParser
29 
30 =head1 DESCRIPTION
31 
32 A parser class to parse UCSC data for human and mouse.
33 
34 -data_uri = ftp://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/knownGene.txt.gz
35 -file_format = TSV
36 -columns = [
37  ensembl_id
38  chromosome
39  strand
40  tx_start
41  tx_end
42  cds_start
43  cds_end
44  nb_exons
45  exon_starts
46  exon_ends
47  uniprot_accession
48  ucsc_accession
49  ]
50 
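51 Only the columns read by run() are used: chromosome, strand, tx_start, tx_end, cds_start, cds_end, exon_starts, exon_ends and ucsc_accession; the remaining columns are ignored.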
52 
53 =head1 SYNOPSIS
54 
55  my $parser = XrefParser::UCSCParser->new($db->dbh);
56  $parser->run({
57  source_id => 1,
58  species_id => 9606,
59  files => ['UCSC_human/knownGene.txt.gz'],
60  });
61 
62 =cut
63 
64 package XrefParser::UCSCParser;
65 
66 use strict;
67 use warnings;
68 
69 use Carp;
70 use Text::CSV;
71 
72 use parent qw( XrefParser::CoordinateParser );
73 
74 
=head2 run

  Arg [1]    : HashRef of named arguments:
                 source_id  - forwarded to add_xref() (mandatory)
                 species_id - forwarded to add_xref() (mandatory)
                 files      - ArrayRef of file names; only the first
                              entry is read (mandatory, non-empty)
                 verbose    - when true, print a one-line summary
                              (defaults to 0)
                 dbi        - forwarded to add_xref()
                              (defaults to $self->{dbi})
  Description: Reads the first file in 'files' as tab-separated text,
               converts every row to Ensembl conventions (chromosome
               name without the 'chr' prefix, numeric strand, start
               coordinates incremented by one, equal CDS start/end
               stored as undef) and hands it to add_xref().
  Return type: Int; 0 once the whole file has been processed
  Exceptions : confesses if a mandatory argument is missing, if 'files'
               is empty, if the file cannot be opened, if the CSV
               parser cannot be created, or if parsing stops before
               the end of the file
  Caller     : internal

=cut

sub run {
  my ( $self, $ref_arg ) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $species_id = $ref_arg->{species_id};
  my $files      = $ref_arg->{files};
  my $verbose    = $ref_arg->{verbose} // 0;
  my $dbi        = $ref_arg->{dbi} // $self->{dbi};

  if ( !defined $source_id || !defined $species_id || !defined $files ) {
    confess 'Need to pass source_id, species_id and files as pairs';
  }

  # Only the first file of the list is processed. Reject an empty list
  # here rather than passing undef on to get_filehandle().
  my $file = $files->[0];
  if ( !defined $file ) {
    confess 'Need to pass at least one file name in files';
  }

  my $file_io = $self->get_filehandle($file);
  if ( !defined $file_io ) {
    confess "Can't open UCSC file $file\n";
  }

  my $input_file = Text::CSV->new({
    sep_char       => "\t",
    empty_is_undef => 1,
    strict         => 1,
  }) or confess "Cannot use file $file: " . Text::CSV->error_diag();

  # UCSC uses '+' and '-' for the strand, instead of -1, 0, or 1.
  # Anything that is not in this table is stored as 0.
  my %strand_for = ( q{+} => 1, q{-} => -1 );

  my $count = 0;

  while ( my $data = $input_file->getline( $file_io ) ) {
    # Columns 0 (ensembl_id), 7 (nb_exons) and 10 (uniprot_accession)
    # are not used.
    my (undef, $chromosome, $strand, $tx_start, $tx_end, $cds_start, $cds_end,
        undef, $exon_starts, $exon_ends, undef, $accession) = @{ $data };

    # UCSC uses slightly different chromosome names, at least for
    # human and mouse, so chop off the 'chr' in the beginning. We do
    # not yet translate the names of the special chromosomes, e.g.
    # "chr6_cox_hap1" (UCSC) into "c6_COX" (Ensembl).
    $chromosome =~ s{ \A chr }{}msx;

    $strand = $strand_for{$strand} // 0;

    # Non-coding transcripts have cds_start == cds_end. We would like
    # these to be stored as NULLs.
    if ( $cds_start == $cds_end ) {
      undef $cds_start;
      undef $cds_end;
    }

    # $exon_starts and $exon_ends usually (always?) have trailing commas,
    # remove them.
    $exon_starts =~ s{ , \z }{}msx;
    $exon_ends   =~ s{ , \z }{}msx;

    # UCSC uses the same kind of "inbetween" coordinates as e.g.
    # exonerate, so increment all start coordinates by one.
    $tx_start += 1;
    if ( defined $cds_start ) {
      $cds_start += 1;
    }

    # exon_starts is a comma-separated list of start coordinates, one
    # per exon, and each of them needs the same "+1" shift: split the
    # string, shift every element, and join it back together.
    $exon_starts =
      join q{,}, map { $_ + 1 } split qr{ , }msx, $exon_starts;

    $self->add_xref( $source_id, $species_id, {
      accession  => $accession,
      chromosome => $chromosome,
      strand     => $strand,
      txStart    => $tx_start,
      txEnd      => $tx_end,
      cdsStart   => $cds_start,
      cdsEnd     => $cds_end,
      exonStarts => $exon_starts,
      exonEnds   => $exon_ends,
      dbi        => $dbi,
    });

    $count += 1;
  }

  # getline() returns false both at end of file and on a parse error;
  # eof() tells the two apart.
  $input_file->eof
    or confess "Error parsing file $file: " . $input_file->error_diag();
  $file_io->close();

  if ($verbose) {
    print "Loaded a total of $count UCSC xrefs\n";
  }

  return 0;
}
182 
183 
184 1;
XrefParser::UCSCParser
Definition: UCSCParser.pm:39
map
public map()
accession
public accession()
XrefParser::CoordinateParser
Definition: CoordinateParser.pm:4
run
public run()
XrefParser::UCSCParser::run
public N run()