3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 Please email comments or questions to the
public Ensembl
21 developers list at <http:
23 Questions may also be sent to the Ensembl help desk at
32 A parser class to parse the HGNC source.
33 HGNC is the official naming source for Human.
47 Locus specific databases
50 A core database adaptor is required.
59 file =>
'hgnc_data.tsv',
67 package XrefParser::HGNCParser;
79 # HGNC sources to be processed
92 Exceptions :
throws on all processing errors
93 Caller : ParseSource in the xref pipeline
97 my ($self, $ref_arg) = @_;
99 my $source_id = $ref_arg->{source_id};
100 my $species_id = $ref_arg->{species_id};
101 my $file = $ref_arg->{file};
102 my $db = $ref_arg->{dba};
103 my $verbose = $ref_arg->{verbose}
104 my $dbi = $ref_arg->{dbi}
106 if ((!defined $source_id) or (!defined $species_id) or (!defined $file) ){
107 confess
"Need to pass source_id, species_id, file as pairs";
110 # parse the file string and set default user
114 # Prepare lookup lists
115 my (%swissprot) = %{$self->get_valid_codes(
'Uniprot/SWISSPROT',$species_id, $dbi)};
116 my (%refseq) = %{$self->get_valid_codes(
'refseq',$species_id, $dbi)};
117 my @list = (
'refseq_peptide',
'refseq_mRNA');
118 my (%entrezgene) = %{$self->get_valid_xrefs_for_dependencies(
'EntrezGene', $dbi, @list)};
121 my $self_source_name = $self->get_source_name_for_source_id($source_id, $dbi);
123 # get the source id of each HGNC sub-source listed in @SOURCES
124 foreach my $source_name (@SOURCES) {
125 $self->{source_ids}->{$source_name} = $self->get_source_id_for_source_name( $self_source_name, $source_name , $dbi );
127 $self->{source_ids}->{
'lrg'} = $self->get_source_id_for_source_name(
'LRG_HGNC_notransfer', undef, $dbi );
128 $self->{source_ids}->{
'genecards'} = $self->get_source_id_for_source_name(
'GeneCards', undef, $dbi);
134 # Get CCDS data from core db
137 $core_db = $db->dbc();
138 } elsif (defined $file_params->{host}) {
140 host => $file_params->{host},
141 port => $file_params->{port},
142 user => $file_params->{user},
143 dbname => $file_params->{dbname},
144 pass => $file_params->{pass}
147 confess
"No ensembl core database provided\n";
150 if (!defined $core_db) {
151 confess
"No ensembl core database!\n";
155 SELECT ta.value, t.stable_id
157 INNER JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
158 INNER JOIN attrib_type a ON ta.attrib_type_id = a.attrib_type_id
159 WHERE a.code =
'ccds_transcript';
163 my $sth = $core_db->prepare($sql);
164 $sth->execute() or croak( $core_db->errstr() );
165 while ( my ($ccds_id, $ens_id) = $sth->fetchrow_array() ) {
168 $ccds_to_ens{$ccds_id} = $ens_id;
172 # in memory HGNC file
175 # use wget link to get file
176 if (defined $file_params->{wget}) {
177 my $ua = LWP::UserAgent->new();
180 my $request = HTTP::Request->new(
181 GET => $file_params->{wget}
183 my $response = $ua->request($request);
185 if ( !$response->is_success() ) {
186 confess $response->status_line;
189 $mem_file = $response->decoded_content;
191 # else get file from disk
193 my $disk_fh = $self->get_filehandle($file);
194 if ( !defined $disk_fh ) {
195 confess
"Can't open HGNC file '$file'\n";
197 $mem_file =
do { local $/; <$disk_fh> };
200 my $input_file = Text::CSV->new({
205 }) or croak
"Cannot use file $file: ".Text::CSV->error_diag ();
207 # make sure it's utf8
208 $mem_file = Encode::encode(
"UTF-8", $mem_file);
209 # get rid of non-conventional " used in the Locus specific databases field
210 $mem_file =~ s/
"//xg;
212 open my $fh, '<', \$mem_file or confess "Can
't open HGNC in-memory file: $!\n";
214 $input_file->column_names( @{ $input_file->getline( $fh ) } );
217 # loop through each row
218 while ( my $data = $input_file->getline_hr( $fh ) ) {
220 my $acc = $data->{'HGNC ID
'};
221 my $symbol = $data->{'Approved symbol
'};
222 my $name = $data->{'Approved name
'};
223 my $previous_symbols = $data->{'Previous symbols
'};
224 my $synonyms = $data->{'Alias symbols
'};
228 # Direct CCDS to ENST mappings
229 my $ccds = $data->{'CCDS IDs
'};
232 if ( defined $ccds ) {
233 @ccds_list = split( /,\s/x, $ccds );
237 foreach my $ccds (@ccds_list) {
238 my $enst_id = $ccds_to_ens{$ccds};
240 if (!defined $enst_id) {
244 $self->add_to_direct_xrefs({
245 stable_id => $enst_id,
250 source_id => $self->{source_ids}->{'ccds
'},
252 species_id => $species_id
255 $self->add_synonyms_for_hgnc({
256 source_id => $self->{source_ids}->{'ccds
'},
258 species_id => $species_id,
260 dead => $previous_symbols,
263 $name_count{'ccds
'}++;
266 # Direct LRG to ENST mappings
267 my $lrg_id = $data->{'Locus specific databases
'};
269 if ( defined $lrg_id && $lrg_id =~ m/(LRG_\d+)\|/x){
271 $self->add_to_direct_xrefs({
272 stable_id => $lrg_id,
277 source_id => $self->{source_ids}->{'lrg
'},
279 species_id => $species_id
282 $self->add_synonyms_for_hgnc({
283 source_id => $self->{source_ids}->{'lrg
'},
285 species_id => $species_id,
287 dead => $previous_symbols,
290 $name_count{'lrg
'}++;
293 # Direct Ensembl mappings
294 my $ensg_id = $data->{'Ensembl gene ID
'};
295 if ( defined $ensg_id ){
298 $self->add_to_direct_xrefs({
299 stable_id => $ensg_id,
305 source_id => $self->{source_ids}->{'ensembl_manual
'},
306 species_id => $species_id
309 $self->add_synonyms_for_hgnc({
310 source_id => $self->{source_ids}->{'ensembl_manual
'},
312 species_id => $species_id,
313 dead => $previous_symbols,
317 $name_count{'ensembl_manual
'}++;
320 my $direct_id = $self->get_xref($acc, $self->{source_ids}->{'ensembl_manual
'}, $species_id, $dbi);
321 my ($hgnc_id) = $acc =~ /HGNC:(\d+)/;
322 $self->add_dependent_xref({
323 master_xref_id => $direct_id,
327 source_id => $self->{source_ids}->{'genecards
'},
329 species_id => $species_id
332 $self->add_synonyms_for_hgnc({
333 source_id => $self->{source_ids}->{'genecards
'},
335 species_id => $species_id,
337 dead => $previous_symbols,
340 $name_count{'genecards
'}++;
344 my $refseq_id = $data->{'RefSeq IDs
'};
346 if ( defined $refseq{$refseq_id} ){
348 foreach my $xref_id ( @{$refseq{$refseq_id}} ){
349 $self->add_dependent_xref({
350 master_xref_id => $xref_id,
354 source_id => $self->{source_ids}->{'refseq_manual
'},
356 species_id => $species_id
358 $name_count{'refseq_manual
'}++;
361 $self->add_synonyms_for_hgnc({
362 source_id => $self->{source_ids}->{'refseq_manual
'},
364 species_id => $species_id,
366 dead => $previous_symbols,
373 my $entrez_id = $data->{'NCBI Gene ID
'};
374 if ( defined $entrez_id ){
375 if ( defined $entrezgene{$entrez_id} ){
377 $self->add_dependent_xref({
378 master_xref_id => $entrezgene{$entrez_id},
382 source_id => $self->{source_ids}->{'entrezgene_manual
'},
384 species_id => $species_id
387 $self->add_synonyms_for_hgnc({
388 source_id => $self->{source_ids}->{'entrezgene_manual
'},
390 species_id => $species_id,
391 dead => $previous_symbols,
395 $name_count{'entrezgene_manual
'}++;
399 # Store to keep descriptions if not stored yet
405 source_id => $self->{source_ids}->{'desc_only
'},
406 species_id => $species_id,
411 $self->add_synonyms_for_hgnc({
412 source_id => $self->{source_ids}->{'desc_only
'},
414 species_id => $species_id,
416 dead => $previous_symbols,
427 print "HGNC xrefs loaded:\n";
428 foreach my $type (sort keys %name_count){
429 print "\t$type\t$name_count{$type}\n";
431 print "$mismatch HGNC ids could not be associated in xrefs\n";
433 return 0; # successful
=head2 add_synonyms_for_hgnc

  Arg [1]    : hashref with the keys
                 source_id  - source id the synonyms are stored under
                 name       - first argument handed to add_to_syn
                              (presumably the xref the synonyms belong to;
                              confirm against add_to_syn)
                 species_id - species id
                 dead       - string of previous symbols (optional)
                 alias      - string of alias symbols (optional)
                 dbi        - database handle, passed through to add_to_syn
  Description: Specialized method to add synonyms from HGNC and VGNC data.
               Each of the 'dead' and 'alias' strings, when defined, has its
               double quotes removed and is split on a comma followed by a
               whitespace character. Every resulting entry is decoded from
               UTF-8, upper-cased, passed through unidecode() and stored
               with add_to_syn. Previous symbols are stored before aliases.
  Return type: none
  Caller     : the HGNC row-processing code of this parser

=cut

sub add_synonyms_for_hgnc {
  my ($self, $ref_arg) = @_;

  my $source_id  = $ref_arg->{source_id};
  my $name       = $ref_arg->{name};
  my $species_id = $ref_arg->{species_id};
  my $dbi        = $ref_arg->{dbi};

  # Copy both strings into lexicals: the foreach below aliases its list
  # elements, and the clean-up must never touch the caller's hash.
  my $dead_string  = $ref_arg->{dead};
  my $alias_string = $ref_arg->{alias};

  # Previous ("dead") symbols first, then aliases; both are treated alike.
  foreach my $synonym_string ( $dead_string, $alias_string ) {
    next if !defined $synonym_string;

    # drop any double quotes left in the field
    $synonym_string =~ s/"//xg;

    foreach my $synonym ( split( /,\s/x, $synonym_string ) ) {
      # decode the UTF-8 bytes, upper-case, then run through unidecode()
      # (presumably Text::Unidecode's ASCII transliteration; imported
      # outside this block)
      my $normalised = unidecode( uc( Encode::decode( "UTF-8", $synonym ) ) );
      $self->add_to_syn( $name, $source_id, $normalised, $species_id, $dbi );
    }
  }

  return;
}
482 =head2 parse_file_string
483 Arg [1] : string : input file string
484 Description: Parses the input string $file into a hash.
485 The string $file follows the format of this example:
486 script:project=>ensembl,host=>ens-staging1,dbname=>homo_sapiens_core_70_37,ofhost=>ens-staging1,...
487 The leading 'word:' prefix is stripped; the hash is built from the comma-separated key=>value pairs that follow.
488 Return type: params hashref
492 sub parse_file_string {
493 my ($self, $file_string) = @_;
495 $file_string =~ s/\A\w+://x;
497 my @param_pairs = split( /,/x, $file_string );
501 # set provided values
502 foreach my $pair ( @param_pairs ) {
503 my ($key, $value) = split( /=>/x, $pair );
504 $params->{$key} = $value;