3 See the NOTICE file distributed with
this work
for additional information
4 regarding copyright ownership.
6 Licensed under the Apache License, Version 2.0 (the
"License");
7 you may not use
this file except in compliance with the License.
8 You may obtain a copy of the License at
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an
"AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License
for the specific language governing permissions and
16 limitations under the License.
20 package XrefMapper::DisplayXrefs;
# File-level lookup tables shared by the subs in this package.
# gene_id => array reference of transcript_ids (filled by build_genes_to_transcripts)
34 my %genes_to_transcripts;
# translation_id => transcript_id (filled by load_translation_to_transcript)
35 my %translation_to_transcript;
# transcript_id => translation_id (filled by load_translation_to_transcript)
36 my %transcript_to_translation;
# transcript_id => seq_region_end minus seq_region_start (filled by build_genes_to_transcripts)
37 my %transcript_length;
41 # For the full-mode method, ignore should be SQL that returns the object_xref_ids to be ignored.
42 # For the update methods, ignore should return a regexp, keyed by source name.
# Returns the sources used for gene descriptions as an array reference of
# two elements: the precedence list and the ignore hash, both taken straight
# from gene_display_xref_sources().
sub gene_description_sources {
  my ($self) = @_;

  # Only the first two elements of the display-xref result are passed on.
  return [ @{ $self->gene_display_xref_sources() }[0, 1] ];
}
# Returns the list of regular expression strings used to filter gene
# descriptions. set_gene_descriptions passes this list to filter_by_regexp
# together with each candidate description.
# NOTE(review): several entries are exact or near duplicates of their
# neighbours (for example the two identical numeric-in-parentheses patterns);
# presumably harmless - confirm before pruning.
53 sub gene_description_filter_regexps {
55 return (
'[0-9A-Z]{10}RIK PROTEIN[ \.]',
56 '\(?[0-9A-Z]{10}RIK PROTEIN\)?[ \.]',
57 '^BA\S+\s+\(NOVEL PROTEIN\)\.?',
59 'CDNA SEQUENCE\s?,? [A-Z]+\d+[ \.;]',
60 '^CGI\-\d+ PROTEIN\.?\;?',
61 '^CHROMOSOME\s+\d+\s+OPEN\s+READING\s+FRAME\s+\d+\.?.*',
62 'CLONE MGC:\d+[ \.;]',
63 '^\(CLONE REM\d+\) ORF \(FRAGMENT\)\.*',
65 '^DJ\S+\s+\(NOVEL PROTEIN\)\.?',
66 '^DKFZP[A-Z0-9]+\s+PROTEIN[\.;]?.*',
68 'EST [A-Z]+\d+[ \.;]',
69 'EXPRESSED SEQUENCE [A-Z]+\d+[ \.;]',
71 '^FLJ\d+\s+PROTEIN.*',
73 '^HSPC\d+\s+PROTEIN\.?.*',
74 'HYPOTHETICAL PROTEIN,',
75 'HYPOTHETICAL PROTEIN \S+[\.;]',
76 '^\(*HYPOTHETICAL\s+.*',
77 '\(*HYPOTHETICAL\s+.*',
78 '^KIAA\d+\s+GENE\s+PRODUCT\.?.*',
79 '^KIAA\d+\s+PROTEIN\.?.*',
80 '^LOC\d+\s*(PROTEIN)?\.?',
84 '^ORF\s*\d+\s+PROTEIN\.*',
85 '^PRED\d+\s+PROTEIN.*',
87 '^PRO\d+\s+PROTEIN\.?.*',
88 '^PROTEIN C\d+ORF\d+\.*',
89 'PROTEIN KIAA\d+[ \.].*',
90 'PROTEIN \S+ HOMOLOG\.?',
91 '^Putative uncharacterized protein.*',
93 'RIKEN CDNA [0-9A-Z]{10}[ \.;]',
94 'RIKEN CDNA [0-9A-Z]{10}[ \.]',
95 '.*RIKEN FULL-LENGTH ENRICHED LIBRARY.*',
96 '.*RIKEN FULL-LENGTH ENRICHED LIBRARY.*PRODUCT:',
97 '^\s*\(\d*\)\s*[ \.]$',
98 '^\s*\(\d*\)\s*[ \.]$',
99 '^\s*\(?FRAGMENT\)?\.?\s*$',
100 '^\s*\(FRAGMENT\)\.?\s*$',
101 '\s*\(?GENE\)?\.?;?',
102 '^\s*\(?GENE\)?\.?;?\s*$',
103 '^\s*\(?GENE\)?\.?\s*$',
104 'SIMILAR TO GENBANK ACCESSION NUMBER\s+\S+',
105 '^SIMILAR TO GENE.*',
106 '^SIMILAR TO HYPOTHETICAL.*',
107 '^SIMILAR TO (KIAA|LOC).*',
108 'SIMILAR TO (KIAA|LOC|RIKEN).*',
109 '^SIMILAR TO PUTATIVE[ \.]',
110 'SIMILAR TO PUTATIVE[ \.]',
113 '\s*\(?PRECURSOR\)?\.?;?',
114 '^\s*\(?PROTEIN\)?\.?\s*$',
119 '^WUGSC:.*\s+PROTEIN\.?.*');
# Transcript display xrefs use exactly the same source precedence and
# ignore rules as genes, so this simply delegates to
# gene_display_xref_sources() and hands back its result unchanged.
sub transcript_display_xref_sources {
  my ($self) = @_;

  return $self->gene_display_xref_sources();
}
# Returns an array reference holding two references: the list of source
# names (@list) and a hash (%ignore) of SQL statements keyed by a short label.
# Each SQL statement selects object_xref_ids; set_display_xrefs and
# set_gene_descriptions hand the hash to _apply_ignore.
# NOTE(review): the declaration and contents of @list, the declaration of
# %ignore and the heredoc terminators are not visible in this listing.
131 sub gene_display_xref_sources {
147 #don't use EntrezGene labels dependent on predicted RefSeqs
149 $ignore{
'EntrezGene'} =<<IEG;
150 SELECT DISTINCT ox.object_xref_id
151 FROM object_xref ox, dependent_xref dx,
152 xref xmas, xref xdep,
153 source smas, source sdep
154 WHERE ox.xref_id = dx.dependent_xref_id AND
155 dx.dependent_xref_id = xdep.xref_id AND
156 dx.master_xref_id = xmas.xref_id AND
157 xmas.source_id = smas.source_id AND
158 xdep.source_id = sdep.source_id AND
159 smas.name like
"Refseq%predicted" AND
160 sdep.name like
"EntrezGene" AND
161 ox.ox_status =
"DUMP_OUT" AND
162 ox.master_xref_id = dx.master_xref_id
165 #don't use labels starting with LOC
167 $ignore{
'LOC_prefix'} =<<LOCP;
168 SELECT object_xref_id
169 FROM object_xref JOIN xref USING(xref_id) JOIN source USING(source_id)
170 WHERE ox_status =
'DUMP_OUT' AND label REGEXP
'^LOC[[:digit:]]+'
173 return [\@list,\%ignore];
# Empties the display_xref_priority and gene_desc_priority tables of the xref
# database so that source priorities can be rebuilt from scratch.
# Takes no arguments besides the invocant and returns nothing.
sub remove_source_priorities {
  my $self = shift;

  my $xref_dbi = $self->xref->dbc;

  # Each DELETE is executed and its handle released before the next one is
  # prepared, so no statement is left prepared but unused.
  foreach my $table (qw(display_xref_priority gene_desc_priority)) {
    my $sth = $xref_dbi->prepare("DELETE from $table");
    $sth->execute();
    $sth->finish;
  }

  return;
}
# Constructor body (the enclosing sub line is not visible in this listing).
# Takes the class name and a mapper object, copies the core, xref and verbose
# values of the mapper onto $self and stores the mapper itself.
# NOTE(review): creation and blessing of $self, and the return statement,
# are not visible here - confirm against the full file.
193 my($class, $mapper) = @_;
197 $self->core($mapper->core);
198 $self->xref($mapper->xref);
199 $self->mapper($mapper);
200 $self->verbose($mapper->verbose);
# Accessor for the stored mapper object, kept in $self->{_mapper}
# (the enclosing sub line is not visible in this listing).
# Returns the current value of $self->{_mapper}.
# NOTE(review): the parenthesised assignment below looks like the tail of a
# guard on $arg that is not visible here; as shown it would overwrite the
# stored mapper on every call - confirm against the full file.
206 my ($self, $arg) = @_;
209 ($self->{_mapper} = $arg );
210 return $self->{_mapper};
215 sub genes_and_transcripts_attributes_set{
216 # Sets display xrefs, transcript names and gene descriptions, using the
217 # mapper implementation of each step when the mapper provides one, records
218 # progress in the process_status table and calls build_meta_timestamp.
# NOTE(review): the else keywords and closing braces of the branches below
# are not visible in this listing; each call on $self is presumably the
# fallback used when the mapper lacks the method - confirm.
# NOTE(review): $status is not used anywhere in the visible code.
222 my $status = $self->mapper->xref_latest_status();
# Step 1: display xrefs.
224 if($self->mapper->can(
"set_display_xrefs")){
225 $self->mapper->set_display_xrefs();
228 $self->set_display_xrefs();
# Step 2: transcript names derived from the gene names.
230 if ($self->mapper->can(
"transcript_names_from_gene")) {
231 $self->mapper->transcript_names_from_gene();
233 $self->transcript_names_from_gene();
# Record completion of the display xref stage in the xref database.
235 my $sth_stat = $self->xref->dbc->prepare(
"insert into process_status (status, date) values('display_xref_done',now())");
236 $sth_stat->execute();
# Step 3: gene descriptions; the local implementation is called with a true
# argument, the mapper implementation with none.
238 if($self->mapper->can(
"set_gene_descriptions")){
239 $self->mapper->set_gene_descriptions();
242 $self->set_gene_descriptions(1);
245 $self->build_meta_timestamp;
# Record completion of the gene description stage in the xref database.
247 $sth_stat = $self->xref->dbc->prepare(
"insert into process_status (status, date) values('gene_description_done',now())");
248 $sth_stat->execute();
# Convenience wrapper: runs set_gene_descriptions() with its single argument
# set to 1, which that sub reads as "only those not already set".
sub set_gene_descriptions_from_display_xref {
  my ($self) = @_;

  my $only_those_not_set = 1;
  return $self->set_gene_descriptions($only_those_not_set);
}
# Copies the display xrefs already stored in the stable-id tables of the xref
# database into the core database and sets gene descriptions from them.
#
# Steps:
#   1. Clear gene and transcript display_xref_id, and any gene description
#      carrying a [Source:...] tag (manually added descriptions are kept).
#   2. Map xref source_ids to the display names of the core external_db table.
#   3. For every gene with a display xref, set display_xref_id (shifted by the
#      xref_offset meta value) and, when the xref has a description, store it
#      with a [Source:...;Acc:...] suffix and flag desc_set in the xref db.
#   4. Set the transcript display_xref_id values the same way.
#   5. Delete external synonyms whose xref is not the display xref of a gene.
#
# Takes no arguments besides the invocant. Progress messages are printed only
# when verbose is set.
sub set_display_xrefs_from_stable_table{
  my $self = shift;

  print "Setting Transcript and Gene display_xrefs from xref database into core and setting the desc\n" if ($self->verbose);

  my $xref_offset = $self->get_meta_value("xref_offset");
  # An undefined offset behaves as 0 in the additions below; make that
  # explicit so no uninitialized-value warnings are raised.
  $xref_offset = 0 if (!defined($xref_offset));

  my $core_dbi = $self->core->dbc;
  my $xref_dbi = $self->xref->dbc;

  print "Using xref_off set of $xref_offset\n" if($self->verbose);

  # Reset gene display xrefs, then transcript display xrefs, then the
  # descriptions assigned through the xref pipeline. The latter are
  # recognisable by their Source tag, so manually added descriptions survive.
  my @reset_statements = (
    "UPDATE gene SET display_xref_id = null",
    "UPDATE transcript SET display_xref_id = null",
    "UPDATE gene SET description = null WHERE description like '%[Source:%]%'",
  );
  foreach my $reset_sql (@reset_statements) {
    my $reset_sth = $core_dbi->prepare($reset_sql);
    $reset_sth->execute();
    $reset_sth->finish;
  }

  # external_db name => display name, read from the core database.
  my %name_to_external_name;
  my $sql = "select external_db_id, db_name, db_display_name from external_db";
  my $sth = $core_dbi->prepare($sql);
  $sth->execute();
  my ($id, $name, $display_name);
  $sth->bind_columns(\$id, \$name, \$display_name);
  while($sth->fetch()){
    $name_to_external_name{$name} = $display_name;
  }
  $sth->finish;

  # xref source_id => external display name, only for sources that have xrefs
  # and whose name is known to the core external_db table.
  my %source_id_to_external_name;
  $sql = 'select s.source_id, s.name from source s, xref x where x.source_id = s.source_id group by s.source_id'; # only get those of interest
  $sth = $xref_dbi->prepare($sql);
  $sth->execute();
  $sth->bind_columns(\$id, \$name);
  while($sth->fetch()){
    if(defined($name_to_external_name{$name})){
      $source_id_to_external_name{$id} = $name_to_external_name{$name};
    }
  }
  $sth->finish;

  my $update_gene_sth      = $core_dbi->prepare("UPDATE gene g SET g.display_xref_id= ? WHERE g.gene_id=?");
  my $update_gene_desc_sth = $core_dbi->prepare("UPDATE gene g SET g.description= ? WHERE g.gene_id=?");
  my $update_tran_sth      = $core_dbi->prepare("UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");

  my $get_gene_display_xref = $xref_dbi->prepare("SELECT gsi.internal_id, gsi.display_xref_id, x.description ,x.source_id, x.accession
                                                  FROM gene_stable_id gsi, xref x
                                                  WHERE gsi.display_xref_id = x.xref_id");

  my $get_tran_display_xref = $xref_dbi->prepare("SELECT gsi.internal_id, gsi.display_xref_id from transcript_stable_id gsi");

  # Clear the desc_set flags before they are set again for the genes that
  # receive a description below.
  my $reset_desc_sth = $xref_dbi->prepare("UPDATE gene_stable_id gsi SET gsi.desc_set=0");
  $reset_desc_sth->execute();
  $reset_desc_sth->finish;

  my $set_desc_done_sth = $xref_dbi->prepare("UPDATE gene_stable_id gsi SET gsi.desc_set=1 WHERE gsi.internal_id=?");

  # Genes: display xref plus, where the xref carries one, the description.
  $get_gene_display_xref->execute();
  my ($gene_id, $xref_id, $desc, $source_id, $label);
  $get_gene_display_xref->bind_columns(\$gene_id, \$xref_id, \$desc, \$source_id, \$label);
  my $gene_count = 0;
  while($get_gene_display_xref->fetch()){

    $update_gene_sth->execute($xref_id+$xref_offset, $gene_id);

    if (defined($desc) and $desc ne "") {
      # A source without a matching external_db entry yields an empty name,
      # exactly as before, but without concatenating an undefined value.
      my $source_name = $source_id_to_external_name{$source_id};
      $source_name = "" if (!defined($source_name));
      $desc .= " [Source:".$source_name.";Acc:".$label."]";
      $update_gene_desc_sth->execute($desc,$gene_id);
      $set_desc_done_sth->execute($gene_id);
      $gene_count++;
    }
  }
  $get_gene_display_xref->finish;
  $set_desc_done_sth->finish;
  $update_gene_desc_sth->finish;
  $update_gene_sth->finish;

  print "$gene_count gene descriptions added\n" if($self->verbose);

  # Transcripts: display xref only.
  $get_tran_display_xref->execute();
  my $tran_id;
  $get_tran_display_xref->bind_columns(\$tran_id, \$xref_id);

  while($get_tran_display_xref->fetch()){
    next if (!defined($xref_id));

    if(!defined($tran_id)){
      # Report the problem before touching the database, with the variable
      # names spelled out instead of being interpolated. An UPDATE keyed on
      # an undefined transcript id cannot match a row, so it is skipped.
      print "PROB: tran_id = undef\nxref_id = $xref_id\nxref_offset = $xref_offset\n";
      next;
    }

    $update_tran_sth->execute($xref_id+$xref_offset, $tran_id);
  }
  $get_tran_display_xref->finish;
  $update_tran_sth->finish;

  # Clean up synonyms linked to xrefs which are not the display xref.
  # Synonyms are only used as alternative gene names, so should be synonyms
  # of the gene symbol chosen.
  my $syn_clean_sth = $core_dbi->prepare("DELETE es FROM external_synonym es, xref x LEFT JOIN gene g ON g.display_xref_id = x.xref_id WHERE es.xref_id = x.xref_id AND isnull(g.display_xref_id)");
  $syn_clean_sth->execute();
  $syn_clean_sth->finish();
}
# Fills the file-level %translation_to_transcript and
# %transcript_to_translation lookups from the core translation table.
# The reverse mapping is only stored for a true translation_id.
sub load_translation_to_transcript{
  my ($self) = @_;

  my $query = "SELECT translation_id, transcript_id FROM translation";
  my $translation_sth = $self->core->dbc->prepare($query);
  $translation_sth->execute();

  my ($translation_id, $transcript_id);
  $translation_sth->bind_columns(\$translation_id, \$transcript_id);

  while ($translation_sth->fetch()) {
    $translation_to_transcript{$translation_id} = $transcript_id;
    if ($translation_id) {
      $transcript_to_translation{$transcript_id} = $translation_id;
    }
  }
}
# Fills two file-level lookups from the core transcript table:
#   %genes_to_transcripts - gene_id to the list of its transcript_ids
#   %transcript_length    - transcript_id to seq_region_end minus
#                           seq_region_start
sub build_genes_to_transcripts {
  my ($self) = @_;

  my $query = "SELECT gene_id, transcript_id, seq_region_start, seq_region_end FROM transcript";
  my $transcript_sth = $self->core->dbc->prepare($query);
  $transcript_sth->execute();

  my ($gene_id, $transcript_id, $region_start, $region_end);
  $transcript_sth->bind_columns(\$gene_id, \$transcript_id, \$region_start, \$region_end);

  # Note %genes_to_transcripts is global
  while ($transcript_sth->fetch()) {
    my $span = $region_end - $region_start;
    $transcript_length{$transcript_id} = $span;
    push @{ $genes_to_transcripts{$gene_id} }, $transcript_id;
  }
}
# Resets the xref.timestamp entry of the core meta table to the current
# database time: any existing entry is deleted and a fresh one is inserted.
# Both statements run directly against the core database; no file is written.
# Takes no arguments besides the invocant and returns nothing.
sub build_meta_timestamp{
  my $self = shift;

  my $core_dbi = $self->core->dbc;

  my $sth = $core_dbi->prepare("DELETE FROM meta WHERE meta_key='xref.timestamp'");
  $sth->execute();
  $sth->finish;

  $sth = $core_dbi->prepare("INSERT INTO meta (meta_key,meta_value) VALUES ('xref.timestamp', NOW())");
  $sth->execute();
  $sth->finish;

  return;
}
# Chooses a display xref for every gene and transcript and writes it to the
# core database.
# For each object type (gene, transcript) the visible code:
#   - obtains the source precedence list and the ignore SQL, from the mapper
#     when it implements the matching *_display_xref_sources method;
#   - stores the precedence in display_xref_priority in the xref database;
#   - calls _apply_ignore to mark unwanted object_xrefs;
#   - runs one ordered query and assigns the first xref seen per object,
#     adding the xref_offset meta value to the xref id.
# Finally the temporarily ignored object_xrefs are set back to DUMP_OUT and
# synonyms not attached to a gene display xref are deleted from core.
# NOTE(review): declarations of several variables ($precedence, $ignore, $i,
# $source_id, $source_name, $object_id, %object_seen), the else branches,
# closing braces and parts of the SQL heredoc are not visible in this listing.
437 sub set_display_xrefs{
441 print
"Building Transcript and Gene display_xrefs\n" if ($self->verbose);
443 my $xref_offset = $self->get_meta_value(
"xref_offset");
444 my $core_dbi = $self->core->dbc();
445 my $xref_dbi = $self->xref->dbc();
447 print
"Using xref_off set of $xref_offset\n" if($self->verbose);
# Clear transcript display xrefs, leaving LRG_gene biotype rows untouched.
449 my $reset_sth = $core_dbi->prepare(
"UPDATE transcript SET display_xref_id = null WHERE biotype NOT IN ('LRG_gene')");
450 $reset_sth->execute();
# The gene update only fills rows whose display_xref_id is still NULL.
453 my $update_gene_sth = $core_dbi->prepare(
"UPDATE gene g SET g.display_xref_id= ? WHERE g.gene_id=? and g.display_xref_id IS NULL");
454 my $update_tran_sth = $core_dbi->prepare(
"UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");
# Statements used to record and list the source precedence per object type.
458 my $ins_p_sth = $xref_dbi->prepare(
"INSERT ignore into display_xref_priority (ensembl_object_type,source_id, priority) values(?, ?, ?)");
459 my $get_source_id_sth = $xref_dbi->prepare(
"select source_id from source where name like ? order by priority");
460 my $list_sources_sth = $xref_dbi->prepare(
"select distinct name from display_xref_priority d join source using(source_id) where ensembl_object_type = ? order by d.priority");
# Keys are the lower-case type names used in method names and comparisons;
# values are the capitalised forms stored in display_xref_priority.
463 my %object_types = (
'gene' =>
'Gene',
'transcript' =>
'Transcript');
465 foreach my $object_type (keys %object_types) {
469 my $method = $object_type .
'_display_xref_sources';
470 if( $self->mapper->can($method) ){
471 ($precedence, $ignore) = @{$self->mapper->$method()};
474 ($precedence, $ignore) = @{$self->$method()};
477 # The lower the priority number the better then
479 foreach my $name (@$precedence){
481 $get_source_id_sth->execute($name);
483 $get_source_id_sth->bind_columns(\$source_id);
484 while($get_source_id_sth->fetch){
485 $ins_p_sth->execute($object_types{$object_type},$source_id, $i);
489 $get_source_id_sth->finish;
492 if ($self->verbose) {
493 print
"Precedence for $object_type display xrefs (1- best name)\n";
494 $list_sources_sth->execute($object_types{$object_type});
496 $list_sources_sth->bind_columns(\$source_name);
497 while ($list_sources_sth->fetch() ) {
499 print
"\t$i\t$source_name\n";
504 $self->_apply_ignore($ignore, $xref_dbi);
507 #look at sources of display xrefs which are relevant for this object type
508 #(listed in gene_display_xref_sources() or transcript_display_xref_sources() )
509 #but get xrefs for all levels Gene, its Transcripts and Translations
510 #######################################################################
511 my $display_xref_sql =(<<DXS);
512 select CASE ox.ensembl_object_type
513 WHEN
'Gene' THEN gtt_gene.gene_id
514 WHEN
'Transcript' THEN gtt_transcript.gene_id
515 WHEN
'Translation' THEN gtt_translation.gene_id
517 CASE ox.ensembl_object_type
518 WHEN
'Gene' THEN gtt_gene.transcript_id
519 WHEN
'Transcript' THEN gtt_transcript.transcript_id
520 WHEN
'Translation' THEN gtt_translation.transcript_id
521 END AS d_transcript_id,
522 p.priority as priority,
524 from ( display_xref_priority p
527 join ( object_xref ox
528 join ( identity_xref ix
529 )
using (object_xref_id)
534 left join gene_transcript_translation gtt_gene
535 on (gtt_gene.gene_id = ox.ensembl_id)
536 left join gene_transcript_translation gtt_transcript
537 on (gtt_transcript.transcript_id = ox.ensembl_id)
538 left join gene_transcript_translation gtt_translation
539 on (gtt_translation.translation_id = ox.ensembl_id)
540 where ox.ox_status = 'DUMP_OUT'
541 and p.ensembl_object_type = ?
542 order by d_gene_id, ox.ensembl_object_type,
543 p.priority, (ix.target_identity + ix.query_identity) DESC, unused_priority DESC, x.
accession;
548 ########################################################################
552 my $display_xref_sth = $xref_dbi->prepare($display_xref_sql);
554 my $display_xref_count = 0;
# NOTE(review): the query is executed with the lower-case $object_type while
# display_xref_priority rows are inserted with the capitalised form from
# %object_types; presumably the comparison is case-insensitive - confirm.
555 $display_xref_sth->execute($object_type);
556 my ($gene_id, $transcript_id, $priority, $xref_id);
557 $display_xref_sth->bind_columns(\$gene_id, \$transcript_id, \$priority, \$xref_id);
558 while($display_xref_sth->fetch()){
560 if ($object_type eq
'gene') {
561 $object_id = $gene_id;
562 } elsif ($object_type eq
'transcript') {
563 $object_id = $transcript_id;
# Only the first row seen for an object is used; the query ordering decides
# which row that is.
566 if (!exists($object_seen{$object_id}) ) {
567 if ($object_type eq
'gene') {
568 $update_gene_sth->execute($xref_id+$xref_offset, $object_id);
569 } elsif ($object_type eq
'transcript') {
570 $update_tran_sth->execute($xref_id+$xref_offset, $object_id);
572 $display_xref_count++;
573 $object_seen{$object_id} = 1;
577 $display_xref_sth->finish;
578 $update_gene_sth->finish;
579 $update_tran_sth->finish;
581 print
"Updated $display_xref_count $object_type display_xrefs\n" if($self->verbose);
586 # reset the status to DUMP_OUT for object_xrefs that were ignored for the display_xref;
589 my $reset_status_sth = $xref_dbi->prepare(
'UPDATE object_xref SET ox_status = "DUMP_OUT" where ox_status = "NO_DISPLAY"');
590 $reset_status_sth->execute();
591 $reset_status_sth->finish;
594 # Clean up synonyms linked to xrefs which are not the display xref
595 # Synonyms are only used as alternative gene names, so should be synonyms of the gene symbol chosen
598 my $syn_clean_sth = $core_dbi->prepare(
"DELETE es FROM external_synonym es, xref x LEFT JOIN gene g ON g.display_xref_id = x.xref_id WHERE es.xref_id = x.xref_id AND isnull(g.display_xref_id)");
599 $syn_clean_sth->execute();
600 $syn_clean_sth->finish();
# Marks object_xrefs that must not be used by setting their ox_status to
# NO_DISPLAY (the enclosing sub line is not visible in this listing).
# Arguments: the invocant, a hash reference whose values are SQL statements
# that return object_xref_ids, and the xref database handle.
# First every DUMP_OUT object_xref whose xref label is purely numeric is
# marked; then each ignore statement is run and every id it returns is marked.
606 my ($self, $ignore, $xref_dbi) = @_;
608 # Set status to 'NO_DISPLAY' for object_xrefs with a display_label that is just numeric;
609 my $update_ignore_sth = $xref_dbi->prepare(
'UPDATE object_xref ox, source s, xref x SET ox_status = "NO_DISPLAY" where ox_status like "DUMP_OUT" and s.source_id = x.source_id and x.label REGEXP "^[0-9]+$" and ox.xref_id = x.xref_id');
610 $update_ignore_sth->execute();
611 $update_ignore_sth->finish;
# The same variable is reused for the per-id update run inside the loop.
613 $update_ignore_sth = $xref_dbi->prepare(
'UPDATE object_xref SET ox_status = "NO_DISPLAY" where object_xref_id = ?');
# Only the hash values (the SQL text) are used here; the keys are ignored.
615 foreach my $ignore_sql (values %$ignore){
616 print
"IGNORE SQL: $ignore_sql\n" if($self->verbose);
617 my $ignore_sth = $xref_dbi->prepare($ignore_sql);
618 $ignore_sth->execute();
619 my ($object_xref_id);
620 $ignore_sth->bind_columns(\$object_xref_id);
621 while($ignore_sth->fetch()){
622 $update_ignore_sth->execute($object_xref_id);
626 $update_ignore_sth->finish;
# Gives every transcript a display xref derived from the display xref of its
# gene. All statements run against the core database.
# For each gene with a display xref, its transcripts are read ordered by
# seq_region_start and seq_region_end, and each gets an xref whose label is
# the gene label, a hyphen and the value of $ext. An existing MISC xref with
# that label is reused; otherwise a new xref is inserted with info_text
# naming the gene accession. An object_xref row links transcript and xref.
# NOTE(review): the initial value and increment of $ext, the increments of
# $xref_id and $ox_id, the fetch on $ox_id_sth and the else keyword between
# the reuse and insert branches are not visible in this listing.
630 sub transcript_names_from_gene {
632 my $core_dbi = $self->core->dbc;
633 my $xref_dbi = $self->xref->dbc;
635 print
"Assigning transcript names from gene names\n" if ($self->verbose);
# Clear transcript display xrefs, leaving LRG_gene biotype rows untouched.
637 my $reset_sth = $core_dbi->prepare(
"UPDATE transcript SET display_xref_id = null WHERE biotype NOT IN ('LRG_gene')");
638 $reset_sth->execute();
# Current maximum ids, used as the starting point for newly inserted rows.
641 my $xref_id_sth = $core_dbi->prepare(
"SELECT max(xref_id) FROM xref");
642 my $ox_id_sth = $core_dbi->prepare(
"SELECT max(object_xref_id) FROM object_xref");
# Removes transcript xrefs whose label ends in a hyphen followed by a
# three-digit number starting with 2.
643 my $del_xref_sth = $core_dbi->prepare(
"DELETE x FROM xref x, object_xref ox WHERE x.xref_id = ox.xref_id AND ensembl_object_type = 'Transcript' AND display_label REGEXP '-2[0-9]{2}\$'");
644 my $reuse_xref_sth = $core_dbi->prepare(
"SELECT xref_id FROM xref x WHERE external_db_id = ? AND display_label = ? AND info_type = 'MISC'");
# Removes object_xref rows left without an xref.
645 my $del_ox_sth = $core_dbi->prepare(
"DELETE ox FROM object_xref ox LEFT JOIN xref x ON x.xref_id = ox.xref_id WHERE isnull(x.xref_id)");
646 my $ins_xref_sth = $core_dbi->prepare(
"INSERT IGNORE into xref (xref_id, external_db_id, dbprimary_acc, display_label, version, description, info_type, info_text) values(?, ?, ?, ?, 0, ?, 'MISC', ?)");
647 my $ins_ox_sth = $core_dbi->prepare(
"INSERT into object_xref (object_xref_id, ensembl_id, ensembl_object_type, xref_id) values(?, ?, 'Transcript', ?)");
648 my $update_tran_sth = $core_dbi->prepare(
"UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");
650 my $get_genes = $core_dbi->prepare(
"SELECT g.gene_id, e.db_name, x.dbprimary_acc, x.display_label, x.description FROM gene g, xref x, external_db e where g.display_xref_id = x.xref_id and e.external_db_id = x.external_db_id");
651 my $get_transcripts = $core_dbi->prepare(
"SELECT transcript_id FROM transcript WHERE gene_id = ? ORDER BY seq_region_start, seq_region_end");
652 my $get_source_id = $core_dbi->prepare(
"SELECT external_db_id FROM external_db WHERE db_name like ?");
654 $get_genes->execute();
655 my ($gene_id, $external_db, $external_db_id, $acc, $label, $description, $transcript_id, $xref_id, $ox_id, $ext, $reuse_xref_id, $info_text);
656 $get_genes->bind_columns(\$gene_id, \$external_db, \$acc, \$label, \$description);
657 $xref_id_sth->execute();
658 $xref_id_sth->bind_columns(\$xref_id);
659 $xref_id_sth->fetch();
660 $ox_id_sth->execute();
661 $ox_id_sth->bind_columns(\$ox_id);
663 $del_xref_sth->execute();
664 while ($get_genes->fetch()) {
# The transcript-name external_db is the gene external_db name with the
# suffix _trans_name appended.
# NOTE(review): if no external_db row matches, $external_db_id keeps the value
# from the previous gene because the fetch result is not checked - confirm
# this is intended.
666 $get_source_id->execute($external_db .
"_trans_name");
667 $get_source_id->bind_columns(\$external_db_id);
668 $get_source_id->fetch();
669 $get_transcripts->execute($gene_id);
670 $get_transcripts->bind_columns(\$transcript_id);
671 while ($get_transcripts->fetch) {
674 $reuse_xref_sth->execute($external_db_id, $label .
'-' . $ext);
675 $reuse_xref_sth->bind_columns(\$reuse_xref_id);
676 if ($reuse_xref_sth->fetch()) {
677 $ins_ox_sth->execute($ox_id, $transcript_id, $reuse_xref_id);
678 $update_tran_sth->execute($reuse_xref_id, $transcript_id);
680 $info_text =
'via gene ' . $acc;
681 $ins_xref_sth->execute($xref_id, $external_db_id, $label.
"-" . $ext, $label .
"-" . $ext, $description, $info_text);
682 $ins_ox_sth->execute($ox_id, $transcript_id, $xref_id);
683 $update_tran_sth->execute($xref_id, $transcript_id);
689 $del_xref_sth->finish();
690 $del_ox_sth->execute();
691 $del_ox_sth->finish();
692 $reuse_xref_sth->finish();
693 $xref_id_sth->finish();
694 $ox_id_sth->finish();
695 $get_genes->finish();
696 $get_source_id->finish();
697 $get_transcripts->finish();
698 $ins_xref_sth->finish();
699 $ins_ox_sth->finish();
700 $update_tran_sth->finish();
# Fragment of a consistency check; the enclosing sub line and most of its
# body are not visible in this listing. The visible part binds a fetched
# value to $old_label and prints an error when it differs from $label.
704 # Remove after sure everything is cool
714 $sth->bind_columns(\$old_label);
717 if($old_label ne $label){
718 print
"ERROR: $type ($id) has different display_xrefs ??? old:$old_label new:$label\n";
# Builds two lookups for the xref sources that actually have xrefs.
#
# Argument: a hash reference mapping external_db names to display names.
# Returns a two-element list of hash references:
#   1. source_id to external display name
#   2. source name to source_id
# Dies when a source name is missing from the supplied hash, unless the name
# ends in notransfer, in which case the source is silently skipped.
sub set_source_id_to_external_name {
  my ($self, $name_to_external_name_href) = @_;

  my %external_name_for;
  my %source_id_for;

  # only get those of interest
  my $query = 'select s.source_id, s.name from source s, xref x where x.source_id = s.source_id group by s.source_id';

  my $source_sth = $self->xref->dbc->prepare($query);
  $source_sth->execute();

  my ($id, $name);
  $source_sth->bind_columns(\$id, \$name);

  while ($source_sth->fetch()) {
    my $external_name = $name_to_external_name_href->{$name};

    if (defined $external_name) {
      $external_name_for{$id} = $external_name;
      $source_id_for{$name}   = $id;
      next;
    }

    # Sources flagged as notransfer are allowed to be unknown.
    next if ($name =~ /notransfer$/);

    die "ERROR: Could not find $name in external_db table please add this too continue";
  }
  $source_sth->finish;

  return (\%external_name_for, \%source_id_for);
}
# Assigns gene descriptions in the core database from xref descriptions.
# Argument: an optional flag; when false all gene descriptions are first set
# to null, when true the genes already flagged desc_set in the xref database
# are read first (their handling is in lines not visible in this listing).
# The visible code then:
#   - maps source ids to external display names;
#   - obtains the source precedence, ignore SQL and filter regexps, from the
#     mapper when it implements the corresponding method;
#   - stores the precedence in gene_desc_priority and calls _apply_ignore;
#   - walks one ordered query and, per gene, stores the first description
#     that is non-empty after filter_by_regexp, adding a Source/Acc suffix
#     unless the source is listed by no_source_label_list of the mapper;
#   - sets the temporarily ignored object_xrefs back to DUMP_OUT.
# The UPDATE only touches genes whose description is still NULL.
# NOTE(review): declarations of $self, $sql, %ignore, @precedence, @regexps,
# $i, $source_id and $source_name, the else branches, closing braces and
# parts of the SQL heredoc are not visible in this listing.
757 sub set_gene_descriptions{
759 my $only_those_not_set = shift || 0;
761 my $core_dbi = $self->core->dbc;
762 my $xref_dbi = $self->xref->dbc;
764 my $update_gene_desc_sth = $core_dbi->prepare(
"UPDATE gene SET description = ? where gene_id = ? and description IS NULL");
766 if(!$only_those_not_set){
767 my $reset_sth = $core_dbi->prepare(
"UPDATE gene SET description = null");
768 $reset_sth->execute();
773 if($only_those_not_set){
# This message is printed regardless of the verbose setting.
774 print
"Only setting those not already set\n";
775 $sql =
"select internal_id from gene_stable_id where desc_set = 1";
776 my $sql_sth = $xref_dbi->prepare($sql);
779 $sql_sth->bind_columns(\$id);
780 while($sql_sth->fetch){
786 ##########################################
787 # Get source_id to external_display_name #
788 ##########################################
790 my %name_to_external_name;
791 $sql =
"select external_db_id, db_name, db_display_name from external_db";
792 my $sth = $core_dbi->prepare($sql);
794 my ($id, $name, $display_name);
795 $sth->bind_columns(\$id, \$name, \$display_name);
796 while($sth->fetch()){
797 $name_to_external_name{$name} = $display_name;
801 my ($source_id_to_external_name_href, $name_to_source_id_href);
802 if( $self->mapper->can(
"set_source_id_to_external_name") ){
803 ($source_id_to_external_name_href, $name_to_source_id_href) = $self->mapper->set_source_id_to_external_name (\%name_to_external_name);
806 ($source_id_to_external_name_href, $name_to_source_id_href) = $self->set_source_id_to_external_name (\%name_to_external_name);
809 my %source_id_to_external_name = %$source_id_to_external_name_href;
810 my %name_to_source_id = %$name_to_source_id_href;
815 my ($ignore, $precedence);
# NOTE(review): the mapper result is assigned to @precedence as a plain list,
# whereas the local gene_description_sources returns an array reference that
# is unpacked into $precedence and $ignore below; in the mapper branch $ignore
# stays undefined - confirm the mapper contract.
816 if( $self->mapper->can(
"gene_description_sources") ){
817 @precedence = $self->mapper->gene_description_sources();
820 ($precedence, $ignore) = @{$self->gene_description_sources()};
821 @precedence = @$precedence;
824 if( $self->mapper->can(
"gene_description_filter_regexps") ){
825 @regexps = $self->mapper->gene_description_filter_regexps();
828 @regexps = $self->gene_description_filter_regexps();
831 my $ins_p_sth = $xref_dbi->prepare(
"INSERT ignore into gene_desc_priority (source_id, priority) values(?, ?)");
832 my $get_source_id_sth = $xref_dbi->prepare(
"select source_id from source where name like ?");
833 my $list_sources_sth = $xref_dbi->prepare(
"select distinct name from gene_desc_priority d join source using(source_id) order by d.priority");
835 # The lower the priority number the better then
837 foreach my $name (@precedence){
839 $get_source_id_sth->execute($name);
841 $get_source_id_sth->bind_columns(\$source_id);
842 while($get_source_id_sth->fetch){
843 $ins_p_sth->execute($source_id, $i);
847 $get_source_id_sth->finish;
851 if ($self->verbose) {
852 print
"Precedence for gene descriptions (1- best description)\n";
853 $list_sources_sth->execute();
855 $list_sources_sth->bind_columns(\$source_name);
856 while ($list_sources_sth->fetch() ) {
858 print
"\t$i\t$source_name\n";
863 $self->_apply_ignore($ignore, $xref_dbi);
867 #######################################################################
868 my $gene_desc_sql =(<<DXS);
869 select CASE ox.ensembl_object_type
870 WHEN
'Gene' THEN gtt_gene.gene_id
871 WHEN
'Transcript' THEN gtt_transcript.gene_id
872 WHEN
'Translation' THEN gtt_translation.gene_id
874 x.description AS description,
875 s.source_id AS source_id,
877 from ( gene_desc_priority p
880 join ( object_xref ox
881 join ( identity_xref ix
882 )
using (object_xref_id)
887 left join gene_transcript_translation gtt_gene
888 on (gtt_gene.gene_id = ox.ensembl_id)
889 left join gene_transcript_translation gtt_transcript
890 on (gtt_transcript.transcript_id = ox.ensembl_id)
891 left join gene_transcript_translation gtt_translation
892 on (gtt_translation.translation_id = ox.ensembl_id)
893 where ox.ox_status = 'DUMP_OUT'
895 ox.ensembl_object_type,
897 (ix.target_identity+ix.query_identity) desc
901 ########################################################################
# NOTE(review): $gene_sth is prepared here but not used in the visible code.
903 my $gene_sth = $core_dbi->prepare(
"select g.description from gene g where g.gene_id = ?");
905 my %no_source_name_in_desc;
906 if( $self->mapper->can(
"no_source_label_list") ){
907 foreach my $name (@{$self->mapper->no_source_label_list()}){
908 my $id = $name_to_source_id{$name};
909 print
"$name will not have [Source:...] info in desc\n";
910 $no_source_name_in_desc{$id} = 1;
914 my $gene_desc_sth = $xref_dbi->prepare($gene_desc_sql);
916 $gene_desc_sth->execute();
917 my ($gene_id, $desc,$source_id,$label);
918 $gene_desc_sth->bind_columns(\$gene_id, \$desc, \$source_id,\$label);
920 my %gene_desc_updated;
922 while($gene_desc_sth->fetch()){
# NOTE(review): %ignore here is a hash, distinct from the $ignore reference
# declared above; its declaration and population are not visible in this
# listing. A gene is skipped once it has received a description.
924 next
if(exists($ignore{$gene_id}) || exists($gene_desc_updated{$gene_id}));
927 my $filtered_desc = $self->filter_by_regexp($desc, \@regexps);
928 if ($filtered_desc ne
"") {
929 if(!defined($no_source_name_in_desc{$source_id})){
930 $filtered_desc .=
" [Source:".$source_id_to_external_name{$source_id}.
";Acc:".$label.
"]";
932 $update_gene_desc_sth->execute($filtered_desc,$gene_id);
933 $gene_desc_updated{$gene_id} = 1;
937 $update_gene_desc_sth->finish;
938 $gene_desc_sth->finish;
939 print scalar(keys %gene_desc_updated) .
" gene descriptions added\n";
941 # reset the status to DUMP_OUT for object_xrefs that were ignored for the display_xref;
943 my $reset_status_sth = $xref_dbi->prepare(
'UPDATE object_xref SET ox_status = "DUMP_OUT" where ox_status = "NO_DISPLAY"');
944 $reset_status_sth->execute();
945 $reset_status_sth->finish;
# Takes a string and an array reference of regular expressions and iterates
# over the expressions. set_gene_descriptions compares the return value with
# the empty string to decide whether a description is usable.
# NOTE(review): the loop body and the return statement are not visible in
# this listing; presumably each expression is removed from the string -
# confirm against the full file.
950 sub filter_by_regexp {
952 my ($self, $str, $regexps) = @_;
954 foreach my $regexp (@$regexps) {
# Fragment of a consistency check; the enclosing sub line and most of its
# body are not visible in this listing. The visible part binds a fetched
# value to $old_desc and prints an error when it differs from $desc.
971 $sth->bind_columns(\$old_desc);
974 if($old_desc ne $desc){
975 print
"ERROR: $type ($id) has different descriptions ??? \n\told:$old_desc \n\tnew:$desc\n";