ensembl-hive  2.7.0
DisplayXrefs.pm
Go to the documentation of this file.
1 =head1 LICENSE
2 
3 See the NOTICE file distributed with this work for additional information
4 regarding copyright ownership.
5 
6 Licensed under the Apache License, Version 2.0 (the "License");
7 you may not use this file except in compliance with the License.
8 You may obtain a copy of the License at
9 
10  http://www.apache.org/licenses/LICENSE-2.0
11 
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
17 
18 =cut
19 
20 package XrefMapper::DisplayXrefs;
21 use strict;
22 
23 use vars '@ISA';
24 @ISA = qw{ XrefMapper::BasicMapper };
25 
26 use warnings;
28 
29 use Cwd;
30 use DBI;
31 use File::Basename;
32 use IPC::Open3;
33 
34 my %genes_to_transcripts;
35 my %translation_to_transcript;
36 my %transcript_to_translation;
37 my %transcript_length;
38 
39 
40 #
41 # ignore should be some sql to return object_xref_ids that should be ignored. FOR full mode METHOD
42 # ignore should return regexp and source name as key for update METHODS
43 #
44 
# Returns the sources used when choosing gene descriptions.
#
# The result is a new two-element array reference, [precedence, ignore],
# built from the first two elements of whatever
# gene_display_xref_sources() returns.
sub gene_description_sources {
  my ($self) = @_;

  my $display_sources = $self->gene_display_xref_sources();
  my @pair = @{$display_sources}[0, 1];

  return [@pair];
}
52 
# Returns the list of regular expressions (as strings) that are stripped
# from xref descriptions before they are used as gene descriptions.
#
# filter_by_regexp() applies the patterns one after another, in list
# order and case-insensitively, so the order below is significant and
# must not be changed. Some patterns appear more than once; they are
# kept as they are.
sub gene_description_filter_regexps {

  return (
    # RIKEN clone names, BAC/clone identifiers and similar
    '[0-9A-Z]{10}RIK PROTEIN[ \.]',
    '\(?[0-9A-Z]{10}RIK PROTEIN\)?[ \.]',
    '^BA\S+\s+\(NOVEL PROTEIN\)\.?',
    '^BC\d+\_\d+\.?',
    'CDNA SEQUENCE\s?,? [A-Z]+\d+[ \.;]',
    '^CGI\-\d+ PROTEIN\.?\;?',
    '^CHROMOSOME\s+\d+\s+OPEN\s+READING\s+FRAME\s+\d+\.?.*',
    'CLONE MGC:\d+[ \.;]',
    '^\(CLONE REM\d+\) ORF \(FRAGMENT\)\.*',
    '\(CLONE \S+\)\s+',
    '^DJ\S+\s+\(NOVEL PROTEIN\)\.?',
    '^DKFZP[A-Z0-9]+\s+PROTEIN[\.;]?.*',
    'DNA SEGMENT, CHR.*',
    'EST [A-Z]+\d+[ \.;]',
    'EXPRESSED SEQUENCE [A-Z]+\d+[ \.;]',
    '^FKSG\d+\.?.*',
    '^FLJ\d+\s+PROTEIN.*',
    '^HSPC\d+.*',
    '^HSPC\d+\s+PROTEIN\.?.*',

    # hypothetical / placeholder protein names
    'HYPOTHETICAL PROTEIN,',
    'HYPOTHETICAL PROTEIN \S+[\.;]',
    '^\(*HYPOTHETICAL\s+.*',
    '\(*HYPOTHETICAL\s+.*',
    '^KIAA\d+\s+GENE\s+PRODUCT\.?.*',
    '^KIAA\d+\s+PROTEIN\.?.*',
    '^LOC\d+\s*(PROTEIN)?\.?',
    ' MGC:\s*\d+[ \.;]',
    'MGC:\s*\d+[ \.;]',
    '^ORF.*',
    '^ORF\s*\d+\s+PROTEIN\.*',
    '^PRED\d+\s+PROTEIN.*',
    '^PRO\d+\.?.*',
    '^PRO\d+\s+PROTEIN\.?.*',
    '^PROTEIN C\d+ORF\d+\.*',
    'PROTEIN KIAA\d+[ \.].*',
    'PROTEIN \S+ HOMOLOG\.?',
    '^Putative uncharacterized protein.*',
    'R\d{5}_\d[ \.,].*',
    'RIKEN CDNA [0-9A-Z]{10}[ \.;]',
    'RIKEN CDNA [0-9A-Z]{10}[ \.]',
    '.*RIKEN FULL-LENGTH ENRICHED LIBRARY.*',
    '.*RIKEN FULL-LENGTH ENRICHED LIBRARY.*PRODUCT:',

    # leftovers such as bare "(123)", "(FRAGMENT)" or "(GENE)"
    '^\s*\(\d*\)\s*[ \.]$',
    '^\s*\(\d*\)\s*[ \.]$',
    '^\s*\(?FRAGMENT\)?\.?\s*$',
    '^\s*\(FRAGMENT\)\.?\s*$',
    '\s*\(?GENE\)?\.?;?',
    '^\s*\(?GENE\)?\.?;?\s*$',
    '^\s*\(?GENE\)?\.?\s*$',

    # "similar to ..." descriptions
    'SIMILAR TO GENBANK ACCESSION NUMBER\s+\S+',
    '^SIMILAR TO GENE.*',
    '^SIMILAR TO HYPOTHETICAL.*',
    '^SIMILAR TO (KIAA|LOC).*',
    'SIMILAR TO (KIAA|LOC|RIKEN).*',
    '^SIMILAR TO PUTATIVE[ \.]',
    'SIMILAR TO PUTATIVE[ \.]',
    '^SIMILAR TO\s+$',
    'SIMILAR TO\s+$',

    # remaining generic words and empty shells
    '\s*\(?PRECURSOR\)?\.?;?',
    '^\s*\(?PROTEIN\)?\.?\s*$',
    '^\s+\(?\s*$',
    '^\s*\(\s*\)\s*$',
    '^UNKNOWN\s+.*',
    '^WUGSC:H_.*',
    '^WUGSC:.*\s+PROTEIN\.?.*',
  );

}
122 
# Returns the display xref sources for transcripts. Transcripts use
# exactly what gene_display_xref_sources() returns.
sub transcript_display_xref_sources {
  my ($self) = @_;

  return $self->gene_display_xref_sources();
}
129 
130 
# Returns [\@precedence, \%ignore] for gene display xrefs.
#
# @precedence lists source names, best first.
# %ignore maps a label to an SQL query selecting the object_xref_ids that
# must not be used as display xrefs:
#   EntrezGene - EntrezGene labels dependent on predicted RefSeqs
#   LOC_prefix - labels starting with LOC followed by digits
sub gene_display_xref_sources {
  my ($self) = @_;

  my @precedence = (
    'VGNC',
    'HGNC',
    'MGI',
    'RGD',
    'ZFIN_ID',
    'Xenbase',
    'RFAM',
    'miRBase',
    'EntrezGene',
    'Uniprot_gn',
  );

  # The heredoc bodies contain nothing that Perl would interpolate, so
  # they are quoted literally.
  my %ignore = (
    'EntrezGene' => <<'IEG',
SELECT DISTINCT ox.object_xref_id
 FROM object_xref ox, dependent_xref dx,
 xref xmas, xref xdep,
 source smas, source sdep
 WHERE ox.xref_id = dx.dependent_xref_id AND
 dx.dependent_xref_id = xdep.xref_id AND
 dx.master_xref_id = xmas.xref_id AND
 xmas.source_id = smas.source_id AND
 xdep.source_id = sdep.source_id AND
 smas.name like "Refseq%predicted" AND
 sdep.name like "EntrezGene" AND
 ox.ox_status = "DUMP_OUT" AND
 ox.master_xref_id = dx.master_xref_id
IEG
    'LOC_prefix' => <<'LOCP',
SELECT object_xref_id
 FROM object_xref JOIN xref USING(xref_id) JOIN source USING(source_id)
 WHERE ox_status = 'DUMP_OUT' AND label REGEXP '^LOC[[:digit:]]+'
LOCP
  );

  return [\@precedence, \%ignore];
}
176 
# Empties the display_xref_priority and gene_desc_priority tables of the
# xref database, in that order. Returns nothing.
sub remove_source_priorities {
  my ($self) = @_;

  foreach my $delete_sql ("DELETE from display_xref_priority",
                          "DELETE from gene_desc_priority") {
    my $delete_sth = $self->xref->dbc->prepare($delete_sql);
    $delete_sth->execute();
  }

  return;
}
190 
191 
# Constructor.
#
# $mapper is the mapper object this helper works for. Its core, xref and
# verbose values are copied onto the new object, and the mapper itself is
# kept so that species-specific overrides can be looked up later.
sub new {
  my ($class, $mapper) = @_;

  my $self = bless {}, $class;

  $self->core($mapper->core);
  $self->xref($mapper->xref);
  $self->mapper($mapper);
  $self->verbose($mapper->verbose);

  return $self;
}
203 
204 
# Getter/setter for the mapper object.
#
# When a defined argument is given it is stored; the stored value (or
# undef when nothing has been stored) is returned in either case.
sub mapper {
  my ($self, $arg) = @_;

  if (defined $arg) {
    $self->{_mapper} = $arg;
  }

  return $self->{_mapper};
}
212 
213 
214 
# Sets display xrefs, transcript names and gene descriptions, records
# progress in the xref database and refreshes the xref timestamp.
#
# For each of set_display_xrefs, transcript_names_from_gene and
# set_gene_descriptions the mapper's own implementation is used when it
# provides one; otherwise the implementation of this object is used.
# Note that the local set_gene_descriptions is called with 1, while the
# mapper's version is called without arguments.
#
# Returns 1.
sub genes_and_transcripts_attributes_set {
  my ($self) = @_;

  my $mapper = $self->mapper;

  # The value is not used below; the call is kept as it was.
  my $status = $mapper->xref_latest_status();

  # Inserts one row into process_status using the given statement.
  my $record_status = sub {
    my ($insert_sql) = @_;
    my $sth_stat = $self->xref->dbc->prepare($insert_sql);
    $sth_stat->execute();
    $sth_stat->finish;
    return;
  };

  my $display_target = $mapper->can("set_display_xrefs") ? $mapper : $self;
  $display_target->set_display_xrefs();

  my $names_target = $mapper->can("transcript_names_from_gene") ? $mapper : $self;
  $names_target->transcript_names_from_gene();

  $record_status->("insert into process_status (status, date) values('display_xref_done',now())");

  if ($mapper->can("set_gene_descriptions")) {
    $mapper->set_gene_descriptions();
  }
  else {
    $self->set_gene_descriptions(1);
  }

  $self->build_meta_timestamp;

  $record_status->("insert into process_status (status, date) values('gene_description_done',now())");

  return 1;
}
254 
# Convenience wrapper: runs set_gene_descriptions(1), i.e. only genes
# whose description has not been set yet are considered.
sub set_gene_descriptions_from_display_xref {
  my ($self) = @_;

  return $self->set_gene_descriptions(1);
}
260 
261 
262 
263 
# Copies the display xrefs (and, for genes, the descriptions) recorded in
# the xref database's gene_stable_id / transcript_stable_id tables into
# the core database.
#
# Steps:
#   1. reset gene and transcript display_xref_id in core, and clear gene
#      descriptions that carry a "[Source:...]" tag;
#   2. for every gene with a display xref, set gene.display_xref_id
#      (xref_id + xref_offset) and, when the xref has a description, set
#      the gene description and flag desc_set in the xref database;
#   3. set transcript.display_xref_id the same way;
#   4. delete external synonyms attached to xrefs that are not the
#      display xref of any gene.
sub set_display_xrefs_from_stable_table{
  my $self = shift;
  print "Setting Transcript and Gene display_xrefs from xref database into core and setting the desc\n" if ($self->verbose);

  my $xref_offset = $self->get_meta_value("xref_offset");
  my $core_dbi = $self->core->dbc;
  my $xref_dbi = $self->xref->dbc;

  print "Using xref_off set of $xref_offset\n" if($self->verbose);

  # Offset actually used in arithmetic; an unset offset behaves as 0.
  my $offset = defined($xref_offset) ? $xref_offset : 0;

  my $reset_sth = $core_dbi->prepare("UPDATE gene SET display_xref_id = null");
  $reset_sth->execute();
  $reset_sth->finish;

  # Remove any leftover transcript display xref
  $reset_sth = $core_dbi->prepare("UPDATE transcript SET display_xref_id = null");
  $reset_sth->execute();
  $reset_sth->finish;

  # Remove descriptions assigned through the xref pipeline, recognisable by the 'Source' field
  # This will maintain any manually added descriptions
  $reset_sth = $core_dbi->prepare("UPDATE gene SET description = null WHERE description like '%[Source:%]%'");
  $reset_sth->execute();
  $reset_sth->finish;

  # external_db name -> display name (core database)
  my %name_to_external_name;
  my $sql = "select external_db_id, db_name, db_display_name from external_db";
  my $sth = $core_dbi->prepare($sql);
  $sth->execute();
  my ($id, $name, $display_name);
  $sth->bind_columns(\$id, \$name, \$display_name);
  while($sth->fetch()){
    $name_to_external_name{$name} = $display_name;
  }
  $sth->finish;

  # xref source_id -> external display name, for sources that have xrefs
  my %source_id_to_external_name;

  $sql = 'select s.source_id, s.name from source s, xref x where x.source_id = s.source_id group by s.source_id'; # only get those of interest
  $sth = $xref_dbi->prepare($sql);
  $sth->execute();
  $sth->bind_columns(\$id, \$name);

  while($sth->fetch()){
    if(defined($name_to_external_name{$name})){
      $source_id_to_external_name{$id} = $name_to_external_name{$name};
    }
  }
  $sth->finish;


  my $update_gene_sth = $core_dbi->prepare("UPDATE gene g SET g.display_xref_id= ? WHERE g.gene_id=?");
  my $update_gene_desc_sth = $core_dbi->prepare("UPDATE gene g SET g.description= ? WHERE g.gene_id=?");

  my $update_tran_sth = $core_dbi->prepare("UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");

  my $get_gene_display_xref = $xref_dbi->prepare("SELECT gsi.internal_id, gsi.display_xref_id, x.description ,x.source_id, x.accession
 FROM gene_stable_id gsi, xref x
 WHERE gsi.display_xref_id = x.xref_id");

  my $get_tran_display_xref = $xref_dbi->prepare("SELECT gsi.internal_id, gsi.display_xref_id from transcript_stable_id gsi");

  $reset_sth = $xref_dbi->prepare("UPDATE gene_stable_id gsi SET gsi.desc_set=0");
  $reset_sth->execute();
  $reset_sth->finish;

  my $set_desc_done_sth = $xref_dbi->prepare("UPDATE gene_stable_id gsi SET gsi.desc_set=1 WHERE gsi.internal_id=?");

  $get_gene_display_xref->execute();
  my $xref_id;
  my $desc;
  my $gene_id;
  my $source_id;
  my $label;
  $get_gene_display_xref->bind_columns(\$gene_id, \$xref_id, \$desc, \$source_id, \$label);
  my $gene_count =0;
  while($get_gene_display_xref->fetch()){

    $update_gene_sth->execute($xref_id + $offset, $gene_id);

    if (defined($desc) and $desc ne "") {
      # A source without a matching external_db entry yields an empty name
      my $source_name = $source_id_to_external_name{$source_id};
      $source_name = "" if !defined($source_name);
      $desc .= " [Source:".$source_name.";Acc:".$label."]";
      $update_gene_desc_sth->execute($desc,$gene_id);
      $set_desc_done_sth->execute($gene_id);
      $gene_count++;
    }

  }

  $get_gene_display_xref->finish;
  $set_desc_done_sth->finish;
  $update_gene_desc_sth->finish;
  $update_gene_sth->finish;

  print "$gene_count gene descriptions added\n" if($self->verbose);

  $get_tran_display_xref->execute();
  my $tran_id;
  $get_tran_display_xref->bind_columns(\$tran_id, \$xref_id);

  while($get_tran_display_xref->fetch()){
    next if !defined($xref_id);

    # Report incomplete data before touching the database
    if(!defined($tran_id) || !defined($xref_offset)){
      print "PROB: tran_id = " . (defined($tran_id) ? $tran_id : 'undef')
          . "\nxref_id = $xref_id\nxref_offset = "
          . (defined($xref_offset) ? $xref_offset : 'undef') . "\n";
    }

    # Without a transcript id the UPDATE could not match any row
    next if !defined($tran_id);

    $update_tran_sth->execute($xref_id + $offset, $tran_id);
  }

  $get_tran_display_xref->finish;
  $update_tran_sth->finish;

  #
  # Clean up synonyms linked to xrefs which are not the display xref
  # Synonyms are only used as alternative gene names, so should be synonyms of the gene symbol chosen
  #

  my $syn_clean_sth = $core_dbi->prepare("DELETE es FROM external_synonym es, xref x LEFT JOIN gene g ON g.display_xref_id = x.xref_id WHERE es.xref_id = x.xref_id AND isnull(g.display_xref_id)");
  $syn_clean_sth->execute();
  $syn_clean_sth->finish();

}
380 
381 
# Fills the file-level %translation_to_transcript and
# %transcript_to_translation hashes from the core translation table.
sub load_translation_to_transcript {
  my $self = shift;

  my $pairs_sth = $self->core->dbc->prepare("SELECT translation_id, transcript_id FROM translation");
  $pairs_sth->execute();

  my ($tl_id, $tr_id);
  $pairs_sth->bind_columns(\($tl_id, $tr_id));

  while ($pairs_sth->fetch()) {
    $translation_to_transcript{$tl_id} = $tr_id;

    # The reverse mapping is only recorded for a true translation id
    next unless $tl_id;
    $transcript_to_translation{$tr_id} = $tl_id;
  }
}
396 
397 
# Fills the file-level %genes_to_transcripts hash (gene_id -> list of
# transcript_ids) and %transcript_length (transcript_id ->
# seq_region_end - seq_region_start) from the core transcript table.
sub build_genes_to_transcripts {
  my $self = shift;

  my $transcripts_sth = $self->core->dbc->prepare(
    "SELECT gene_id, transcript_id, seq_region_start, seq_region_end FROM transcript");
  $transcripts_sth->execute();

  my ($gene, $transcript, $region_start, $region_end);
  $transcripts_sth->bind_columns(\($gene, $transcript, $region_start, $region_end));

  # Both hashes are file-level lexicals shared by the whole module
  while ($transcripts_sth->fetch()) {
    $transcript_length{$transcript} = $region_end - $region_start;
    push @{ $genes_to_transcripts{$gene} }, $transcript;
  }

  $transcripts_sth->finish
}
416 
417 
# (Re)sets the 'xref.timestamp' key of the core meta table: the existing
# entry is deleted and a new one holding NOW() is inserted.
# Returns nothing.
sub build_meta_timestamp {
  my ($self) = @_;

  my @statements = (
    "DELETE FROM meta WHERE meta_key='xref.timestamp'",
    "INSERT INTO meta (meta_key,meta_value) VALUES ('xref.timestamp', NOW())",
  );

  foreach my $meta_sql (@statements) {
    my $meta_sth = $self->core->dbc->prepare($meta_sql);
    $meta_sth->execute();
    $meta_sth->finish;
  }

  return;
}
434 
435 
436 
# Chooses and stores the display xref of every gene and transcript.
#
# For each object type (gene first, then transcript):
#   1. the source precedence and ignore queries are taken from the
#      mapper's <type>_display_xref_sources() when it has one, otherwise
#      from this object;
#   2. the precedence is written to display_xref_priority (lower number
#      means better);
#   3. object_xrefs matched by the ignore queries are temporarily set to
#      NO_DISPLAY;
#   4. candidate xrefs are read best-first and the first one seen for an
#      object becomes its display xref (xref_id + xref_offset). Genes that
#      already have a display xref are left untouched by the UPDATE.
# Afterwards NO_DISPLAY object_xrefs are switched back to DUMP_OUT and
# synonyms of xrefs that are not a gene display xref are deleted.
sub set_display_xrefs{
  my $self = shift;


  print "Building Transcript and Gene display_xrefs\n" if ($self->verbose);

  my $xref_offset = $self->get_meta_value("xref_offset");
  my $core_dbi = $self->core->dbc();
  my $xref_dbi = $self->xref->dbc();

  print "Using xref_off set of $xref_offset\n" if($self->verbose);

  my $reset_sth = $core_dbi->prepare("UPDATE transcript SET display_xref_id = null WHERE biotype NOT IN ('LRG_gene')");
  $reset_sth->execute();
  $reset_sth->finish;

  my $update_gene_sth = $core_dbi->prepare("UPDATE gene g SET g.display_xref_id= ? WHERE g.gene_id=? and g.display_xref_id IS NULL");
  my $update_tran_sth = $core_dbi->prepare("UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");

  my $ins_p_sth = $xref_dbi->prepare("INSERT ignore into display_xref_priority (ensembl_object_type,source_id, priority) values(?, ?, ?)");
  my $get_source_id_sth = $xref_dbi->prepare("select source_id from source where name like ? order by priority");
  my $list_sources_sth = $xref_dbi->prepare("select distinct name from display_xref_priority d join source using(source_id) where ensembl_object_type = ? order by d.priority");

#look at sources of display xrefs which are relevant for this object type
#(listed in gene_display_xref_sources() or transcript_display_xref_sources() )
#but get xrefs for all levels Gene, its Transcripts and Translations
#######################################################################
my $display_xref_sql =(<<DXS);
select CASE ox.ensembl_object_type
 WHEN 'Gene' THEN gtt_gene.gene_id
 WHEN 'Transcript' THEN gtt_transcript.gene_id
 WHEN 'Translation' THEN gtt_translation.gene_id
 END AS d_gene_id,
 CASE ox.ensembl_object_type
 WHEN 'Gene' THEN gtt_gene.transcript_id
 WHEN 'Transcript' THEN gtt_transcript.transcript_id
 WHEN 'Translation' THEN gtt_translation.transcript_id
 END AS d_transcript_id,
 p.priority as priority,
 x.xref_id
from ( display_xref_priority p
 join ( source s
 join ( xref x
 join ( object_xref ox
 join ( identity_xref ix
 ) using (object_xref_id)
 ) using (xref_id)
 ) using (source_id)
 ) using (source_id)
 )
 left join gene_transcript_translation gtt_gene
 on (gtt_gene.gene_id = ox.ensembl_id)
 left join gene_transcript_translation gtt_transcript
 on (gtt_transcript.transcript_id = ox.ensembl_id)
 left join gene_transcript_translation gtt_translation
 on (gtt_translation.translation_id = ox.ensembl_id)
where ox.ox_status = 'DUMP_OUT'
 and p.ensembl_object_type = ?
order by d_gene_id, ox.ensembl_object_type,
 p.priority, (ix.target_identity + ix.query_identity) DESC, unused_priority DESC, x.accession;

DXS
########################################################################

  # key: prefix of the *_display_xref_sources method name,
  # value: ensembl_object_type as stored in display_xref_priority
  my %object_types = ('gene' => 'Gene', 'transcript' => 'Transcript');

  # Sorted so that the processing order (gene, transcript) is always the
  # same; NO_DISPLAY marks set for one type stay in place for the next.
  foreach my $object_type (sort keys %object_types) {

    my $object_type_name = $object_types{$object_type};

    my $precedence;
    my $ignore;
    my $method = $object_type . '_display_xref_sources';
    if( $self->mapper->can($method) ){
      ($precedence, $ignore) = @{$self->mapper->$method()};
    }
    else{
      ($precedence, $ignore) = @{$self->$method()};
    }

    # The lower the priority number the better
    my $i=0;
    foreach my $name (@$precedence){
      $i++;
      $get_source_id_sth->execute($name);
      my $source_id;
      $get_source_id_sth->bind_columns(\$source_id);
      while($get_source_id_sth->fetch){
        $ins_p_sth->execute($object_type_name, $source_id, $i);
      }
    }
    $ins_p_sth->finish;
    $get_source_id_sth->finish;

    $i = 0;
    if ($self->verbose) {
      print "Precedence for $object_type display xrefs (1- best name)\n";
      $list_sources_sth->execute($object_type_name);
      my $source_name;
      $list_sources_sth->bind_columns(\$source_name);
      while ($list_sources_sth->fetch() ) {
        $i++;
        print "\t$i\t$source_name\n";
      }

    }

    $self->_apply_ignore($ignore, $xref_dbi);

    my %object_seen;

    my $display_xref_sth = $xref_dbi->prepare($display_xref_sql);

    my $display_xref_count = 0;
    # Use the same spelling of the object type that was written to
    # display_xref_priority above.
    $display_xref_sth->execute($object_type_name);
    my ($gene_id, $transcript_id, $priority, $xref_id);
    $display_xref_sth->bind_columns(\$gene_id, \$transcript_id, \$priority, \$xref_id);
    while($display_xref_sth->fetch()){
      my $object_id = ($object_type eq 'gene') ? $gene_id : $transcript_id;

      # The left joins yield NULL ids for object_xrefs whose ensembl_id is
      # not in gene_transcript_translation; there is nothing to update.
      next if !defined($object_id);

      # Rows arrive best-first, so only the first row per object counts
      next if exists($object_seen{$object_id});

      if ($object_type eq 'gene') {
        $update_gene_sth->execute($xref_id+$xref_offset, $object_id);
      } else {
        $update_tran_sth->execute($xref_id+$xref_offset, $object_id);
      }
      $display_xref_count++;
      $object_seen{$object_id} = 1;
    }

    $display_xref_sth->finish;
    $update_gene_sth->finish;
    $update_tran_sth->finish;

    print "Updated $display_xref_count $object_type display_xrefs\n" if($self->verbose);

  }

  #
  # reset the status to DUMP_OUT for object_xrefs that were ignored for the display_xref;
  #

  my $reset_status_sth = $xref_dbi->prepare('UPDATE object_xref SET ox_status = "DUMP_OUT" where ox_status = "NO_DISPLAY"');
  $reset_status_sth->execute();
  $reset_status_sth->finish;

  #
  # Clean up synonyms linked to xrefs which are not the display xref
  # Synonyms are only used as alternative gene names, so should be synonyms of the gene symbol chosen
  #

  my $syn_clean_sth = $core_dbi->prepare("DELETE es FROM external_synonym es, xref x LEFT JOIN gene g ON g.display_xref_id = x.xref_id WHERE es.xref_id = x.xref_id AND isnull(g.display_xref_id)");
  $syn_clean_sth->execute();
  $syn_clean_sth->finish();


}
604 
# Marks object_xrefs that must not be used as display xref or description
# by setting their ox_status to "NO_DISPLAY".
#
#   $ignore   - hash ref mapping a label to an SQL query that returns
#               object_xref_ids; may be undef or empty, in which case only
#               the numeric-label rule below is applied
#   $xref_dbi - connection to the xref database
#
# Two rules are applied:
#   1. every DUMP_OUT object_xref whose xref label consists of digits only;
#   2. every object_xref_id returned by one of the $ignore queries.
sub _apply_ignore {
  my ($self, $ignore, $xref_dbi) = @_;

  # Callers pass undef when no ignore queries are available
  my $ignore_queries = defined($ignore) ? $ignore : {};

  # Set status to 'NO_DISPLAY' for object_xrefs with a display_label that is just numeric;
  my $update_ignore_sth = $xref_dbi->prepare('UPDATE object_xref ox, source s, xref x SET ox_status = "NO_DISPLAY" where ox_status like "DUMP_OUT" and s.source_id = x.source_id and x.label REGEXP "^[0-9]+$" and ox.xref_id = x.xref_id');
  $update_ignore_sth->execute();
  $update_ignore_sth->finish;

  $update_ignore_sth = $xref_dbi->prepare('UPDATE object_xref SET ox_status = "NO_DISPLAY" where object_xref_id = ?');

  foreach my $ignore_sql (values %{$ignore_queries}){
    print "IGNORE SQL: $ignore_sql\n" if($self->verbose);
    my $ignore_sth = $xref_dbi->prepare($ignore_sql);
    $ignore_sth->execute();
    my ($object_xref_id);
    $ignore_sth->bind_columns(\$object_xref_id);
    while($ignore_sth->fetch()){
      $update_ignore_sth->execute($object_xref_id);
    }
    $ignore_sth->finish;
  }
  $update_ignore_sth->finish;
}
628 
629 
# Names transcripts after their gene.
#
# For every gene with a display xref, its transcripts (ordered by
# seq_region_start, seq_region_end) receive the display labels
# "<gene label>-201", "<gene label>-202", ... . The labels are stored as
# xrefs of info_type MISC in the external_db "<gene db_name>_trans_name";
# an existing matching xref is reused, otherwise a new one is inserted.
# Each transcript gets an object_xref and its display_xref_id is set.
#
# Before that, transcript display xrefs are reset (except biotype
# LRG_gene) and transcript xrefs whose label ends in -2NN are deleted;
# afterwards object_xrefs pointing at deleted xrefs are removed.
#
# Genes for which no "<db_name>_trans_name" external_db exists are
# skipped with a warning.
sub transcript_names_from_gene {
  my $self = shift;
  my $core_dbi = $self->core->dbc;

  print "Assigning transcript names from gene names\n" if ($self->verbose);

  my $reset_sth = $core_dbi->prepare("UPDATE transcript SET display_xref_id = null WHERE biotype NOT IN ('LRG_gene')");
  $reset_sth->execute();
  $reset_sth->finish;

  my $xref_id_sth = $core_dbi->prepare("SELECT max(xref_id) FROM xref");
  my $ox_id_sth = $core_dbi->prepare("SELECT max(object_xref_id) FROM object_xref");
  my $del_xref_sth = $core_dbi->prepare("DELETE x FROM xref x, object_xref ox WHERE x.xref_id = ox.xref_id AND ensembl_object_type = 'Transcript' AND display_label REGEXP '-2[0-9]{2}\$'");
  my $reuse_xref_sth = $core_dbi->prepare("SELECT xref_id FROM xref x WHERE external_db_id = ? AND display_label = ? AND info_type = 'MISC'");
  my $del_ox_sth = $core_dbi->prepare("DELETE ox FROM object_xref ox LEFT JOIN xref x ON x.xref_id = ox.xref_id WHERE isnull(x.xref_id)");
  my $ins_xref_sth = $core_dbi->prepare("INSERT IGNORE into xref (xref_id, external_db_id, dbprimary_acc, display_label, version, description, info_type, info_text) values(?, ?, ?, ?, 0, ?, 'MISC', ?)");
  my $ins_ox_sth = $core_dbi->prepare("INSERT into object_xref (object_xref_id, ensembl_id, ensembl_object_type, xref_id) values(?, ?, 'Transcript', ?)");
  my $update_tran_sth = $core_dbi->prepare("UPDATE transcript t SET t.display_xref_id= ? WHERE t.transcript_id=?");

  my $get_genes = $core_dbi->prepare("SELECT g.gene_id, e.db_name, x.dbprimary_acc, x.display_label, x.description FROM gene g, xref x, external_db e where g.display_xref_id = x.xref_id and e.external_db_id = x.external_db_id");
  my $get_transcripts = $core_dbi->prepare("SELECT transcript_id FROM transcript WHERE gene_id = ? ORDER BY seq_region_start, seq_region_end");
  my $get_source_id = $core_dbi->prepare("SELECT external_db_id FROM external_db WHERE db_name like ?");

  $get_genes->execute();
  my ($gene_id, $external_db, $external_db_id, $acc, $label, $description, $transcript_id, $xref_id, $ox_id, $ext, $reuse_xref_id, $info_text);
  $get_genes->bind_columns(\$gene_id, \$external_db, \$acc, \$label, \$description);

  # New xref and object_xref ids continue after the current maxima
  $xref_id_sth->execute();
  $xref_id_sth->bind_columns(\$xref_id);
  $xref_id_sth->fetch();
  $ox_id_sth->execute();
  $ox_id_sth->bind_columns(\$ox_id);
  $ox_id_sth->fetch();

  $del_xref_sth->execute();

  while ($get_genes->fetch()) {
    $ext = '201';

    # The bound variable keeps its previous value when no row is fetched,
    # so clear it first and check that a row really came back.
    $external_db_id = undef;
    $get_source_id->execute($external_db . "_trans_name");
    $get_source_id->bind_columns(\$external_db_id);
    if (!$get_source_id->fetch() || !defined($external_db_id)) {
      warn "No external_db entry found for " . $external_db . "_trans_name, not naming transcripts of gene $gene_id\n";
      next;
    }

    $get_transcripts->execute($gene_id);
    $get_transcripts->bind_columns(\$transcript_id);
    while ($get_transcripts->fetch) {
      $xref_id++;
      $ox_id++;
      my $transcript_label = $label . '-' . $ext;
      $reuse_xref_sth->execute($external_db_id, $transcript_label);
      $reuse_xref_sth->bind_columns(\$reuse_xref_id);
      if ($reuse_xref_sth->fetch()) {
        $ins_ox_sth->execute($ox_id, $transcript_id, $reuse_xref_id);
        $update_tran_sth->execute($reuse_xref_id, $transcript_id);
      } else {
        $info_text = 'via gene ' . $acc;
        $ins_xref_sth->execute($xref_id, $external_db_id, $transcript_label, $transcript_label, $description, $info_text);
        $ins_ox_sth->execute($ox_id, $transcript_id, $xref_id);
        $update_tran_sth->execute($xref_id, $transcript_id);
      }
      $ext++;
    }
  }

  $del_xref_sth->finish();
  # Remove object_xrefs left pointing at xrefs deleted above
  $del_ox_sth->execute();
  $del_ox_sth->finish();
  $reuse_xref_sth->finish();
  $xref_id_sth->finish();
  $ox_id_sth->finish();
  $get_genes->finish();
  $get_source_id->finish();
  $get_transcripts->finish();
  $ins_xref_sth->finish();
  $ins_ox_sth->finish();
  $update_tran_sth->finish();
}
702 
703 
704 # Remove after sure everything is cool
# Remove after sure everything is cool
#
# Debugging aid: looks up the currently stored label of an object and
# prints an ERROR line when it differs from the new one.
#
#   $id    - id passed to the statement
#   $label - the newly computed label
#   $sth   - prepared statement taking $id and returning one column, the
#            stored label
#   $type  - text used in the message (e.g. the object type)
#
# A missing row or an undefined label is compared as the empty string.
sub check_label{
  my $self = shift;
  my $id = shift;
  my $label = shift;
  my $sth = shift;
  my $type = shift;

  $sth->execute($id);
  my $old_label;
  $sth->bind_columns(\$old_label);
  $sth->fetch;

  # No row fetched (or NULL stored) leaves $old_label undefined
  $old_label = "" if !defined($old_label);
  $label = "" if !defined($label);

  if($old_label ne $label){
    print "ERROR: $type ($id) has different display_xrefs ??? old:$old_label new:$label\n";
  }
}
721 
722 
723 
# Builds lookups for the xref sources that actually have xrefs.
#
#   $name_to_external_name_href - hash ref: source name -> display name
#
# Returns two hash refs:
#   source_id   -> external display name
#   source name -> source_id
#
# Sources missing from $name_to_external_name_href are silently skipped
# when their name ends in "notransfer"; any other missing source is fatal.
sub set_source_id_to_external_name {
  my ($self, $name_to_external_name_href) = @_;

  my (%external_name_for, %source_id_for);

  # only get those of interest
  my $sources_sth = $self->xref->dbc->prepare(
    'select s.source_id, s.name from source s, xref x where x.source_id = s.source_id group by s.source_id');
  $sources_sth->execute();

  my ($source_id, $source_name);
  $sources_sth->bind_columns(\($source_id, $source_name));

  while ($sources_sth->fetch()) {
    my $external_name = $name_to_external_name_href->{$source_name};

    if (defined $external_name) {
      $external_name_for{$source_id} = $external_name;
      $source_id_for{$source_name}   = $source_id;
      next;
    }

    next if $source_name =~ /notransfer$/;

    die "ERROR: Could not find $source_name in external_db table please add this too continue";
  }

  $sources_sth->finish;

  return (\%external_name_for, \%source_id_for);
}
754 
755 
756 
# Sets gene descriptions in the core database from xref descriptions.
#
#   $only_those_not_set - when true, genes flagged desc_set = 1 in the
#                         xref database are skipped; when false, all gene
#                         descriptions are cleared first.
#
# The UPDATE only touches genes whose description IS NULL. Candidate
# descriptions are read in order of source precedence and identity; the
# first one that is still non-empty after filter_by_regexp() is used, with
# " [Source:<external name>;Acc:<accession>]" appended unless the source is
# in the mapper's no_source_label_list().
#
# Precedence, ignore queries, filter regexps and the source lookups are
# taken from the mapper when it provides the corresponding method. A
# mapper's gene_description_sources() may return either a plain list of
# source names or the [precedence, ignore] array ref returned by this
# class's own gene_description_sources().
sub set_gene_descriptions{
  my $self = shift;
  my $only_those_not_set = shift || 0;
  my $sql;
  my $core_dbi = $self->core->dbc;
  my $xref_dbi = $self->xref->dbc;

  my $update_gene_desc_sth = $core_dbi->prepare("UPDATE gene SET description = ? where gene_id = ? and description IS NULL");

  if(!$only_those_not_set){
    my $reset_sth = $core_dbi->prepare("UPDATE gene SET description = null");
    $reset_sth->execute();
    $reset_sth->finish;
  }

  # gene ids that already have a description and must be left alone
  my %ignore;
  if($only_those_not_set){
    print "Only setting those not already set\n";
    $sql = "select internal_id from gene_stable_id where desc_set = 1";
    my $sql_sth = $xref_dbi->prepare($sql);
    $sql_sth->execute;
    my $id;
    $sql_sth->bind_columns(\$id);
    while($sql_sth->fetch){
      $ignore{$id} = 1;
    }
    $sql_sth->finish;
  }

  ##########################################
  # Get source_id to external_display_name #
  ##########################################

  my %name_to_external_name;
  $sql = "select external_db_id, db_name, db_display_name from external_db";
  my $sth = $core_dbi->prepare($sql);
  $sth->execute();
  my ($id, $name, $display_name);
  $sth->bind_columns(\$id, \$name, \$display_name);
  while($sth->fetch()){
    $name_to_external_name{$name} = $display_name;
  }
  $sth->finish;

  my ($source_id_to_external_name_href, $name_to_source_id_href);
  if( $self->mapper->can("set_source_id_to_external_name") ){
    ($source_id_to_external_name_href, $name_to_source_id_href) = $self->mapper->set_source_id_to_external_name (\%name_to_external_name);
  }
  else{
    ($source_id_to_external_name_href, $name_to_source_id_href) = $self->set_source_id_to_external_name (\%name_to_external_name);
  }

  my %source_id_to_external_name = %$source_id_to_external_name_href;
  my %name_to_source_id = %$name_to_source_id_href;


  my @precedence;
  my @regexps;
  # $ignore holds the ignore queries (label -> SQL); not to be confused
  # with %ignore above, which holds gene ids.
  my ($ignore, $precedence);
  if( $self->mapper->can("gene_description_sources") ){
    my @mapper_sources = $self->mapper->gene_description_sources();
    if (@mapper_sources == 1 && ref($mapper_sources[0]) eq 'ARRAY') {
      my $returned = $mapper_sources[0];
      if (ref($returned->[0]) eq 'ARRAY') {
        # [precedence, ignore], the convention used by this class
        ($precedence, $ignore) = @$returned;
        @precedence = @$precedence;
      }
      else {
        # a single array ref holding the source names
        @precedence = @$returned;
      }
    }
    else {
      # plain list of source names
      @precedence = @mapper_sources;
    }
  }
  else{
    ($precedence, $ignore) = @{$self->gene_description_sources()};
    @precedence = @$precedence;
  }

  if( $self->mapper->can("gene_description_filter_regexps") ){
    @regexps = $self->mapper->gene_description_filter_regexps();
  }
  else{
    @regexps = $self->gene_description_filter_regexps();
  }

  my $ins_p_sth = $xref_dbi->prepare("INSERT ignore into gene_desc_priority (source_id, priority) values(?, ?)");
  my $get_source_id_sth = $xref_dbi->prepare("select source_id from source where name like ?");
  my $list_sources_sth = $xref_dbi->prepare("select distinct name from gene_desc_priority d join source using(source_id) order by d.priority");

  # The lower the priority number the better
  my $i=0;
  foreach my $name (@precedence){
    $i++;
    $get_source_id_sth->execute($name);
    my $source_id;
    $get_source_id_sth->bind_columns(\$source_id);
    while($get_source_id_sth->fetch){
      $ins_p_sth->execute($source_id, $i);
    }
  }
  $ins_p_sth->finish;
  $get_source_id_sth->finish;


  $i = 0;
  if ($self->verbose) {
    print "Precedence for gene descriptions (1- best description)\n";
    $list_sources_sth->execute();
    my $source_name;
    $list_sources_sth->bind_columns(\$source_name);
    while ($list_sources_sth->fetch() ) {
      $i++;
      print "\t$i\t$source_name\n";
    }

  }

  $self->_apply_ignore($ignore, $xref_dbi);



#######################################################################
my $gene_desc_sql =(<<DXS);
select CASE ox.ensembl_object_type
 WHEN 'Gene' THEN gtt_gene.gene_id
 WHEN 'Transcript' THEN gtt_transcript.gene_id
 WHEN 'Translation' THEN gtt_translation.gene_id
 END AS d_gene_id,
 x.description AS description,
 s.source_id AS source_id,
 x.accession AS accession
from ( gene_desc_priority p
 join ( source s
 join ( xref x
 join ( object_xref ox
 join ( identity_xref ix
 ) using (object_xref_id)
 ) using (xref_id)
 ) using (source_id)
 ) using (source_id)
 )
 left join gene_transcript_translation gtt_gene
 on (gtt_gene.gene_id = ox.ensembl_id)
 left join gene_transcript_translation gtt_transcript
 on (gtt_transcript.transcript_id = ox.ensembl_id)
 left join gene_transcript_translation gtt_translation
 on (gtt_translation.translation_id = ox.ensembl_id)
where ox.ox_status = 'DUMP_OUT'
order by d_gene_id,
 ox.ensembl_object_type,
 p.priority,
 (ix.target_identity+ix.query_identity) desc

DXS

########################################################################

  # source ids whose descriptions get no "[Source:...]" suffix
  my %no_source_name_in_desc;
  if( $self->mapper->can("no_source_label_list") ){
    foreach my $name (@{$self->mapper->no_source_label_list()}){
      my $id = $name_to_source_id{$name};
      print "$name will not have [Source:...] info in desc\n";
      # A name without a known source id cannot match any row below
      next if !defined($id);
      $no_source_name_in_desc{$id} = 1;
    }
  }

  my $gene_desc_sth = $xref_dbi->prepare($gene_desc_sql);

  $gene_desc_sth->execute();
  my ($gene_id, $desc,$source_id,$label);
  $gene_desc_sth->bind_columns(\$gene_id, \$desc, \$source_id,\$label);

  my %gene_desc_updated;

  while($gene_desc_sth->fetch()){

    # The left joins yield a NULL gene id for object_xrefs whose
    # ensembl_id is not in gene_transcript_translation.
    next if !defined($gene_id);

    next if(exists($ignore{$gene_id}) || exists($gene_desc_updated{$gene_id}));

    if(defined($desc) ){
      my $filtered_desc = $self->filter_by_regexp($desc, \@regexps);
      if ($filtered_desc ne "") {
        if(!defined($no_source_name_in_desc{$source_id})){
          # A source without a known external name yields an empty name
          my $source_name = $source_id_to_external_name{$source_id};
          $source_name = "" if !defined($source_name);
          $filtered_desc .= " [Source:".$source_name.";Acc:".$label."]";
        }
        $update_gene_desc_sth->execute($filtered_desc,$gene_id);
        $gene_desc_updated{$gene_id} = 1;
      }
    }
  }
  $update_gene_desc_sth->finish;
  $gene_desc_sth->finish;
  print scalar(keys %gene_desc_updated) ." gene descriptions added\n";

  # reset the status to DUMP_OUT for object_xrefs that were ignored for the display_xref;

  my $reset_status_sth = $xref_dbi->prepare('UPDATE object_xref SET ox_status = "DUMP_OUT" where ox_status = "NO_DISPLAY"');
  $reset_status_sth->execute();
  $reset_status_sth->finish;


}
949 
# Removes every match of each pattern from a string.
#
#   $str     - the text to clean
#   $regexps - array ref of regular expressions (as strings)
#
# The patterns are applied one after another, in list order,
# case-insensitively and globally. Returns the cleaned string.
sub filter_by_regexp {
  my ($self, $str, $regexps) = @_;

  $str =~ s/$_//ig for @{$regexps};

  return $str;
}
961 
# Debugging aid: looks up the currently stored description of an object
# and prints an ERROR line when it differs from the new one.
#
#   $id   - id passed to the statement
#   $desc - the newly computed description
#   $sth  - prepared statement taking $id and returning one column, the
#           stored description
#   $type - text used in the message (e.g. the object type)
#
# A missing row or an undefined description is compared as the empty
# string.
sub check_desc{
  my $self = shift;
  my $id = shift;
  my $desc = shift;
  my $sth = shift;
  my $type = shift;

  $sth->execute($id);
  my $old_desc;
  $sth->bind_columns(\$old_desc);
  $sth->fetch;

  # No row fetched (or NULL stored) leaves $old_desc undefined
  $old_desc = "" if !defined($old_desc);
  $desc = "" if !defined($desc);

  if($old_desc ne $desc){
    print "ERROR: $type ($id) has different descriptions ??? \n\told:$old_desc \n\tnew:$desc\n";
  }
}
978 
979 
980 1;
XrefMapper::BasicMapper
Definition: BasicMapper.pm:8
accession
public accession()