RDKit
Open-source cheminformatics and machine learning.
SubstructureCache.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2014 Novartis Institutes for BioMedical Research
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #pragma once
12 #include <list>
13 #include <vector>
14 #include <string>
15 #include <stdexcept>
16 #include "../RDKitBase.h"
17 //#include "../Fingerprints/MorganFingerprints.h"
18 #include "Graph.h"
19 #include "Seed.h"
20 #include "DebugTrace.h" // algorithm filter definitions
21 
22 namespace RDKit {
23 namespace FMCS {
25  public:
26 #pragma pack(push, 1)
28  typedef unsigned long long TValue;
29  TValue Value{0};
30 
31  public:
33  };
34 #pragma pack(pop)
35 
36  struct HashKey {
38 
39  public:
40  void computeKey(const Seed& seed,
41  const std::vector<unsigned>& queryAtomLabels,
42  const std::vector<unsigned>& queryBondLabels) {
43  computeMorganCodeHash(seed, queryAtomLabels, queryBondLabels);
44  }
45 
46  private:
47  void computeMorganCodeHash(const Seed& seed,
48  const std::vector<unsigned>& queryAtomLabels,
49  const std::vector<unsigned>& queryBondLabels) {
50  size_t nv = seed.getNumAtoms();
51  size_t ne = seed.getNumBonds();
52  std::vector<unsigned long> currCodes(nv);
53  std::vector<unsigned long> prevCodes(nv);
54  size_t nIterations = seed.getNumBonds();
55  if (nIterations > 5) {
56  nIterations = 5;
57  }
58 
59  for (unsigned seedAtomIdx = 0; seedAtomIdx < seed.getNumAtoms();
60  seedAtomIdx++) {
61  currCodes[seedAtomIdx] =
62  queryAtomLabels[seed.MoleculeFragment.AtomsIdx[seedAtomIdx]];
63  }
64 
65  for (size_t iter = 0; iter < nIterations; iter++) {
66  for (size_t i = 0; i < nv; i++) {
67  prevCodes[i] = currCodes[i];
68  }
69 
70  for (size_t seedBondIdx = 0; seedBondIdx < ne; seedBondIdx++) {
71  const Bond* bond = seed.MoleculeFragment.Bonds[seedBondIdx];
72  unsigned order =
73  queryBondLabels[seed.MoleculeFragment.BondsIdx[seedBondIdx]];
74  unsigned atom1 = seed.MoleculeFragment.SeedAtomIdxMap
75  .find(bond->getBeginAtomIdx())
76  ->second;
77  unsigned atom2 =
78  seed.MoleculeFragment.SeedAtomIdxMap.find(bond->getEndAtomIdx())
79  ->second;
80  unsigned v1 = prevCodes[atom1];
81  unsigned v2 = prevCodes[atom2];
82 
83  currCodes[atom1] += v2 * v2 + (v2 + 23) * (order + 1721);
84  currCodes[atom2] += v1 * v1 + (v1 + 23) * (order + 1721);
85  }
86  }
87 
88  KeyNumericMetrics::TValue result = 0;
89  for (unsigned seedAtomIdx = 0; seedAtomIdx < nv; seedAtomIdx++) {
90  unsigned long code = currCodes[seedAtomIdx];
91  result += code * (code + 6849) + 29;
92  }
93 
94  NumericMetrics.Value = result;
95  }
96 
97  // not implemented yet:
98  /*
99  void computeFingerprint(const Seed& seed)
100  {
101  unsigned int radius = seed.getNumBonds();
102  if (radius > 5)
103  radius = 5;
104  ExplicitBitVect *mf =
105  RDKit::MorganFingerprints::getFingerprintAsBitVect(seed.GraphTopology,
106  radius); //SLOW !!!
107  // ...
108  delete mf;
109  NumericMetrics.Field.hasFingerprint = 1;
110  }
111  */
112  };
113 
114  typedef HashKey TKey;
115  typedef std::list<FMCS::Graph> TIndexEntry; // hash-key is not unique key
116  private:
117  std::vector<TIndexEntry> ValueStorage;
118  std::map<KeyNumericMetrics::TValue, size_t> NumericIndex; // TIndexEntry
119  public:
120  // returns computed key, and pointer to index entry with a set of subgraphs
121  // corresponding to the key if found.
122  // then caller must find exactly matched subgraph in the result set with own
123  // search algorithm,
124  // including a resolving of collisions of hash key
126  const std::vector<unsigned>& queryAtomLabels,
127  const std::vector<unsigned>& queryBondLabels,
128  TKey& key) { // compute key and find entry
129  key.computeKey(seed, queryAtomLabels, queryBondLabels);
130  std::map<KeyNumericMetrics::TValue, size_t>::const_iterator entryit =
131  NumericIndex.find(key.NumericMetrics.Value);
132  if (NumericIndex.end() != entryit) {
133  return &ValueStorage[entryit->second];
134  }
135  return nullptr; // not found
136  }
137 
138  // if find() did not found any entry for this key of seed a new entry will be
139  // created
140  void add(const Seed& seed, TKey& key,
141  TIndexEntry* entry) { // "compute" value and store it in NEW entry
142  // if not found
143  if (!entry) {
144  try {
145  ValueStorage.emplace_back();
146  } catch (...) {
147  return; // not enough memory room to add the item, but it's just a
148  // cache
149  }
150  entry = &ValueStorage.back();
151  }
152  entry->push_back(seed.Topology);
153 
154  if (!NumericIndex
155  .insert(std::pair<KeyNumericMetrics::TValue, size_t>(
156  key.NumericMetrics.Value, ValueStorage.size() - 1))
157  .second) {
158  return; // not enough memory room to add the item, but it is just cache
159  }
160  }
161 
162  size_t keyssize() const { // for statistics only
163  return ValueStorage.size();
164  }
165 
166  size_t fullsize() const { // for statistics only
167  size_t n = 0;
168  for (std::vector<TIndexEntry>::const_iterator e = ValueStorage.begin();
169  e != ValueStorage.end(); e++) {
170  n += e->size();
171  }
172  return n;
173  }
174 };
175 } // namespace FMCS
176 } // namespace RDKit
std::list< FMCS::Graph > TIndexEntry
void add(const Seed &seed, TKey &key, TIndexEntry *entry)
TIndexEntry * find(const Seed &seed, const std::vector< unsigned > &queryAtomLabels, const std::vector< unsigned > &queryBondLabels, TKey &key)
#define RDKIT_FMCS_EXPORT
Definition: export.h:145
const uint32_t seed
Definition: MHFP.h:29
Std stuff.
Definition: Abbreviations.h:18
void computeKey(const Seed &seed, const std::vector< unsigned > &queryAtomLabels, const std::vector< unsigned > &queryBondLabels)