RDKit
Open-source cheminformatics and machine learning.
SubstructLibrary.h
Go to the documentation of this file.
1 // Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
2 // and other RDKit contributors
3 //
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are
8 // met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following
14 // disclaimer in the documentation and/or other materials provided
15 // with the distribution.
16 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
17 // nor the names of its contributors may be used to endorse or promote
18 // products derived from this software without specific prior written
19 // permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 //
33 #ifndef RDK_SUBSTRUCT_LIBRARY
34 #define RDK_SUBSTRUCT_LIBRARY
35 #include <utility>
36 
37 #include <RDGeneral/export.h>
38 #include <GraphMol/RDKitBase.h>
39 #include <GraphMol/MolPickler.h>
40 #include <GraphMol/MolBundle.h>
46 #include <DataStructs/BitOps.h>
47 #include <GraphMol/MolOps.h>
49 
50 #include <algorithm>
51 #include <string>
52 #include <boost/lexical_cast.hpp>
53 
54 namespace RDKit {
55 
57 
58 //! Base class API for holding molecules to substructure search.
59 /*!
60  This is an API that hides the implementation details used for
61  indexing molecules for substructure searching. It simply
62  provides an API for adding and getting molecules from a set.
63  */
65  public:
66  virtual ~MolHolderBase() {}
67 
68  //! Add a new molecule to the substructure search library
69  //! Returns the molecules index in the library
70  virtual unsigned int addMol(const ROMol &m) = 0;
71 
72  // implementations should throw IndexError on out of range
73  virtual boost::shared_ptr<ROMol> getMol(unsigned int) const = 0;
74 
75  //! Get the current library size
76  virtual unsigned int size() const = 0;
77 };
78 
79 //! Concrete class that holds molecules in memory
80 /*!
81  This is currently one of the faster implementations.
82  However it is very memory intensive.
83 */
85  std::vector<boost::shared_ptr<ROMol>> mols;
86 
87  public:
88  MolHolder() : MolHolderBase(), mols() {}
89 
90  unsigned int addMol(const ROMol &m) override {
91  mols.push_back(boost::make_shared<ROMol>(m));
92  return size() - 1;
93  }
94 
95  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
96  if (idx >= mols.size()) {
97  throw IndexErrorException(idx);
98  }
99  return mols[idx];
100  }
101 
102  unsigned int size() const override {
103  return rdcast<unsigned int>(mols.size());
104  }
105 
106  std::vector<boost::shared_ptr<ROMol>> &getMols() { return mols; }
107  const std::vector<boost::shared_ptr<ROMol>> &getMols() const { return mols; }
108 };
109 
110 //! Concrete class that holds binary cached molecules in memory
111 /*!
112  This implementation uses quite a bit less memory than the
113  non cached implementation. However, due to the reduced speed
114  it should be used in conjunction with a pattern fingerprinter.
115 
116  See RDKit::FPHolder
117 */
119  std::vector<std::string> mols;
120 
121  public:
123 
124  unsigned int addMol(const ROMol &m) override {
125  mols.emplace_back();
126  MolPickler::pickleMol(m, mols.back());
127  return size() - 1;
128  }
129 
130  //! Adds a pickled binary molecule, no validity checking of the input
131  //! is done.
132  unsigned int addBinary(const std::string &pickle) {
133  mols.push_back(pickle);
134  return size() - 1;
135  }
136 
137  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
138  if (idx >= mols.size()) {
139  throw IndexErrorException(idx);
140  }
141  boost::shared_ptr<ROMol> mol(new ROMol);
142  MolPickler::molFromPickle(mols[idx], mol.get());
143  return mol;
144  }
145 
146  unsigned int size() const override {
147  return rdcast<unsigned int>(mols.size());
148  }
149 
150  std::vector<std::string> &getMols() { return mols; }
151  const std::vector<std::string> &getMols() const { return mols; }
152 };
153 
154 //! Concrete class that holds smiles strings in memory
155 /*!
156  This implementation uses quite a bit less memory than the
157  cached binary or uncached implementation. However, due to the
158  reduced speed it should be used in conjunction with a pattern
159  fingerprinter.
160 
161  See RDKit::FPHolder
162 */
164  : public MolHolderBase {
165  std::vector<std::string> mols;
166 
167  public:
169 
170  unsigned int addMol(const ROMol &m) override {
171  bool doIsomericSmiles = true;
172  mols.push_back(MolToSmiles(m, doIsomericSmiles));
173  return size() - 1;
174  }
175 
176  //! Add a smiles to the dataset, no validation is done
177  //! to the inputs.
178  unsigned int addSmiles(const std::string &smiles) {
179  mols.push_back(smiles);
180  return size() - 1;
181  }
182 
183  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
184  if (idx >= mols.size()) {
185  throw IndexErrorException(idx);
186  }
187 
188  boost::shared_ptr<ROMol> mol(SmilesToMol(mols[idx]));
189  return mol;
190  }
191 
192  unsigned int size() const override {
193  return rdcast<unsigned int>(mols.size());
194  }
195 
196  std::vector<std::string> &getMols() { return mols; }
197  const std::vector<std::string> &getMols() const { return mols; }
198 };
199 
200 //! Concrete class that holds trusted smiles strings in memory
201 /*!
202  A trusted smiles is essentially a smiles string that
203  RDKit has generated. This indicates that fewer
204  sanitization steps are required. See
205  http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
206 
207  This implementation uses quite a bit less memory than the
208  cached binary or uncached implementation. However, due to the
209  reduced speed it should be used in conjunction with a pattern
210  fingerprinter.
211 
212  See RDKit::FPHolder
213 */
215  : public MolHolderBase {
216  std::vector<std::string> mols;
217 
218  public:
220 
221  unsigned int addMol(const ROMol &m) override {
222  bool doIsomericSmiles = true;
223  mols.push_back(MolToSmiles(m, doIsomericSmiles));
224  return size() - 1;
225  }
226 
227  //! Add a smiles to the dataset, no validation is done
228  //! to the inputs.
229  unsigned int addSmiles(const std::string &smiles) {
230  mols.push_back(smiles);
231  return size() - 1;
232  }
233 
234  boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
235  if (idx >= mols.size()) {
236  throw IndexErrorException(idx);
237  }
238 
239  RWMol *m = SmilesToMol(mols[idx], 0, false);
240  if (m) {
241  m->updatePropertyCache();
242  }
243  return boost::shared_ptr<ROMol>(m);
244  }
245 
246  unsigned int size() const override {
247  return rdcast<unsigned int>(mols.size());
248  }
249 
250  std::vector<std::string> &getMols() { return mols; }
251  const std::vector<std::string> &getMols() const { return mols; }
252 };
253 
254 //! Base FPI for the fingerprinter used to rule out impossible matches
256  std::vector<ExplicitBitVect *> fps;
257 
258  public:
259  virtual ~FPHolderBase() {
260  for (size_t i = 0; i < fps.size(); ++i) {
261  delete fps[i];
262  }
263  }
264 
265  virtual unsigned int size() const { return rdcast<unsigned int>(fps.size()); }
266 
267  //! Adds a molecule to the fingerprinter
268  unsigned int addMol(const ROMol &m) {
269  fps.push_back(makeFingerprint(m));
270  return rdcast<unsigned int>(fps.size() - 1);
271  }
272 
273  //! Adds a raw bit vector pointer to the fingerprinter, which takes ownership
274  //! PLEASE NOTE: make sure that the passed ExplicitBitVect
275  //! is compatible with the one generated by makeFingerprint()
276  unsigned int addFingerprint(ExplicitBitVect *v) {
277  fps.push_back(v);
278  return rdcast<unsigned int>(fps.size() - 1);
279  }
280 
281  //! Adds a raw bit vector to the fingerprinter
282  //! PLEASE NOTE: make sure that the passed ExplicitBitVect
283  //! is compatible with the one generated by makeFingerprint()
284  unsigned int addFingerprint(const ExplicitBitVect &v) {
285  return addFingerprint(new ExplicitBitVect(v));
286  }
287 
288  //! Return false if a substructure search can never match the molecule
289  bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
290  if (idx >= fps.size()) {
291  throw IndexErrorException(idx);
292  }
293 
294  return AllProbeBitsMatch(query, *fps[idx]);
295  }
296 
297  //! Get the bit vector at the specified index (throws IndexError if out of
298  //! range)
299  const ExplicitBitVect &getFingerprint(unsigned int idx) const {
300  if (idx >= fps.size()) {
301  throw IndexErrorException(idx);
302  }
303  return *fps[idx];
304  }
305 
306  //! make the query vector
307  //! Caller owns the vector!
308  virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
309 
310  std::vector<ExplicitBitVect *> &getFingerprints() { return fps; }
311  const std::vector<ExplicitBitVect *> &getFingerprints() const { return fps; }
312 };
313 
314 //! Uses the pattern fingerprinter with a user-defined number of bits (default:
315 //! 2048) to rule out matches
317  unsigned int numBits;
318 
319  public:
320  PatternHolder() : FPHolderBase(), numBits(defaultNumBits()) {}
321  PatternHolder(unsigned int numBits) : FPHolderBase(), numBits(numBits) {}
322  //! Caller owns the vector!
323  ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
324  return PatternFingerprintMol(m, numBits);
325  }
326  const unsigned int &getNumBits() const { return numBits; };
327  unsigned int &getNumBits() { return numBits; };
//! Fingerprint length used when none is given explicitly (2048).
static unsigned int defaultNumBits() {
  constexpr unsigned int kDefaultNumBits = 2048;
  return kDefaultNumBits;
}
332 };
333 
335  : public PatternHolder {
336  public:
338  TautomerPatternHolder(unsigned int numBits) : PatternHolder(numBits) {}
339  ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
340  std::vector<unsigned int> *atomCounts = nullptr;
341  ExplicitBitVect *setOnlyBits = nullptr;
342  const bool tautomericFingerprint = true;
343  return PatternFingerprintMol(m, getNumBits(), atomCounts, setOnlyBits,
344  tautomericFingerprint);
345  }
346 };
347 
349  public:
350  virtual ~KeyHolderBase() {}
351 
352  //! Add a key to the database getting it from the molecule
353  virtual unsigned int addMol(const ROMol &m) = 0;
354 
355  //! Add a key to the database, this needs to be in the same order
356  //! as the molecule, no validation is done
357  virtual unsigned int addKey(const std::string &) = 0;
358 
359  //! get the key at the requested index
360  // implementations should throw IndexError on out of range
361  virtual const std::string &getKey(unsigned int) const = 0;
362 
363  //! get keys from a bunch of indices
364  virtual std::vector<std::string> getKeys(
365  const std::vector<unsigned int> &indices) const = 0;
366  //! Get the current keyholder size
367  virtual unsigned int size() const = 0;
368 };
369 
371  std::string propname;
372  std::vector<std::string> keys;
373  const std::string empty_string = {};
374 
375  public:
376  KeyFromPropHolder(const std::string &propname = "_Name")
377  : propname(propname) {}
378 
379  std::string &getPropName() { return propname; }
380  const std::string &getPropName() const { return propname; }
381 
382  std::vector<std::string> &getKeys() { return keys; }
383  const std::vector<std::string> &getKeys() const { return keys; }
384 
385  unsigned int addMol(const ROMol &m) override {
386  std::string key;
387  if (m.getPropIfPresent(propname, key)) {
388  keys.push_back(std::move(key));
389  } else {
390  // XXX is this a warning? it could be verbose. Should we push back the
391  // string repr of the
392  // numeric index?
393  const static std::string prefix("LIBIDX-");
394  keys.emplace_back(prefix + boost::lexical_cast<std::string>(keys.size()));
395  }
396  return keys.size() - 1u;
397  };
398 
399  unsigned int addKey(const std::string &key) override {
400  keys.push_back(key);
401  return keys.size() - 1u;
402  }
403 
404  const std::string &getKey(unsigned int idx) const override {
405  if (idx >= keys.size()) {
406  throw IndexErrorException(idx);
407  }
408  return keys[idx];
409  }
410 
411  std::vector<std::string> getKeys(
412  const std::vector<unsigned int> &indices) const override {
413  std::vector<std::string> res;
414  std::transform(indices.begin(), indices.end(), std::back_inserter(res),
415  [=](unsigned idx) { return keys.at(idx); });
416  return res;
417  }
418  unsigned int size() const override { return keys.size(); }
419 };
420 
421 //! Substructure Search a library of molecules
422 /*! This class allows for multithreaded substructure searches of
423  large datasets.
424 
425  The implementations can use fingerprints to speed up searches
426  and have molecules cached as binary forms to reduce memory
427  usage.
428 
429  basic usage:
430  \code
431  SubstructLibrary lib;
432  lib.addMol(mol);
433  std::vector<unsigned int> results = lib.getMatches(query);
434  for(std::vector<unsigned int>::const_iterator matchIndex=results.begin();
435  matchIndex != results.end();
436  ++matchIndex) {
437  boost::shared_ptr<ROMol> match = lib.getMol(*matchIndex);
438  }
439  \endcode
440 
441  Using different mol holders and pattern fingerprints.
442 
443  \code
444  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
445  boost::make_shared<CachedTrustedSmilesMolHolder>();
446  boost::shared_ptr<PatternHolder> patternHolder = \
447  boost::make_shared<PatternHolder>();
448 
449  SubstructLibrary lib(molHolder, patternHolder);
450  lib.addMol(mol);
451  \endcode
452 
453  Cached molecule holders create molecules on demand. There are currently
454  three styles of cached molecules.
455 
456  CachedMolHolder: stores molecules in the rdkit binary format.
457  CachedSmilesMolHolder: stores molecules in smiles format.
458  CachedTrustedSmilesMolHolder: stores molecules in smiles format.
459 
460  The CachedTrustedSmilesMolHolder is made to add molecules from
461  a trusted source. This makes the basic assumption that RDKit was
462  used to sanitize and canonicalize the smiles string. In practice
463  this is considerably faster than using arbitrary smiles strings since
464  certain assumptions can be made. Molecules generated from trusted
465  smiles do not have ring information (although this is created
466  in the molecule being searched if necessary).
467 
468  When loading from external data, as opposed to using the "addMol" API,
469  care must be taken to ensure that the pattern fingerprints and smiles
470  are synchronized.
471 
472  Each pattern holder has an API point for making its fingerprint. This
473  is useful to ensure that the pattern stored in the database will be
474  compatible with the patterns made when analyzing queries.
475 
476  \code
477  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
478  boost::make_shared<CachedTrustedSmilesMolHolder>();
479  boost::shared_ptr<PatternHolder> patternHolder = \
480  boost::make_shared<PatternHolder>();
481 
482  // the PatternHolder instance is able to make fingerprints.
483  // These, of course, can be read from a file. For demonstration
484  // purposes we construct them here.
485  const std::string trustedSmiles = "c1ccccc1";
486  ROMol *m = SmilesToMol(trustedSmiles);
487  const ExplicitBitVect *bitVector = patternHolder->makeFingerprint(*m);
488 
489  // The trusted smiles and bitVector can be read from any source.
490  // This is the fastest way to load a substruct library.
491  molHolder->addSmiles( trustedSmiles );
492  patternHolder->addFingerprint( *bitVector );
493  SubstructLibrary lib(molHolder, patternHolder);
494  delete m;
495  delete bitVector;
496  \endcode
497 
498  Finally, using the KeyFromPropHolder will store user ids or keys.
499  By default, it uses RDKit's default _Name prop, but can be changed
500  to any property.
501 
502  \code
503  boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
504  boost::make_shared<CachedTrustedSmilesMolHolder>();
505  boost::shared_ptr<KeyFromPropHolder> keyHolder = \
506  boost::make_shared<KeyFromPropHolder>();
507  SubstructLibrary lib(molHolder, keyHolder);
508  ...
509 
510  Keys can be fetched one at a time or in bulk through the key holder:
511  auto key = lib.getKeys().getKey(idx);
512  auto keys = lib.getKeys().getKeys(lib.getMatches(query));
513  \endcode
514 
515 */
517  boost::shared_ptr<MolHolderBase> molholder;
518  boost::shared_ptr<FPHolderBase> fpholder;
519  boost::shared_ptr<KeyHolderBase> keyholder;
520 
521  MolHolderBase *mols; // used for a small optimization
522  FPHolderBase *fps{nullptr};
523  bool is_tautomerquery = false;
524  std::vector<unsigned int> searchOrder;
525 
526  public:
528  : molholder(new MolHolder),
529  fpholder(),
530  keyholder(),
531  mols(molholder.get()) {}
532 
533  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
534  : molholder(std::move(molecules)),
535  fpholder(),
536  keyholder(),
537  mols(molholder.get()),
538  fps(nullptr) {}
539 
540  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
541  boost::shared_ptr<FPHolderBase> fingerprints)
542  : molholder(std::move(molecules)),
543  fpholder(std::move(fingerprints)),
544  keyholder(),
545  mols(molholder.get()),
546  fps(fpholder.get()) {
547  if (fpholder.get() &&
548  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
549  is_tautomerquery = true;
550  }
551  }
552 
553  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
554  boost::shared_ptr<KeyHolderBase> keys)
555  : molholder(std::move(molecules)),
556  fpholder(),
557  keyholder(std::move(keys)),
558  mols(molholder.get()),
559  fps(nullptr) {
560  if (fpholder.get() &&
561  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
562  is_tautomerquery = true;
563  }
564  }
565 
566  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
567  boost::shared_ptr<FPHolderBase> fingerprints,
568  boost::shared_ptr<KeyHolderBase> keys)
569  : molholder(std::move(molecules)),
570  fpholder(std::move(fingerprints)),
571  keyholder(std::move(keys)),
572  mols(molholder.get()),
573  fps(fpholder.get()) {
574  if (fpholder.get() &&
575  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
576  is_tautomerquery = true;
577  }
578  }
579 
580  SubstructLibrary(const std::string &pickle)
581  : molholder(new MolHolder),
582  fpholder(),
583  mols(molholder.get()),
584  fps(nullptr) {
585  initFromString(pickle);
586  if (fpholder.get() &&
587  dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
588  is_tautomerquery = true;
589  }
590  }
591 
592  //! Get the underlying molecule holder implementation
593  boost::shared_ptr<MolHolderBase> &getMolHolder() { return molholder; }
594 
595  const boost::shared_ptr<MolHolderBase> &getMolHolder() const {
596  return molholder;
597  }
598 
599  //! Get the underlying molecule holder implementation
600  boost::shared_ptr<FPHolderBase> &getFpHolder() { return fpholder; }
601 
602  //! Get the underlying molecule holder implementation
603  const boost::shared_ptr<FPHolderBase> &getFpHolder() const {
604  return fpholder;
605  }
606 
607  //! Get the underlying molecule holder implementation
608  boost::shared_ptr<KeyHolderBase> &getKeyHolder() { return keyholder; }
609 
610  //! Get the underlying molecule holder implementation
611  const boost::shared_ptr<KeyHolderBase> &getKeyHolder() const {
612  return keyholder;
613  }
614 
615  const MolHolderBase &getMolecules() const {
616  PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
617  return *mols;
618  }
619 
620  //! Get the underlying fingerprint implementation.
621  /*! Throws a value error if no fingerprints have been set */
623  if (!fps) {
624  throw ValueErrorException("Substruct Library does not have fingerprints");
625  }
626  return *fps;
627  }
628 
629  const FPHolderBase &getFingerprints() const {
630  if (!fps) {
631  throw ValueErrorException("Substruct Library does not have fingerprints");
632  }
633  return *fps;
634  }
635 
636  //! Get the underlying key holder implementation.
637  /*! Throws a value error if no keyholder have been set */
639  if (!keyholder.get()) {
640  throw ValueErrorException("Substruct Library does not have fingerprints");
641  }
642  return *keyholder.get();
643  }
644 
645  //! Get the underlying key holder implementation.
646  /*! Throws a value error if no keyholder have been set */
647  const KeyHolderBase &getKeys() const {
648  if (!keyholder.get()) {
649  throw ValueErrorException("Substruct Library does not have fingerprints");
650  }
651  return *keyholder.get();
652  }
653 
654  //! Add a molecule to the library
655  /*!
656  \param mol Molecule to add
657 
658  returns index for the molecule in the library
659  */
660  unsigned int addMol(const ROMol &mol);
661 
662  //! Get the matching indices for the query
663  /*!
664  \param query Query or Tautomer Query to match against molecules
665  \param recursionPossible flags whether or not recursive matches are allowed
666  [default true]
667  \param useChirality use atomic CIP codes as part of the comparison
668  [default true]
669  \param useQueryQueryMatches if set, the contents of atom and bond queries
670  will be used as part of the matching
671  [default false]
672  \param numThreads If -1 use all available processors [default -1]
673  \param maxResults Maximum results to return, -1 means return all
674  [default -1]
675  */
676  template <class Query>
677  std::vector<unsigned int> getMatches(const Query &query,
678  bool recursionPossible = true,
679  bool useChirality = true,
680  bool useQueryQueryMatches = false,
681  int numThreads = -1,
682  int maxResults = -1) const {
684  params.recursionPossible = recursionPossible;
685  params.useChirality = useChirality;
686  params.useQueryQueryMatches = useQueryQueryMatches;
687  return getMatches(query, 0, size(), params, numThreads, maxResults);
688  }
689  //! overload
690  template <class Query>
691  std::vector<unsigned int> getMatches(const Query &query,
692  const SubstructMatchParameters &params,
693  int numThreads = -1,
694  int maxResults = -1) const {
695  return getMatches(query, 0, size(), params, numThreads, maxResults);
696  }
697  //! Get the matching indices for the query between the given indices
698  /*!
699  \param query Query to match against molecules
700  \param startIdx Start index of the search
701  \param endIdx Ending idx (non-inclusive) of the search.
702  \param recursionPossible flags whether or not recursive matches are allowed
703  [default true]
704  \param useChirality use atomic CIP codes as part of the comparison
705  [default true]
706  \param useQueryQueryMatches if set, the contents of atom and bond queries
707  will be used as part of the matching
708  [default false]
709  \param numThreads If -1 use all available processors [default -1]
710  \param maxResults Maximum results to return, -1 means return all
711  [default -1]
712  */
713  template <class Query>
714  std::vector<unsigned int> getMatches(
715  const Query &query, unsigned int startIdx, unsigned int endIdx,
716  bool recursionPossible = true, bool useChirality = true,
717  bool useQueryQueryMatches = false, int numThreads = -1,
718  int maxResults = -1) const {
720  params.recursionPossible = recursionPossible;
721  params.useChirality = useChirality;
722  params.useQueryQueryMatches = useQueryQueryMatches;
723  return getMatches(query, startIdx, endIdx, params, numThreads, maxResults);
724  };
725  //! overload
726  std::vector<unsigned int> getMatches(const ROMol &query,
727  unsigned int startIdx,
728  unsigned int endIdx,
729  const SubstructMatchParameters &params,
730  int numThreads = -1,
731  int maxResults = -1) const;
732  //! overload
733  std::vector<unsigned int> getMatches(const MolBundle &query,
734  unsigned int startIdx,
735  unsigned int endIdx,
736  const SubstructMatchParameters &params,
737  int numThreads = -1,
738  int maxResults = -1) const;
739  //! overload
740  std::vector<unsigned int> getMatches(const TautomerQuery &query,
741  unsigned int startIdx,
742  unsigned int endIdx,
743  const SubstructMatchParameters &params,
744  int numThreads = -1,
745  int maxResults = -1) const;
746 
747  //! Return the number of matches for the query
748  /*!
749  \param query Molecule or Tautomer Query to match against molecules
750  \param recursionPossible flags whether or not recursive matches are allowed
751  [default true]
752  \param useChirality use atomic CIP codes as part of the comparison
753  [default true]
754  \param useQueryQueryMatches if set, the contents of atom and bond queries
755  will be used as part of the matching
756  [default false]
757  \param numThreads If -1 use all available processors [default -1]
758  */
759  template <class Query>
760  unsigned int countMatches(const Query &query, bool recursionPossible = true,
761  bool useChirality = true,
762  bool useQueryQueryMatches = false,
763  int numThreads = -1) const {
765  params.recursionPossible = recursionPossible;
766  params.useChirality = useChirality;
767  params.useQueryQueryMatches = useQueryQueryMatches;
768  return countMatches(query, 0, size(), params, numThreads);
769  }
770  //! overload
771  template <class Query>
772  unsigned int countMatches(const Query &query,
773  const SubstructMatchParameters &params,
774  int numThreads = -1) const {
775  return countMatches(query, 0, size(), params, numThreads);
776  }
777 
778  //! Return the number of matches for the query
779 
780  //! Return the number of matches for the query between the given indices
781  /*!
782  \param query Query to match against molecules
783  \param startIdx Start index of the search
784  \param endIdx Ending idx (non-inclusive) of the search.
785  \param recursionPossible flags whether or not recursive matches are allowed
786  [default true]
787  \param useChirality use atomic CIP codes as part of the comparison
788  [default true]
789  \param useQueryQueryMatches if set, the contents of atom and bond queries
790  will be used as part of the matching
791  [default false]
792  \param numThreads If -1 use all available processors [default -1]
793  */
794  template <class Query>
795  unsigned int countMatches(const Query &query, unsigned int startIdx,
796  unsigned int endIdx, bool recursionPossible = true,
797  bool useChirality = true,
798  bool useQueryQueryMatches = false,
799  int numThreads = -1) const {
801  params.recursionPossible = recursionPossible;
802  params.useChirality = useChirality;
803  params.useQueryQueryMatches = useQueryQueryMatches;
804  return countMatches(query, startIdx, endIdx, params, numThreads);
805  };
806 
807  //! overload
808  unsigned int countMatches(const ROMol &query, unsigned int startIdx,
809  unsigned int endIdx,
810  const SubstructMatchParameters &params,
811  int numThreads = -1) const;
812  //! overload
813  unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx,
814  unsigned int endIdx,
815  const SubstructMatchParameters &params,
816  int numThreads = -1) const;
817  //! overload
818  unsigned int countMatches(const MolBundle &query, unsigned int startIdx,
819  unsigned int endIdx,
820  const SubstructMatchParameters &params,
821  int numThreads = -1) const;
822 
823  //! Returns true if any match exists for the query
824  /*!
825  \param query Molecule or Tautomer Query to match against molecules
826  \param recursionPossible flags whether or not recursive matches are allowed
827  [default true]
828  \param useChirality use atomic CIP codes as part of the comparison
829  [default true]
830  \param useQueryQueryMatches if set, the contents of atom and bond queries
831  will be used as part of the matching
832  [default false]
833  \param numThreads If -1 use all available processors [default -1]
834  */
835  template <class Query>
836  bool hasMatch(const Query &query, bool recursionPossible = true,
837  bool useChirality = true, bool useQueryQueryMatches = false,
838  int numThreads = -1) const {
840  params.recursionPossible = recursionPossible;
841  params.useChirality = useChirality;
842  params.useQueryQueryMatches = useQueryQueryMatches;
843  return hasMatch(query, 0, size(), params, numThreads);
844  }
845  //! overload
846  template <class Query>
847  bool hasMatch(const Query &query, const SubstructMatchParameters &params,
848  int numThreads = -1) const {
849  return hasMatch(query, 0, size(), params, numThreads);
850  }
851  //! Returns true if any match exists for the query between the specified
852  //! indices
853  /*!
854  \param query Query to match against molecules
855  \param startIdx Start index of the search
856  \param endIdx Ending idx (non-inclusive) of the search.
857  \param recursionPossible flags whether or not recursive matches are
858  allowed [default true] \param useChirality use atomic CIP codes as part
859  of the comparison [default true] \param useQueryQueryMatches if set, the
860  contents of atom and bond queries will be used as part of the matching
861  [default false]
862  \param numThreads If -1 use all available processors [default -1]
863  */
864  template <class Query>
865  bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx,
866  bool recursionPossible = true, bool useChirality = true,
867  bool useQueryQueryMatches = false, int numThreads = -1) const {
869  params.recursionPossible = recursionPossible;
870  params.useChirality = useChirality;
871  params.useQueryQueryMatches = useQueryQueryMatches;
872  return hasMatch(query, startIdx, endIdx, params, numThreads);
873  };
874  //! overload
875  bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx,
876  const SubstructMatchParameters &params,
877  int numThreads = -1) const;
878  //! overload
879  bool hasMatch(const TautomerQuery &query, unsigned int startIdx,
880  unsigned int endIdx, const SubstructMatchParameters &params,
881  int numThreads = -1) const;
882  //! overload
883  bool hasMatch(const MolBundle &query, unsigned int startIdx,
884  unsigned int endIdx, const SubstructMatchParameters &params,
885  int numThreads = -1) const;
886  //! Returns the molecule at the given index
887  /*!
888  \param idx Index of the molecule in the library (n.b. could contain
889  null)
890  */
891  boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
892  // expects implementation to throw IndexError if out of range
893  PRECONDITION(mols, "molholder is null in SubstructLibrary");
894  return mols->getMol(idx);
895  }
896 
897  //! Returns the molecule at the given index
898  /*!
899  \param idx Index of the molecule in the library (n.b. could contain
900  null)
901  */
902  boost::shared_ptr<ROMol> operator[](unsigned int idx) {
903  // expects implementation to throw IndexError if out of range
904  PRECONDITION(mols, "molholder is null in SubstructLibrary");
905  return mols->getMol(idx);
906  }
907 
908  //! return the number of molecules in the library
909  unsigned int size() const {
910  PRECONDITION(mols, "molholder is null in SubstructLibrary");
911  return rdcast<unsigned int>(molholder->size());
912  }
913 
914  //! does error checking
915  void setSearchOrder(const std::vector<unsigned int> &order) {
916  for (const auto idx : order) {
917  if (idx >= mols->size()) {
918  throw IndexErrorException(idx);
919  }
920  }
921  searchOrder = order;
922  }
923 
924  const std::vector<unsigned int> &getSearchOrder() const {
925  return searchOrder;
926  }
927 
928  std::vector<unsigned int> &getSearchOrder() { return searchOrder; }
929  //! access required for serialization
930  void resetHolders() {
931  is_tautomerquery = false;
932  mols = molholder.get();
933  fps = fpholder.get();
934  if (fps && dynamic_cast<TautomerPatternHolder *>(fps) != nullptr) {
935  is_tautomerquery = true;
936  }
937  }
938 
939  //! serializes (pickles) to a stream
940  void toStream(std::ostream &ss) const;
941  //! returns a string with a serialized (pickled) representation
942  std::string Serialize() const;
943  //! initializes from a stream pickle
944  void initFromStream(std::istream &ss);
945  //! initializes from a string pickle
946  void initFromString(const std::string &text);
947 };
948 } // namespace RDKit
949 
951 #endif
Contains general bit-comparison and similarity operations.
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
Defines a class for managing bundles of molecules.
pulls in the core RDKit functionality
a class for bit vectors that are densely occupied
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:20
Concrete class that holds binary cached molecules in memory.
unsigned int size() const override
Get the current library size.
const std::vector< std::string > & getMols() const
unsigned int addMol(const ROMol &m) override
std::vector< std::string > & getMols()
unsigned int addBinary(const std::string &pickle)
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
Concrete class that holds smiles strings in memory.
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
const std::vector< std::string > & getMols() const
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
unsigned int addMol(const ROMol &m) override
unsigned int size() const override
Get the current library size.
Concrete class that holds trusted smiles strings in memory.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
unsigned int addMol(const ROMol &m) override
unsigned int size() const override
Get the current library size.
const std::vector< std::string > & getMols() const
Base FPI for the fingerprinter used to rule out impossible matches.
const ExplicitBitVect & getFingerprint(unsigned int idx) const
unsigned int addMol(const ROMol &m)
Adds a molecule to the fingerprinter.
virtual unsigned int size() const
std::vector< ExplicitBitVect * > & getFingerprints()
virtual ExplicitBitVect * makeFingerprint(const ROMol &m) const =0
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const
Return false if a substructure search can never match the molecule.
unsigned int addFingerprint(ExplicitBitVect *v)
const std::vector< ExplicitBitVect * > & getFingerprints() const
unsigned int addFingerprint(const ExplicitBitVect &v)
const std::vector< std::string > & getKeys() const
KeyFromPropHolder(const std::string &propname="_Name")
std::vector< std::string > & getKeys()
unsigned int addKey(const std::string &key) override
unsigned int size() const override
Get the current keyholder size.
std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const override
unsigned int addMol(const ROMol &m) override
Add a key to the database getting it from the molecule.
const std::string & getKey(unsigned int idx) const override
const std::string & getPropName() const
virtual const std::string & getKey(unsigned int) const =0
virtual std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const =0
virtual unsigned int addMol(const ROMol &m)=0
Add a key to the database getting it from the molecule.
virtual unsigned int size() const =0
Get the current keyholder size.
virtual unsigned int addKey(const std::string &)=0
MolBundle contains a collection of related ROMols.
Definition: MolBundle.h:39
Base class API for holding molecules to substructure search.
virtual unsigned int addMol(const ROMol &m)=0
virtual unsigned int size() const =0
Get the current library size.
virtual boost::shared_ptr< ROMol > getMol(unsigned int) const =0
Concrete class that holds molecules in memory.
const std::vector< boost::shared_ptr< ROMol > > & getMols() const
unsigned int addMol(const ROMol &m) override
std::vector< boost::shared_ptr< ROMol > > & getMols()
unsigned int size() const override
Get the current library size.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
static void molFromPickle(const std::string &pickle, ROMol *mol, unsigned int propertyFlags)
constructs a molecule from a pickle stored in a string
static void pickleMol(const ROMol *mol, std::ostream &ss)
pickles a molecule and sends the results to stream ss
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
PatternHolder(unsigned int numBits)
unsigned int & getNumBits()
static unsigned int defaultNumBits()
const unsigned int & getNumBits() const
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:32
Substructure Search a library of molecules.
unsigned int countMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
unsigned int addMol(const ROMol &mol)
Add a molecule to the library.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Returns the molecule at the given index.
void initFromStream(std::istream &ss)
initializes from a stream pickle
const MolHolderBase & getMolecules() const
const FPHolderBase & getFingerprints() const
bool hasMatch(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Returns true if any match exists for the query.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints, boost::shared_ptr< KeyHolderBase > keys)
unsigned int countMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
void initFromString(const std::string &text)
initializes from a string pickle
unsigned int countMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const boost::shared_ptr< FPHolderBase > & getFpHolder() const
Get the underlying fingerprint holder implementation.
FPHolderBase & getFingerprints()
Get the underlying fingerprint implementation.
std::vector< unsigned int > getMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query between the given indices.
boost::shared_ptr< MolHolderBase > & getMolHolder()
Get the underlying molecule holder implementation.
bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
boost::shared_ptr< KeyHolderBase > & getKeyHolder()
Get the underlying key holder implementation.
unsigned int countMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
std::vector< unsigned int > getMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
void setSearchOrder(const std::vector< unsigned int > &order)
Sets the search order after checking that every index is within the library (throws IndexErrorException otherwise).
bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
boost::shared_ptr< FPHolderBase > & getFpHolder()
Get the underlying fingerprint holder implementation.
const std::vector< unsigned int > & getSearchOrder() const
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< KeyHolderBase > keys)
const KeyHolderBase & getKeys() const
Get the underlying key holder implementation.
bool hasMatch(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
KeyHolderBase & getKeys()
Get the underlying key holder implementation.
unsigned int countMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
bool hasMatch(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints)
std::vector< unsigned int > getMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
std::vector< unsigned int > getMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
std::vector< unsigned int > & getSearchOrder()
bool hasMatch(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
void resetHolders()
access required for serialization
unsigned int size() const
return the number of molecules in the library
std::vector< unsigned int > getMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
const boost::shared_ptr< KeyHolderBase > & getKeyHolder() const
Get the underlying key holder implementation.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules)
SubstructLibrary(const std::string &pickle)
std::string Serialize() const
returns a string with a serialized (pickled) representation
unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const boost::shared_ptr< MolHolderBase > & getMolHolder() const
void toStream(std::ostream &ss) const
serializes (pickles) to a stream
boost::shared_ptr< ROMol > operator[](unsigned int idx)
Returns the molecule at the given index.
std::vector< unsigned int > getMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query.
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
TautomerPatternHolder(unsigned int numBits)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:40
#define RDKIT_SUBSTRUCTLIBRARY_EXPORT
Definition: export.h:481
RDKIT_CHEMREACTIONS_EXPORT void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles an EnumerationStrategy and adds the results to a stream ss
Std stuff.
Definition: Abbreviations.h:19
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * PatternFingerprintMol(const ROMol &mol, unsigned int fpSize=2048, std::vector< unsigned int > *atomCounts=nullptr, ExplicitBitVect *setOnlyBits=nullptr, bool tautomericFingerprint=false)
Generates a topological fingerprint for a molecule using a series of pre-defined structural patterns.
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SUBSTRUCTLIBRARY_EXPORT bool SubstructLibraryCanSerialize()
RDKIT_SMILESPARSE_EXPORT RWMol * SmilesToMol(const std::string &smi, const SmilesParserParams &params)
bool recursionPossible
Allow recursive queries.