RDKit
Open-source cheminformatics and machine learning.
MolStandardize/Tautomer.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2018-2021 Susan H. Leung and other RDKit contributors
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_TAUTOMER_H
12 #define RD_TAUTOMER_H
13 
14 #include <boost/function.hpp>
15 #include <string>
16 #include <utility>
17 #include <iterator>
18 #include <Catalogs/Catalog.h>
23 #include <boost/dynamic_bitset.hpp>
24 
25 namespace RDKit {
26 class ROMol;
27 class RWMol;
28 
29 namespace MolStandardize {
30 
31 typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
32  int>
34 
35 namespace TautomerScoringFunctions {
36 const std::string tautomerScoringVersion = "1.0.0";
37 
41 
42 inline int scoreTautomer(const ROMol &mol) {
43  return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol);
44 }
45 } // namespace TautomerScoringFunctions
46 
48  Completed = 0,
51  Canceled
52 };
53 
54 class Tautomer {
55  friend class TautomerEnumerator;
56 
57  public:
58  Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0), d_done(false) {}
59  Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a = 0, size_t b = 0)
60  : tautomer(std::move(t)),
61  kekulized(std::move(k)),
62  d_numModifiedAtoms(a),
63  d_numModifiedBonds(b),
64  d_done(false) {}
67 
68  private:
69  size_t d_numModifiedAtoms;
70  size_t d_numModifiedBonds;
71  bool d_done;
72 };
73 
74 typedef std::map<std::string, Tautomer> SmilesTautomerMap;
75 typedef std::pair<std::string, Tautomer> SmilesTautomerPair;
76 
77 //! Contains results of tautomer enumeration
79  friend class TautomerEnumerator;
80 
81  public:
83  public:
85  typedef std::ptrdiff_t difference_type;
86  typedef const ROMol *pointer;
87  typedef const ROMOL_SPTR &reference;
88  typedef std::bidirectional_iterator_tag iterator_category;
89 
90  explicit const_iterator(const SmilesTautomerMap::const_iterator &it)
91  : d_it(it) {}
92  reference operator*() const { return d_it->second.tautomer; }
93  pointer operator->() const { return d_it->second.tautomer.get(); }
94  bool operator==(const const_iterator &other) const {
95  return (d_it == other.d_it);
96  }
97  bool operator!=(const const_iterator &other) const {
98  return !(*this == other);
99  }
101  const_iterator copy(d_it);
102  operator++();
103  return copy;
104  }
106  ++d_it;
107  return *this;
108  }
110  const_iterator copy(d_it);
111  operator--();
112  return copy;
113  }
115  --d_it;
116  return *this;
117  }
118 
119  private:
120  SmilesTautomerMap::const_iterator d_it;
121  };
124  : d_tautomers(other.d_tautomers),
125  d_status(other.d_status),
126  d_modifiedAtoms(other.d_modifiedAtoms),
127  d_modifiedBonds(other.d_modifiedBonds) {
128  fillTautomersItVec();
129  }
130  const const_iterator begin() const {
131  return const_iterator(d_tautomers.begin());
132  }
133  const const_iterator end() const { return const_iterator(d_tautomers.end()); }
134  size_t size() const { return d_tautomers.size(); }
135  bool empty() const { return d_tautomers.empty(); }
136  const ROMOL_SPTR &at(size_t pos) const {
137  PRECONDITION(pos < d_tautomers.size(), "index out of bounds");
138  return d_tautomersItVec.at(pos)->second.tautomer;
139  }
140  const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); }
141  const boost::dynamic_bitset<> &modifiedAtoms() const {
142  return d_modifiedAtoms;
143  }
144  const boost::dynamic_bitset<> &modifiedBonds() const {
145  return d_modifiedBonds;
146  }
147  TautomerEnumeratorStatus status() const { return d_status; }
148  std::vector<ROMOL_SPTR> tautomers() const {
149  std::vector<ROMOL_SPTR> tautomerVec;
150  tautomerVec.reserve(d_tautomers.size());
151  std::transform(
152  d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec),
153  [](const SmilesTautomerPair &t) { return t.second.tautomer; });
154  return tautomerVec;
155  }
156  std::vector<ROMOL_SPTR> operator()() const { return tautomers(); }
157  std::vector<std::string> smiles() const {
158  std::vector<std::string> smilesVec;
159  smilesVec.reserve(d_tautomers.size());
160  std::transform(d_tautomers.begin(), d_tautomers.end(),
161  std::back_inserter(smilesVec),
162  [](const SmilesTautomerPair &t) { return t.first; });
163  return smilesVec;
164  }
165  const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; }
166 
167  private:
168  void fillTautomersItVec() {
169  for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) {
170  d_tautomersItVec.push_back(it);
171  }
172  }
173  // the enumerated tautomers
174  SmilesTautomerMap d_tautomers;
175  // internal; vector of iterators into map items to enable random
176  // access to map items by index
177  std::vector<SmilesTautomerMap::const_iterator> d_tautomersItVec;
178  // status of the enumeration: did it complete? did it hit a limit?
179  // was it canceled?
180  TautomerEnumeratorStatus d_status;
181  // bit vector: flags atoms modified by the transforms
182  boost::dynamic_bitset<> d_modifiedAtoms;
183  // bit vector: flags bonds modified by the transforms
184  boost::dynamic_bitset<> d_modifiedBonds;
185 };
186 
188  public:
191  virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &) = 0;
192 };
193 
195  public:
197  : dp_catalog(tautCat),
198  d_maxTautomers(1000),
199  d_maxTransforms(1000),
200  d_removeSp3Stereo(true),
201  d_removeBondStereo(true),
202  d_removeIsotopicHs(true),
203  d_reassignStereo(true) {}
206  : dp_catalog(other.dp_catalog),
207  d_callback(other.d_callback),
208  d_maxTautomers(other.d_maxTautomers),
209  d_maxTransforms(other.d_maxTransforms),
210  d_removeSp3Stereo(other.d_removeSp3Stereo),
211  d_removeBondStereo(other.d_removeBondStereo),
212  d_removeIsotopicHs(other.d_removeIsotopicHs),
213  d_reassignStereo(other.d_reassignStereo) {}
215  if (this == &other) {
216  return *this;
217  }
218  dp_catalog = other.dp_catalog;
219  d_callback = other.d_callback;
220  d_maxTautomers = other.d_maxTautomers;
221  d_maxTransforms = other.d_maxTransforms;
222  d_removeSp3Stereo = other.d_removeSp3Stereo;
223  d_removeBondStereo = other.d_removeBondStereo;
224  d_removeIsotopicHs = other.d_removeIsotopicHs;
225  d_reassignStereo = other.d_reassignStereo;
226  return *this;
227  }
228  //! \param maxTautomers maximum number of tautomers to be generated
229  void setMaxTautomers(unsigned int maxTautomers) {
230  d_maxTautomers = maxTautomers;
231  }
232  //! \return maximum number of tautomers to be generated
233  unsigned int getMaxTautomers() { return d_maxTautomers; }
234  /*! \param maxTransforms maximum number of transformations to be applied
235  this limit is usually hit earlier than the maxTautomers limit
236  and leads to a more linear scaling of CPU time with increasing
237  number of tautomeric centers (see Sitzmann et al.)
238  */
239  void setMaxTransforms(unsigned int maxTransforms) {
240  d_maxTransforms = maxTransforms;
241  }
242  //! \return maximum number of transformations to be applied
243  unsigned int getMaxTransforms() { return d_maxTransforms; }
244  /*! \param removeSp3Stereo; if set to true, stereochemistry information
245  will be removed from sp3 atoms involved in tautomerism.
246  This means that S-aminoacids will lose their stereochemistry after going
247  through tautomer enumeration because of the amido-imidol tautomerism.
248  This defaults to true in RDKit, false in the workflow described
249  by Sitzmann et al.
250  */
251  void setRemoveSp3Stereo(bool removeSp3Stereo) {
252  d_removeSp3Stereo = removeSp3Stereo;
253  }
254  /*! \return whether stereochemistry information will be removed from
255  sp3 atoms involved in tautomerism
256  */
257  bool getRemoveSp3Stereo() { return d_removeSp3Stereo; }
258  /*! \param removeBondStereo; if set to true, stereochemistry information
259  will be removed from double bonds involved in tautomerism.
260  This means that enols will lose their E/Z stereochemistry after going
261  through tautomer enumeration because of the keto-enolic tautomerism.
262  This defaults to true in RDKit and also in the workflow described
263  by Sitzmann et al.
264  */
265  void setRemoveBondStereo(bool removeBondStereo) {
266  d_removeBondStereo = removeBondStereo;
267  }
268  /*! \return whether stereochemistry information will be removed from
269  double bonds involved in tautomerism
270  */
271  bool getRemoveBondStereo() { return d_removeBondStereo; }
272  /*! \param removeIsotopicHs; if set to true, isotopic Hs
273  will be removed from centers involved in tautomerism.
274  */
275  void setRemoveIsotopicHs(bool removeIsotopicHs) {
276  d_removeIsotopicHs = removeIsotopicHs;
277  }
278  /*! \return whether isotpoic Hs will be removed from
279  centers involved in tautomerism
280  */
281  bool getRemoveIsotopicHs() { return d_removeIsotopicHs; }
282  /*! \param reassignStereo; if set to true, assignStereochemistry
283  will be called on each tautomer generated by the enumerate() method.
284  This defaults to true.
285  */
286  void setReassignStereo(bool reassignStereo) {
287  d_reassignStereo = reassignStereo;
288  }
289  /*! \return whether assignStereochemistry will be called on each
290  tautomer generated by the enumerate() method
291  */
292  bool getReassignStereo() { return d_reassignStereo; }
293  /*! set this to an instance of a class derived from
294  TautomerEnumeratorCallback where operator() is overridden.
295  DO NOT delete the instance as ownership of the pointer is transferred
296  to the TautomerEnumerator
297  */
299  d_callback.reset(callback);
300  }
301  /*! \return pointer to an instance of a class derived from
302  TautomerEnumeratorCallback.
303  DO NOT delete the instance as ownership of the pointer is transferred
304  to the TautomerEnumerator
305  */
306  TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); }
307 
308  //! returns a \c TautomerEnumeratorResult structure for the input molecule
309  /*!
310  The enumeration rules are inspired by the publication:
311  M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
312  https://doi.org/10.1007/s10822-010-9346-4
313 
314  \param mol: the molecule to be enumerated
315 
316  Note: the definitions used here are that the atoms modified during
317  tautomerization are the atoms at the beginning and end of each tautomer
318  transform (the H "donor" and H "acceptor" in the transform) and the bonds
319  modified during transformation are any bonds whose order is changed during
320  the tautomer transform (these are the bonds between the "donor" and the
321  "acceptor")
322 
323  */
325 
326  //! Deprecated, please use the form returning a \c TautomerEnumeratorResult
327  //! instead
328  [
329  [deprecated("please use the form returning a TautomerEnumeratorResult "
330  "instead")]] std::vector<ROMOL_SPTR>
331  enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
332  boost::dynamic_bitset<> *modifiedBonds = nullptr) const;
333 
334  //! returns the canonical tautomer from a \c TautomerEnumeratorResult
336  boost::function<int(const ROMol &mol)> scoreFunc =
338 
339  //! returns the canonical tautomer from an iterable of possible tautomers
340  /// When Iterable is TautomerEnumeratorResult we use the other non-templated
341  /// overload for efficiency (TautomerEnumeratorResult already has SMILES so no
342  /// need to recompute them)
343  template <class Iterable,
344  typename std::enable_if<
345  !std::is_same<Iterable, TautomerEnumeratorResult>::value,
346  int>::type = 0>
347  ROMol *pickCanonical(const Iterable &tautomers,
348  boost::function<int(const ROMol &mol)> scoreFunc =
350  ROMOL_SPTR bestMol;
351  if (tautomers.size() == 1) {
352  bestMol = *tautomers.begin();
353  } else {
354  // Calculate score for each tautomer
355  int bestScore = std::numeric_limits<int>::min();
356  std::string bestSmiles = "";
357  for (const auto &t : tautomers) {
358  auto score = scoreFunc(*t);
359 #ifdef VERBOSE_ENUMERATION
360  std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl;
361 #endif
362  if (score > bestScore) {
363  bestScore = score;
364  bestSmiles = MolToSmiles(*t);
365  bestMol = t;
366  } else if (score == bestScore) {
367  auto smiles = MolToSmiles(*t);
368  if (smiles < bestSmiles) {
369  bestSmiles = smiles;
370  bestMol = t;
371  }
372  }
373  }
374  }
375  ROMol *res = new ROMol(*bestMol);
376  static const bool cleanIt = true;
377  static const bool force = true;
378  MolOps::assignStereochemistry(*res, cleanIt, force);
379 
380  return res;
381  }
382 
383  //! returns the canonical tautomer for a molecule
384  /*!
385  Note that the canonical tautomer is very likely not the most stable tautomer
386  for any given conditions. The default scoring rules are designed to produce
387  "reasonable" tautomers, but the primary concern is that the results are
388  canonical: you always get the same canonical tautomer for a molecule
389  regardless of what the input tautomer or atom ordering were.
390 
391  The default scoring scheme is inspired by the publication:
392  M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
393  https://doi.org/10.1007/s10822-010-9346-4
394 
395  */
396  ROMol *canonicalize(const ROMol &mol,
397  boost::function<int(const ROMol &mol)> scoreFunc =
399 
400  private:
401  bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut,
402  const TautomerEnumeratorResult &res) const;
403  std::shared_ptr<TautomerCatalog> dp_catalog;
404  std::shared_ptr<TautomerEnumeratorCallback> d_callback;
405  unsigned int d_maxTautomers;
406  unsigned int d_maxTransforms;
407  bool d_removeSp3Stereo;
408  bool d_removeBondStereo;
409  bool d_removeIsotopicHs;
410  bool d_reassignStereo;
411 }; // TautomerEnumerator class
412 
413 // caller owns the pointer
415  const CleanupParameters &params) {
416  return new TautomerEnumerator(params);
417 }
418 // caller owns the pointer
420  TautomerCatalogParams tparms(
422  return new TautomerEnumerator(new TautomerCatalog(&tparms));
423 }
424 
425 } // namespace MolStandardize
426 } // namespace RDKit
427 
428 #endif
#define PRECONDITION(expr, mess)
Definition: Invariant.h:109
A Catalog with a hierarchical structure.
Definition: Catalog.h:135
virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &)=0
Contains results of tautomer enumeration.
const ROMOL_SPTR & operator[](size_t pos) const
const boost::dynamic_bitset & modifiedBonds() const
const boost::dynamic_bitset & modifiedAtoms() const
TautomerEnumeratorResult(const TautomerEnumeratorResult &other)
ROMol * canonicalize(const ROMol &mol, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer for a molecule
TautomerEnumerator(const CleanupParameters &params=CleanupParameters())
ROMol * pickCanonical(const TautomerEnumeratorResult &tautRes, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer from a TautomerEnumeratorResult
TautomerEnumerator & operator=(const TautomerEnumerator &other)
TautomerEnumeratorResult enumerate(const ROMol &mol) const
returns a TautomerEnumeratorResult structure for the input molecule
TautomerEnumeratorCallback * getCallback() const
std::vector< ROMOL_SPTR > enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds=nullptr) const
void setCallback(TautomerEnumeratorCallback *callback)
TautomerEnumerator(const TautomerEnumerator &other)
ROMol * pickCanonical(const Iterable &tautomers, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
void setMaxTransforms(unsigned int maxTransforms)
void setMaxTautomers(unsigned int maxTautomers)
Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a=0, size_t b=0)
#define RDKIT_MOLSTANDARDIZE_EXPORT
Definition: export.h:321
RDKIT_GRAPHMOL_EXPORT void assignStereochemistry(ROMol &mol, bool cleanIt=false, bool force=false, bool flagPossibleStereoCenters=false)
Assign stereochemistry tags to atoms (i.e. R/S) and bonds (i.e. Z/E)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT const TautomerTransformDefs defaultTautomerTransformsv1
std::map< std::string, Tautomer > SmilesTautomerMap
TautomerEnumerator * tautomerEnumeratorFromParams(const CleanupParameters &params)
TautomerEnumerator * getV1TautomerEnumerator()
RDCatalog::HierarchCatalog< TautomerCatalogEntry, TautomerCatalogParams, int > TautomerCatalog
std::pair< std::string, Tautomer > SmilesTautomerPair
Std stuff.
Definition: Abbreviations.h:19
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
boost::shared_ptr< ROMol > ROMOL_SPTR