RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2021 greg landrum, Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #include <RDGeneral/export.h>
11 #ifndef RD_MOLSUPPLIER_H
12 #define RD_MOLSUPPLIER_H
13 
14 #include <RDGeneral/types.h>
15 
16 #include <string>
17 #include <list>
18 #include <memory>
19 #include <vector>
20 #include <iostream>
21 #include <fstream>
22 #include <GraphMol/ROMol.h>
24 
25 #ifdef RDK_BUILD_MAEPARSER_SUPPORT
26 namespace schrodinger {
27 namespace mae {
28 class Reader;
29 class Block;
30 } // namespace mae
31 } // namespace schrodinger
32 #endif // RDK_BUILD_MAEPARSER_SUPPORT
33 
34 namespace RDKit {
35 RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
36 
37 /*!
38 //
39 // Here are a couple of ways one can interact with MolSuppliers:
40 //
41 // 1) Lazy (ForwardIterator):
42 // while(!supplier.atEnd()){
43 // ROMol *mol = supplier.next();
44 // if(mol){
45 // do something;
46 // }
47 // }
48 // 2) Random Access:
49 // for(unsigned int i=0;i<supplier.length();i++){
50 // ROMol *mol = supplier[i];
51 // if(mol){
52 // do something;
53 // }
54 // }
55 //
56 //
57 */
59  // this is an abstract base class to supply molecules one at a time
60  public:
62  virtual ~MolSupplier() {}
63  virtual void init() = 0;
64  virtual void reset() = 0;
65  virtual bool atEnd() = 0;
66  virtual ROMol *next() = 0;
67 
68  virtual void close() {
69  if (df_owner) {
70  delete dp_inStream;
71  df_owner = false;
72  }
73  dp_inStream = nullptr;
74  }
75 
76  private:
77  // disable automatic copy constructors and assignment operators
78  // for this class and its subclasses. They will likely be
79  // carrying around stream pointers and copying those is a recipe
80  // for disaster.
81  MolSupplier(const MolSupplier &);
82  MolSupplier &operator=(const MolSupplier &);
83 
84  protected:
85  // stream to read the molecules from:
86  std::istream *dp_inStream = nullptr;
87  // do we own dp_inStream?
88  bool df_owner = false;
89  // opens a stream for reading and verifies that it can be read from.
90  // if not it throws an exception
91  // the caller owns the resulting stream
92  std::istream *openAndCheckStream(const std::string &filename) {
93  // FIX: this binary mode of opening file is here because of a bug in
94  // VC++ 6.0
95  // the function "tellg" does not work correctly if we do not open it this
96  // way
97  // Jan 2009: Confirmed that this is still the case in visual studio 2008
98  std::ifstream *strm =
99  new std::ifstream(filename.c_str(), std::ios_base::binary);
100  if ((!(*strm)) || strm->bad()) {
101  std::ostringstream errout;
102  errout << "Bad input file " << filename;
103  delete strm;
104  throw BadFileException(errout.str());
105  }
106 
107  strm->peek();
108  if (strm->bad() || strm->eof()) {
109  std::ostringstream errout;
110  errout << "Invalid input file " << filename;
111  delete strm;
112  throw BadFileException(errout.str());
113  }
114  return static_cast<std::istream *>(strm);
115  }
116 };
117 
118 // \brief a supplier from an SD file that only reads forward:
120  /*************************************************************************
121  * A lazy mol supplier from a SD file.
122  * - When new molecules are read using "next" their positions in the file are
123  *noted.
124  ***********************************************************************************/
125  public:
126  ForwardSDMolSupplier() { init(); }
127 
128  explicit ForwardSDMolSupplier(std::istream *inStream,
129  bool takeOwnership = true, bool sanitize = true,
130  bool removeHs = true,
131  bool strictParsing = false);
132 
133  ~ForwardSDMolSupplier() override { close(); }
134 
135  void init() override;
136  void reset() override;
137  ROMol *next() override;
138  bool atEnd() override;
139 
140  void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
141  bool getProcessPropertyLists() const { return df_processPropertyLists; }
142 
143  bool getEOFHitOnRead() const { return df_eofHitOnRead; }
144 
145  protected:
146  virtual void checkForEnd();
148  virtual void readMolProps(ROMol *);
149  bool df_end = false;
150  int d_line = 0; // line number we are currently on
151  bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
152  bool df_processPropertyLists = true;
153  bool df_eofHitOnRead = false;
154 };
155 
156 // \brief a lazy supplier from an SD file
158  /*************************************************************************
159  * A lazy mol supplier from a SD file.
160  * - When new molecules are read using "next" their positions in the file are
161  *noted.
162  * - A call to the "length" will automatically parse the entire file and
163  *cache all the mol
164  * block positions
165  * - [] operator is used to access a molecule at "idx", calling next
166  *following this will result
167  * in the next molecule after "idx"
168  ***********************************************************************************/
169 
170  public:
171  SDMolSupplier() { init(); }
172 
173  /*!
174  * \param fileName - the name of the SD file
175  * \param sanitize - if true sanitize the molecule before returning it
176  * \param removeHs - if true remove Hs from the molecule before returning it
177  * (triggers sanitization)
178  * \param strictParsing - if set to false, the parser is more lax about
179  * correctness
180  * of the contents.
181  */
182  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
183  bool removeHs = true, bool strictParsing = true);
184 
185  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
186  bool sanitize = true, bool removeHs = true,
187  bool strictParsing = true);
188 
189  ~SDMolSupplier() override { close(); }
190  void init() override;
191  void reset() override;
192  ROMol *next() override;
193  bool atEnd() override;
194  void moveTo(unsigned int idx);
195  ROMol *operator[](unsigned int idx);
196  /*! \brief returns the text block for a particular item
197  *
198  * \param idx - which item to return
199  */
200  std::string getItemText(unsigned int idx);
201  unsigned int length();
202  void setData(const std::string &text, bool sanitize = true,
203  bool removeHs = true);
204  void setData(const std::string &text, bool sanitize, bool removeHs,
205  bool strictParsing);
206 
207  /*! Resets our internal state and sets the indices of molecules in the stream.
208  * The client should be *very* careful about calling this method, as it's
209  *trivial
210  * to end up with a completely useless supplier.
211  *
212  * \param locs - the vector of stream positions.
213  *
214  * Note that this can be used not only to make reading selected molecules
215  *from a
216  * large SD file much faster, but it can also allow subsetting an SD file or
217  * rearranging the order of the molecules.
218  */
219  void setStreamIndices(const std::vector<std::streampos> &locs);
220 
221  private:
222  void checkForEnd() override;
223  void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
224  int d_len = 0; // total number of mol blocks in the file (initialized to -1)
225  int d_last = 0; // the molecule we are ready to read
226  std::vector<std::streampos> d_molpos;
227 };
228 
229 //! lazy file parser for Smiles tables
231  /**************************************************************************
232  * Lazy file parser for Smiles table file, similar to the lazy SD
233  * file parser above
234  * - As and when new molecules are read using "next" their
235  * positions in the file are noted.
236  * - A call to the "length" will automatically parse the entire
237  * file and cache all the mol block positions
238  * - [] operator is used to access a molecule at "idx", calling
239  * next following this will result in the next molecule after
240  * "idx"
241  ***************************************************************************/
242  public:
243  /*!
244  * \param fileName - the name of smiles table file
245  * \param delimiter - delimiting characters between records on each
246  * line NOTE that this is not a string, the tokenizer looks for
247  * the individual characters in delimiter, not the full string
248  * itself. So the default delimiter: " \t", means " " or "\t".
249  * \param smilesColumn - column number for the SMILES string (defaults
250  * to the first column)
251  * \param nameColumn - column number for the molecule name (defaults to
252  * the second column) If set to -1 we assume that no name is
253  * available for the molecule and the name is defaulted to the
254  * smiles string
255  * \param titleLine - if true, the first line is assumed to list the
256  * names of properties in order separated by 'delimiter'. It is
257  * also assumed that the 'SMILES' column and the 'name' column
258  * are not specified here. If false, no title line is assumed
259  * and the properties are recorded as the "columnX" where "X" is
260  * the column number
261  * \param sanitize - if true sanitize the molecule before returning it
262  */
263  explicit SmilesMolSupplier(const std::string &fileName,
264  const std::string &delimiter = " \t",
265  int smilesColumn = 0, int nameColumn = 1,
266  bool titleLine = true, bool sanitize = true);
268  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
269  const std::string &delimiter = " \t",
270  int smilesColumn = 0, int nameColumn = 1,
271  bool titleLine = true, bool sanitize = true);
272 
273  ~SmilesMolSupplier() override { close(); }
274  void setData(const std::string &text, const std::string &delimiter = " ",
275  int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
276  bool sanitize = true);
277  void init() override;
278  void reset() override;
279  ROMol *next() override;
280  bool atEnd() override;
281  void moveTo(unsigned int idx);
282  ROMol *operator[](unsigned int idx);
283  /*! \brief returns the text block for a particular item
284  *
285  * \param idx - which item to return
286  */
287  std::string getItemText(unsigned int idx);
288  unsigned int length();
289 
290  private:
291  ROMol *processLine(std::string inLine);
292  void processTitleLine();
293  std::string nextLine();
294  long int skipComments();
295  void checkForEnd();
296 
297  bool df_end = false; // have we reached the end of the file?
298  int d_len = 0; // total number of smiles in the file
299  int d_next = 0; // the molecule we are ready to read
300  int d_line = 0; // line number we are currently on
301  std::vector<std::streampos>
302  d_molpos; // vector of positions in the file for molecules
303  std::vector<int> d_lineNums;
304  std::string d_delim; // the delimiter string
305  bool df_sanitize = true; // sanitize molecules before returning them?
306  STR_VECT d_props; // vector of property names
307  bool df_title = true; // do we have a title line?
308  int d_smi = 0; // column id for the smile string
309  int d_name = 1; // column id for the name
310 };
311 
312 //! lazy file parser for TDT files
314  /**************************************************************************
315  * Lazy file parser for TDT files, similar to the lazy SD
316  * file parser above
317  * - As and when new molecules are read using "next" their
318  * positions in the file are noted.
319  * - A call to the "length" will automatically parse the entire
320  * file and cache all the mol block positions
321  * - [] operator is used to access a molecule at "idx", calling
322  * next following this will result in the next molecule after
323  * "idx"
324  ***************************************************************************/
325  public:
326  /*!
327  * \param fileName - the name of the TDT file
328  * \param nameRecord - property name for the molecule name.
329  * If empty (the default), the name defaults to be empty
330  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
331  * structure (depiction) in the input will be read into the
332  * corresponding conformer id.
333  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
334  * structure (depiction) in the input will be read into the
335  * corresponding conformer id.
336  * \param sanitize - if true sanitize the molecule before returning it
337  */
338  explicit TDTMolSupplier(const std::string &fileName,
339  const std::string &nameRecord = "", int confId2D = -1,
340  int confId3D = 0, bool sanitize = true);
341  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
342  const std::string &nameRecord = "", int confId2D = -1,
343  int confId3D = 0, bool sanitize = true);
345  ~TDTMolSupplier() override { close(); }
346  void setData(const std::string &text, const std::string &nameRecord = "",
347  int confId2D = -1, int confId3D = 0, bool sanitize = true);
348  void init() override;
349  void reset() override;
350  ROMol *next() override;
351  bool atEnd() override;
352  void moveTo(unsigned int idx);
353  ROMol *operator[](unsigned int idx);
354  /*! \brief returns the text block for a particular item
355  *
356  * \param idx - which item to return
357  */
358  std::string getItemText(unsigned int idx);
359  unsigned int length();
360 
361  private:
362  bool advanceToNextRecord();
363  void checkForEnd();
364  ROMol *parseMol(std::string inLine);
365 
366  bool df_end = false; // have we reached the end of the file?
367  int d_len = 0; // total number of mols in the file
368  int d_last = 0; // the molecule we are ready to read
369  int d_line = 0; // line number we are currently on
370  int d_confId2D = -1; // id to use for 2D conformers
371  int d_confId3D = 0; // id to use for 3D conformers
372  std::vector<std::streampos>
373  d_molpos; // vector of positions in the file for molecules
374  bool df_sanitize = true; // sanitize molecules before returning them?
375  std::string d_nameProp =
376  ""; // local storage for the property providing mol names
377 };
378 
379 //! lazy file parser for PDB files
381  public:
382  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
383  bool sanitize = true, bool removeHs = true,
384  unsigned int flavor = 0,
385  bool proximityBonding = true);
386  explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
387  bool removeHs = true, unsigned int flavor = 0,
388  bool proximityBonding = true);
389 
390  ~PDBMolSupplier() override { close(); }
391 
392  void init() override;
393  void reset() override;
394  ROMol *next() override;
395  bool atEnd() override;
396 
397  protected:
398  bool df_sanitize, df_removeHs, df_proximityBonding;
399  unsigned int d_flavor;
400 };
401 #ifdef RDK_BUILD_MAEPARSER_SUPPORT
402 //! lazy file parser for MAE files
403 class RDKIT_FILEPARSERS_EXPORT MaeMolSupplier : public MolSupplier {
404  /**
405  * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
406  * always requires taking ownership of the istream ptr, as the shared ptr will
407  * always clear it upon destruction.
408  */
409 
410  public:
411  MaeMolSupplier() { init(); }
412 
413  explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
414  bool sanitize = true, bool removeHs = true);
415 
416  explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
417  bool sanitize = true, bool removeHs = true);
418 
419  explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
420  bool removeHs = true);
421 
422  ~MaeMolSupplier() override {}
423 
424  void init() override;
425  void reset() override;
426  ROMol *next() override;
427  bool atEnd() override;
428 
429  void close() override { dp_sInStream.reset(); }
430 
431  private:
432  void moveToNextBlock();
433 
434  protected:
435  bool df_sanitize, df_removeHs;
436  std::shared_ptr<schrodinger::mae::Reader> d_reader;
437  std::shared_ptr<schrodinger::mae::Block> d_next_struct;
438  std::shared_ptr<std::istream> dp_sInStream;
439  std::string d_stored_exc;
440 };
441 #endif // RDK_BUILD_MAEPARSER_SUPPORT
442 } // namespace RDKit
443 
444 #endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
virtual void readMolProps(ROMol *)
void setProcessPropertyLists(bool val)
Definition: MolSupplier.h:140
ROMol * next() override
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=false)
bool getProcessPropertyLists() const
Definition: MolSupplier.h:141
virtual bool atEnd()=0
std::istream * openAndCheckStream(const std::string &filename)
Definition: MolSupplier.h:92
virtual void reset()=0
virtual void init()=0
virtual ROMol * next()=0
virtual ~MolSupplier()
Definition: MolSupplier.h:62
virtual void close()
Definition: MolSupplier.h:68
lazy file parser for PDB files
Definition: MolSupplier.h:380
~PDBMolSupplier() override
Definition: MolSupplier.h:390
PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
bool atEnd() override
ROMol * next() override
void reset() override
void init() override
PDBMolSupplier(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
unsigned int d_flavor
Definition: MolSupplier.h:399
void setStreamIndices(const std::vector< std::streampos > &locs)
void setData(const std::string &text, bool sanitize=true, bool removeHs=true)
bool atEnd() override
unsigned int length()
void reset() override
void setData(const std::string &text, bool sanitize, bool removeHs, bool strictParsing)
SDMolSupplier(const std::string &fileName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
ROMol * next() override
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
void moveTo(unsigned int idx)
void init() override
~SDMolSupplier() override
Definition: MolSupplier.h:189
ROMol * operator[](unsigned int idx)
lazy file parser for Smiles tables
Definition: MolSupplier.h:230
~SmilesMolSupplier() override
Definition: MolSupplier.h:273
void moveTo(unsigned int idx)
ROMol * next() override
SmilesMolSupplier(const std::string &fileName, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void init() override
ROMol * operator[](unsigned int idx)
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void reset() override
bool atEnd() override
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const std::string &delimiter=" ", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
lazy file parser for TDT files
Definition: MolSupplier.h:313
void moveTo(unsigned int idx)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
bool atEnd() override
~TDTMolSupplier() override
Definition: MolSupplier.h:345
void init() override
TDTMolSupplier(const std::string &fileName, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
ROMol * operator[](unsigned int idx)
ROMol * next() override
void reset() override
void setData(const std::string &text, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
unsigned int length()
#define RDKIT_FILEPARSERS_EXPORT
Definition: export.h:153
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
Definition: Abbreviations.h:18
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition: Dict.h:29