ViewVC Help
View File | Revision Log | Show Annotations | View Changeset | Root Listing
root/OpenMD/trunk/src/openbabel/fingerprint.cpp
Revision: 1081
Committed: Thu Oct 19 20:49:05 2006 UTC (18 years, 6 months ago) by gezelter
File size: 21414 byte(s)
Log Message:
updated OpenBabel to version 2.0.2

File Contents

# Content
1 /**********************************************************************
2 fingerprint.cpp - Implementation of fingerpring base class and fastsearching
3
4 Copyright (C) 2005 by Chris Morley
5
6 This file is part of the Open Babel project.
7 For more information, see <http://openbabel.sourceforge.net/>
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation version 2 of the License.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17 ***********************************************************************/
18
19 #include "config.h"
20 #include <vector>
21 #include <algorithm>
22
23 #if HAVE_IOSTREAM
24 #include <iostream>
25 #elif HAVE_IOSTREAM_H
26 #include <iostream.h>
27 #endif
28
29 #if HAVE_FSTREAM
30 #include <fstream>
31 #elif HAVE_FSTREAM_H
32 #include <fstream.h>
33 #endif
34 #include <vector>
35
36 #include "fingerprint.hpp"
37 #include "oberror.hpp"
38
39 using namespace std;
40 namespace OpenBabel {
41
42 OBFingerprint* OBFingerprint::_pDefault; //static variable
43 const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int);
44 int OBFingerprint::rubbish = 666;
45
46 void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n)
47 {
48 vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint()));
49 }
50
51 ////////////////////////////////////////
52 void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits)
53 {
54 while(vec.size()*Getbitsperint()/2 >= nbits)
55 vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2,
56 vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end());
57 }
58
59 ////////////////////////////////////////
60 bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt)
61 {
62 Fptpos iter;
63 if(id.empty())
64 iter=FPtsMap().begin();
65 else
66 {
67 iter=FPtsMap().find(id);
68 if(iter!=FPtsMap().end())
69 ++iter;
70 }
71 if(iter==FPtsMap().end())
72 return false;
73 id = iter->first;
74 pFPrt = iter->second;
75 return true;
76 }
77
78 OBFingerprint* OBFingerprint::FindFingerprint(string& ID)
79 {
80 if(ID.empty())
81 return _pDefault;
82 Fptpos iter = FPtsMap().find(ID);
83 if(iter==FPtsMap().end())
84 return NULL;
85 else
86 return iter->second;
87 }
88
89 double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2)
90 {
91 //Independent of sizeof(unsigned int)
92 if(vec1.size()!=vec2.size())
93 return -1; //different number of bits
94 int andbits=0, orbits=0;
95 for (unsigned i=0;i<vec1.size();++i)
96 {
97 int andfp = vec1[i] & vec2[i];
98 int orfp = vec1[i] | vec2[i];
99 //Count bits
100 for(;andfp;andfp=andfp<<1)
101 if(andfp<0) ++andbits;
102 for(;orfp;orfp=orfp<<1)
103 if(orfp<0) ++orbits;
104 }
105 return((double)andbits/(double)orbits);
106 }
107
108 //*****************************************************************8
109 /*!
110
111 */
112 bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions,
113 unsigned int MaxCandidates)
114 {
115 ///Finds chemical objects in datafilename (which must previously have been indexed)
116 ///that have all the same bits set in their fingerprints as in the fingerprint of
117 ///a pattern object. (Usually a substructure search in molecules.)
118 ///The type of fingerprint and its degree of folding does not have to be specified
119 ///here because the values in the index file are used.
120 ///The positions of the candidate matching molecules in the original datafile are returned.
121
122 vector<unsigned int> vecwords;
123 _pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint());
124
125 vector<unsigned int>candidates; //indices of matches from fingerprint screen
126 candidates.reserve(MaxCandidates);
127
128 unsigned int dataSize = _index.header.nEntries;
129 // GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype);
130
131 unsigned int words = _index.header.words;
132 unsigned int* nextp = &_index.fptdata[0];
133 unsigned int* ppat0 = &vecwords[0];
134 register unsigned int* p;
135 register unsigned int* ppat;
136 register unsigned int a;
137 unsigned int i; // need address of this, can't be register
138 for(i=0;i<dataSize; i++) //speed critical section
139 {
140 p=nextp;
141 nextp += words;
142 ppat=ppat0;
143 a=0;
144 while(p<nextp)
145 {
146 if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break;
147 }
148 if(!a)
149 {
150 candidates.push_back(i);
151 if(candidates.size()>=MaxCandidates)
152 break;
153 }
154 }
155
156 if(i<_index.header.nEntries) //premature end to search
157 {
158 #ifdef HAVE_SSTREAM
159 stringstream errorMsg;
160 #else
161 strstream errorMsg;
162 #endif
163 errorMsg << "Stopped looking after " << i << " molecules." << endl;
164 obErrorLog.ThrowError(__func__, errorMsg.str(), obWarning);
165 }
166
167 vector<unsigned int>::iterator itr;
168 for(itr=candidates.begin();itr!=candidates.end();itr++)
169 {
170 SeekPositions.push_back(_index.seekdata[*itr]);
171 }
172 return true;
173 }
174
175 /////////////////////////////////////////////////////////
176 bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap,
177 double MinTani)
178 {
179 vector<unsigned int> targetfp;
180 _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint());
181
182 unsigned int words = _index.header.words;
183 unsigned int dataSize = _index.header.nEntries;
184 unsigned int* nextp = &_index.fptdata[0];
185 register unsigned int* p;
186 register unsigned int i;
187 for(i=0;i<dataSize; i++) //speed critical section
188 {
189 p=nextp;
190 nextp += words;
191 double tani = OBFingerprint::Tanimoto(targetfp,p);
192 if(tani>MinTani)
193 SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i]));
194 }
195 return true;
196 }
197
198 /////////////////////////////////////////////////////////
199 bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap,
200 int nCandidates)
201 {
202 ///If nCandidates is zero or omitted the original size of the multimap is used
203 if(nCandidates)
204 {
205 //initialise the multimap with nCandidate zero entries
206 SeekposMap.clear();
207 int i;
208 for(i=0;i<nCandidates;++i)
209 SeekposMap.insert(pair<const double, unsigned int>(0,0));
210 }
211 else if(SeekposMap.size()==0)
212 return false;
213
214 vector<unsigned int> targetfp;
215 _pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint());
216
217 unsigned int words = _index.header.words;
218 unsigned int dataSize = _index.header.nEntries;
219 unsigned int* nextp = &_index.fptdata[0];
220 register unsigned int* p;
221 register unsigned int i;
222 for(i=0;i<dataSize; i++) //speed critical section
223 {
224 p=nextp;
225 nextp += words;
226 double tani = OBFingerprint::Tanimoto(targetfp,p);
227 if(tani>SeekposMap.begin()->first)
228 {
229 SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i]));
230 SeekposMap.erase(SeekposMap.begin());
231 }
232 }
233 return true;
234 }
235
236 /////////////////////////////////////////////////////////
237 string FastSearch::ReadIndex(istream* pIndexstream)
238 {
239 //Reads fs index from istream into member variables
240 _index.Read(pIndexstream);
241
242 _pFP = _index.CheckFP();
243 if(!_pFP)
244 *(_index.header.datafilename) = '\0';
245
246 return _index.header.datafilename; //will be empty on error
247 }
248
249 //////////////////////////////////////////////////////////
250 bool FptIndex::Read(istream* pIndexstream)
251 {
252 pIndexstream->read((char*)&(header), sizeof(FptIndexHeader));
253 pIndexstream->seekg(header.headerlength);//allows header length to be changed
254 if(pIndexstream->fail() || header.headerlength != sizeof(FptIndexHeader))
255 {
256 *(header.datafilename) = '\0';
257 return false;
258 }
259
260 unsigned int nwords = header.nEntries * header.words;
261 fptdata.resize(nwords);
262 seekdata.resize(header.nEntries);
263
264 pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords);
265 pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned int) * header.nEntries);
266
267 if(pIndexstream->fail())
268 {
269 *(header.datafilename) = '\0';
270 return false;
271 }
272 return true;
273 }
274
275 //////////////////////////////////////////////////////////
276 OBFingerprint* FptIndex::CheckFP()
277 {
278 //check that fingerprint type is available
279 string tempFP(header.fpid);
280 OBFingerprint* pFP = OBFingerprint::FindFingerprint(tempFP);
281 if(!pFP)
282 {
283 #ifdef HAVE_SSTREAM
284 stringstream errorMsg;
285 #else
286 strstream errorMsg;
287 #endif
288 errorMsg << "Index has Fingerprints of type '" << header.fpid
289 << " which is not currently loaded." << endl;
290 obErrorLog.ThrowError(__func__, errorMsg.str(), obError);
291 }
292 return pFP; //NULL if not available
293 }
294
295 //*******************************************************
296 FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os,
297 std::string& fpid, int FptBits)
298 {
299 ///Starts indexing process
300 _indexstream = os;
301 _nbits=FptBits;
302 _pindex= new FptIndex;
303 _pindex->header.headerlength = sizeof(FptIndexHeader);
304 strncpy(_pindex->header.fpid,fpid.c_str(),15);
305 strncpy(_pindex->header.datafilename, datafilename.c_str(), 255);
306
307 //check that fingerprint type is available
308 _pFP = _pindex->CheckFP();
309 }
310
311 /////////////////////////////////////////////////////////////
312 FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os)
313 {
314 //Uses existing index
315 _indexstream = os;
316 _pindex = pindex;
317 _nbits = _pindex->header.words * OBFingerprint::Getbitsperint();
318
319 //check that fingerprint type is available
320 _pFP = _pindex->CheckFP();
321 }
322
323 /////////////////////////////////////////////////////////////
324 FastSearchIndexer::~FastSearchIndexer()
325 {
326 ///Saves index file
327 _pindex->header.nEntries = _pindex->seekdata.size();
328 _indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader));
329 _indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int));
330 _indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int));
331 if(!_indexstream)
332 obErrorLog.ThrowError(__func__,
333 "Difficulty writing index", obWarning);
334 delete _pindex;
335 }
336
337 ///////////////////////////////////////////////////////////////
338 bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos)
339 {
340 ///Adds a fingerprint
341
342 vector<unsigned int> vecwords;
343 if(!_pFP)
344 return false;
345 if(_pFP->GetFingerprint(pOb, vecwords, _nbits))
346 {
347 _pindex->header.words = vecwords.size(); //Use size as returned from fingerprint
348 for(unsigned int i=0;i<_pindex->header.words;i++)
349 _pindex->fptdata.push_back(vecwords[i]);
350 _pindex->seekdata.push_back(seekpos);
351 return true;
352 }
353 obErrorLog.ThrowError(__func__, "Failed to make a fingerprint", obWarning);
354 return false;
355 }
356
357 /*!
358 \class OBFingerprint
359 These fingerprints are condensed representation of molecules (or other objects)
360 as a list of boolean values (actually bits in a vector<unsigned>) with length
361 of a power of 2. The main motivation is for fast searching of data sources
362 containing large numbers of molecules (up to several million). OpenBabel
363 provides some routines which can search text files containing lists of molecules
364 in any format. See the documentation on the class FastSearch.
365
366 There are descriptions of molecular fingerprints at <br>
367 http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br>
368 http://www.mesaac.com/Fingerprint.htm <br>
369 Many methods of preparing fingerprints have been described, but the type supported
370 currently in OpenBabel has each bit representing a substructure (or other
371 molecular property). If a substructure is present in the molecule, then a
372 particular bit is set to 1. But because the hashing method may also map other
373 substructures to the same bit, a match does not guarantee that a particular
374 substructure is present; there may be false positives. However, with proper design,
375 a large fraction of irrelevant molecules in a data set can be eliminated in a
376 fast search with boolean methods on the fingerprints.
377 It then becomes feasible to make a definitive substructure search by
378 conventional methods on this reduced list even if it is slow.
379
380 OpenBabel provides a framework for applying new types of fingerprints without
381 changing any existing code. They are derived from OBFingerprint and the
382 source file is just compiled with the rest of OpenBabel. Alternatively,
383 they can be separately compiled as a DLL or shared library and discovered
384 when OpenBabel runs.
385
386 Fingerprints derived from this abstract base class OBFingerprint can be for any
387 object derived from OBBase (not just for OBMol).
388 Each derived class provides an ID as a string and OBFingerprint keeps a map of
389 these to provides a pointer to the class when requested in FindFingerprint.
390
391 <h4>-- To define a fingerprint type --</h4>
392 The classes derived form OBFingerprint are required to provide
393 a GetFingerprint() routine and a Description() routine
394 \code
395 class MyFpType : OBFingerprint
396 {
397 MyFpType(const char* id) : OBFingerprint(id){};
398 virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits)
399 {
400 //Convert pOb to the required type, usually OBMol
401 OBMol* pmol = dynamic_cast<OBMol*>(pOb);
402 fp.resize(required_number_of_words);
403 ...
404 use SetBit(fp,n); to set the nth bit
405
406 if(nbits)
407 Fold(fp, nbits);
408 }
409 virtual const char* Description(){ return "Some descriptive text";}
410 ...
411 };
412 \endcode
413 //Declare a global instance with the ID you will use in -f options to specify its use.
414 \code
415 MyFpType theMyFpType("myfpID");
416 \endcode
417
418 <h4>-- To obtain a fingerprint --</h4>
419 \code
420 OBMol mol;
421 ...
422 vector<unsigned int> fp;
423 OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint
424 \endcode
425 or
426 \code
427 vector<unsigned int> fp;
428 OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID");
429 ...and maybe...
430 pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger
431 \endcode
432
433 <h4>-- To print a list of available fingerprint types --</h4>
434 \code
435 std::string id;
436 OBFingerPrint* pFPrt=NULL;
437 while(OBFingerprint::GetNextFPrt(id, pFPrt))
438 {
439 cout << id << " -- " << pFPrt->Description() << endl;
440 }
441 \endcode
442
443
444 Fingerprints are handled as vector<unsigned int> so that the number of bits
445 in this vector and their order will be platform and compiler
446 dependent, because of size of int and endian differences. Use fingerprints
447 (and fastsearch indexes containing them) only for comparing with other
448 fingerprints prepared on the same machine.
449
450 The FingerprintFormat class is an output format which displays fingerprints
451 as hexadecimal. When multiple molecules are supplied it will calculate the
452 Tanimoto coefficient from the first molecule to each of the others. It also
453 shows whether the first molecule is a possible substructure to all the others,
454 i.e. whether all the bits set in the fingerprint for the first molecule are
455 set in the fingerprint of the others. To display hexadecimal information when
456 multiple molecules are provided it is necessay to use the -xh option.
457
458 To see a list of available format types, type babel -F on the command line.
459 The -xF option of the FingerprintFormat class also provides this output, but due
460 to a quirk in the way the program works, it is necessary to have a valid input
461 molecule for this option to work.
462 */
463
464 /*! \class FastSearch
465 The FastSearch class searches an index to a datafile containing a list of molecules
466 (or other objects) prepared by FastSearchIndexer.
467
468 OpenBabel can also search files for molecules containing a substructure specified
469 by a SMARTS string, using OBSmartsPattern or from the command line:
470 \code
471 babel datafile.xxx -outfile.yyy -sSMARTS
472 \endcode
473 But when the data file contains more than about 10,000 molecules this becomes
474 rather too slow to be used interactively. To do it more quickly, an index
475 of the molecules containing their fingerprints (see OBFingerprint) is prepared using
476 FastSearchIndexer. The indexing process may take a long time but only has to
477 be done once. The index can be searched very quickly with FastSearch. Because
478 of the nature of fingerprints a match to a bit does not guarantee
479 the presence of a particular substructure or other molecular property, so that
480 a definitive answer may require a subsequent search of the (much reduced) list
481 of candidates by another method (like OBSmartsPattern).
482
483 Note that the index files are not portable. They need to be prepared on the
484 computer that will acess them.
485
486 <h4>Using FastSearch and FastSearchIndexer in a program</h4>
487
488 The index has two tables:
489 - an array of fingerprints of the molecules
490 - an array of the seek positions in the datasource file of all the molecules
491
492 <h4>To prepare an fastsearch index file:</h4>
493 - Open an ostream to the index file.
494 - Make a FastSearchIndexer object on the heap or the stack, passing in as parameters:
495 - the datafile name, the indexfile stream,
496 - the id of the fingerprint type to be used,
497 - the number of bits the fingerprint is to be folded down to, If it is to be left
498 unfolded, set fpid to 0 or do not specify it.
499 .
500 - For each molecule, call Add() with its pointer and its position in the datafile.<br>
501 Currently the std::streampos value is implicitly cast to unsigned int so that
502 for 32bit machines the datafile has to be no longer than about 2Gbyte.
503 - The index file is written when the FastSearchIndexer object is deleted or goes out
504 of scope.
505
506 <h4>To search in a fastsearch index file</h4>
507
508 - Open a std::istream to the indexfile (in binary mode on some systems)
509 - Make a FastSearch object, read the index and open the datafile whose
510 name it provides
511 \code
512 ifstream ifs(indexname,ios::binary);
513 FastSearch fs;
514 string datafilename = fs.ReadIndex(&ifs);
515 if(datafilename.empty()/endcode
516 return false;
517 ifstream datastream(datafilename);
518 if(!datastream)
519 return false;
520 \endcode
521
522 <strong>To do a search for molecules which have all the substructure bits the
523 OBMol object, patternMol</strong>
524 \code
525 vector<unsigned int>& SeekPositions;
526 if(!fs.Find(patternMol, SeekPositions, MaxCandidates))
527
528 for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++)
529 {
530 datastream.seekg(*itr);
531 ... read the candidate molecule
532 and subject to more rigorous test if necessary
533 }
534 \endcode
535
536 <strong>To do a similarity search based on the Tanimoto coefficient</strong>
537 This is defined as: <br>
538 <em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br>
539 where the boolean operations between the fingerprints are bitwise<br>
540 The Tanimoto coefficient has no absolute meaning and depends on
541 the design of the fingerprint.
542 \code
543 multimap<double, unsigned int> SeekposMap;
544 // to find n best molecules
545 fs.FindSimilar(&patternMol, SeekposMap, n);
546 ...or
547 // to find molecules with Tanimoto coefficient > MinTani
548 fs.FindSimilar(&patternMol, SeekposMap, MinTani);
549
550 multimap<double, unsigned int>::reverse_iterator itr;
551 for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr)
552 {
553 datastream.seekg(itr->second);
554 ... read the candidate molecule
555 double tani = itr->first;
556 }
557 \endcode
558
559 The FastSearchFormat class facilitates the use of these routine from the
560 command line or other front end program. For instance:
561
562 <strong>Prepare an index:</strong>
563 \code
564 babel datafile.xxx index.fs
565 \endcode
566 or
567 \code
568 babel datafile.xxx -ofs namedindex
569 \endcode
570 With options you can specify:
571 - which type of fingerprint to be used, e.g. -xfFP2,
572 - whether it is folded to a specified number of bits, e.g. -xn128
573 (which should be a power of2)
574 - whether to pre-select the molecules which are indexed:
575 - by structure e.g only ethers and esters, -sCOC
576 - by excluding molecules with bezene rings, -vc1ccccc1
577 - by position in the datafile e.g. only mols 10 to 90, -f10 -l90
578 .
579 <strong>Substructure search</strong> in a previously prepared index file
580 \code
581 babel index.fs outfile.yyy -sSMILES
582 \endcode
583 The datafile can also be used as the input file, provided the input format is specified as fs
584 \code
585 babel datafile.xxx outfile.yyy -sSMILES -ifs
586 \endcode
587 A "slow" search not using fingerprints would be done on the same data by omitting -ifs.
588 A "slow" search can use SMARTS strings, but the fast search is restricted
589 to the subset which is valid SMILES.
590
591 With the -S option, the target can be specified as a molecule in a file of any format
592 \code
593 babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs
594 \endcode
595 These searches have two stages: a fingerprint search which produces
596 a number of candidate molecules and a definitive search which selects
597 from these using SMARTS pattern matching. The maximum number of candidates
598 is 4000 by default but you can change this with an option
599 e.g. -al 8000 (Note that you need the space between l and the number.)
600 If the fingerprint search reaches the maximum number it will not
601 look further and will tell you at what stage it stopped.
602
603 <strong>Similarity search</strong> in a previously prepared index file<br>
604 This rather done (rather than a substructure search) if the -at option is used,
605 \code
606 babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7
607 \endcode
608 for instance
609 - -at0.7 will recover all molecules with Tanimoto greater than 0.7
610 - -at15 (no decimal point) will recover the 15 molecules with largest coefficients.
611 - -aa will add the Tanimoto coefficient to the titles of the output molecules.
612
613 All stages, the indexing, the interpretation of the SMILES string in the -s option,
614 the file in the -S option and the final SMARTS filter convert to OBMol and apply
615 ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O
616 are recognized as chemically identical.
617 */
618
619 }//Openbabel
620
621 //! \file fingerprint.cpp
622 //! \brief Definitions for OBFingerprint base class and fastsearch classes