1 |
/********************************************************************** |
2 |
fingerprint.cpp - Implementation of fingerpring base class and fastsearching |
3 |
|
4 |
Copyright (C) 2005 by Chris Morley |
5 |
|
6 |
This file is part of the Open Babel project. |
7 |
For more information, see <http://openbabel.sourceforge.net/> |
8 |
|
9 |
This program is free software; you can redistribute it and/or modify |
10 |
it under the terms of the GNU General Public License as published by |
11 |
the Free Software Foundation version 2 of the License. |
12 |
|
13 |
This program is distributed in the hope that it will be useful, |
14 |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
GNU General Public License for more details. |
17 |
***********************************************************************/ |
18 |
|
19 |
#include "config.h" |
20 |
#include <vector> |
21 |
#include <algorithm> |
22 |
|
23 |
#if HAVE_IOSTREAM |
24 |
#include <iostream> |
25 |
#elif HAVE_IOSTREAM_H |
26 |
#include <iostream.h> |
27 |
#endif |
28 |
|
29 |
#if HAVE_FSTREAM |
30 |
#include <fstream> |
31 |
#elif HAVE_FSTREAM_H |
32 |
#include <fstream.h> |
33 |
#endif |
34 |
#include <vector> |
35 |
|
36 |
#include "fingerprint.hpp" |
37 |
#include "oberror.hpp" |
38 |
|
39 |
using namespace std; |
40 |
namespace OpenBabel { |
41 |
|
42 |
OBFingerprint* OBFingerprint::_pDefault; //static variable |
43 |
const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int); |
44 |
int OBFingerprint::rubbish = 666; |
45 |
|
46 |
void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n) |
47 |
{ |
48 |
vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint())); |
49 |
} |
50 |
|
51 |
//////////////////////////////////////// |
52 |
void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits) |
53 |
{ |
54 |
while(vec.size()*Getbitsperint()/2 >= nbits) |
55 |
vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2, |
56 |
vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end()); |
57 |
} |
58 |
|
59 |
//////////////////////////////////////// |
60 |
bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt) |
61 |
{ |
62 |
Fptpos iter; |
63 |
if(id.empty()) |
64 |
iter=FPtsMap().begin(); |
65 |
else |
66 |
{ |
67 |
iter=FPtsMap().find(id); |
68 |
if(iter!=FPtsMap().end()) |
69 |
++iter; |
70 |
} |
71 |
if(iter==FPtsMap().end()) |
72 |
return false; |
73 |
id = iter->first; |
74 |
pFPrt = iter->second; |
75 |
return true; |
76 |
} |
77 |
|
78 |
OBFingerprint* OBFingerprint::FindFingerprint(string& ID) |
79 |
{ |
80 |
if(ID.empty()) |
81 |
return _pDefault; |
82 |
Fptpos iter = FPtsMap().find(ID); |
83 |
if(iter==FPtsMap().end()) |
84 |
return NULL; |
85 |
else |
86 |
return iter->second; |
87 |
} |
88 |
|
89 |
double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2) |
90 |
{ |
91 |
//Independent of sizeof(unsigned int) |
92 |
if(vec1.size()!=vec2.size()) |
93 |
return -1; //different number of bits |
94 |
int andbits=0, orbits=0; |
95 |
for (unsigned i=0;i<vec1.size();++i) |
96 |
{ |
97 |
int andfp = vec1[i] & vec2[i]; |
98 |
int orfp = vec1[i] | vec2[i]; |
99 |
//Count bits |
100 |
for(;andfp;andfp=andfp<<1) |
101 |
if(andfp<0) ++andbits; |
102 |
for(;orfp;orfp=orfp<<1) |
103 |
if(orfp<0) ++orbits; |
104 |
} |
105 |
return((double)andbits/(double)orbits); |
106 |
} |
107 |
|
108 |
//*****************************************************************8 |
109 |
/*! |
110 |
|
111 |
*/ |
112 |
bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions, |
113 |
unsigned int MaxCandidates) |
114 |
{ |
115 |
///Finds chemical objects in datafilename (which must previously have been indexed) |
116 |
///that have all the same bits set in their fingerprints as in the fingerprint of |
117 |
///a pattern object. (Usually a substructure search in molecules.) |
118 |
///The type of fingerprint and its degree of folding does not have to be specified |
119 |
///here because the values in the index file are used. |
120 |
///The positions of the candidate matching molecules in the original datafile are returned. |
121 |
|
122 |
vector<unsigned int> vecwords; |
123 |
_pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint()); |
124 |
|
125 |
vector<unsigned int>candidates; //indices of matches from fingerprint screen |
126 |
candidates.reserve(MaxCandidates); |
127 |
|
128 |
unsigned int dataSize = _index.header.nEntries; |
129 |
// GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype); |
130 |
|
131 |
unsigned int words = _index.header.words; |
132 |
unsigned int* nextp = &_index.fptdata[0]; |
133 |
unsigned int* ppat0 = &vecwords[0]; |
134 |
register unsigned int* p; |
135 |
register unsigned int* ppat; |
136 |
register unsigned int a; |
137 |
unsigned int i; // need address of this, can't be register |
138 |
for(i=0;i<dataSize; i++) //speed critical section |
139 |
{ |
140 |
p=nextp; |
141 |
nextp += words; |
142 |
ppat=ppat0; |
143 |
a=0; |
144 |
while(p<nextp) |
145 |
{ |
146 |
if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break; |
147 |
} |
148 |
if(!a) |
149 |
{ |
150 |
candidates.push_back(i); |
151 |
if(candidates.size()>=MaxCandidates) |
152 |
break; |
153 |
} |
154 |
} |
155 |
|
156 |
if(i<_index.header.nEntries) //premature end to search |
157 |
{ |
158 |
#ifdef HAVE_SSTREAM |
159 |
stringstream errorMsg; |
160 |
#else |
161 |
strstream errorMsg; |
162 |
#endif |
163 |
errorMsg << "Stopped looking after " << i << " molecules." << endl; |
164 |
obErrorLog.ThrowError(__func__, errorMsg.str(), obWarning); |
165 |
} |
166 |
|
167 |
vector<unsigned int>::iterator itr; |
168 |
for(itr=candidates.begin();itr!=candidates.end();itr++) |
169 |
{ |
170 |
SeekPositions.push_back(_index.seekdata[*itr]); |
171 |
} |
172 |
return true; |
173 |
} |
174 |
|
175 |
///////////////////////////////////////////////////////// |
176 |
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
177 |
double MinTani) |
178 |
{ |
179 |
vector<unsigned int> targetfp; |
180 |
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); |
181 |
|
182 |
unsigned int words = _index.header.words; |
183 |
unsigned int dataSize = _index.header.nEntries; |
184 |
unsigned int* nextp = &_index.fptdata[0]; |
185 |
register unsigned int* p; |
186 |
register unsigned int i; |
187 |
for(i=0;i<dataSize; i++) //speed critical section |
188 |
{ |
189 |
p=nextp; |
190 |
nextp += words; |
191 |
double tani = OBFingerprint::Tanimoto(targetfp,p); |
192 |
if(tani>MinTani) |
193 |
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
194 |
} |
195 |
return true; |
196 |
} |
197 |
|
198 |
///////////////////////////////////////////////////////// |
199 |
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
200 |
int nCandidates) |
201 |
{ |
202 |
///If nCandidates is zero or omitted the original size of the multimap is used |
203 |
if(nCandidates) |
204 |
{ |
205 |
//initialise the multimap with nCandidate zero entries |
206 |
SeekposMap.clear(); |
207 |
int i; |
208 |
for(i=0;i<nCandidates;++i) |
209 |
SeekposMap.insert(pair<const double, unsigned int>(0,0)); |
210 |
} |
211 |
else if(SeekposMap.size()==0) |
212 |
return false; |
213 |
|
214 |
vector<unsigned int> targetfp; |
215 |
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); |
216 |
|
217 |
unsigned int words = _index.header.words; |
218 |
unsigned int dataSize = _index.header.nEntries; |
219 |
unsigned int* nextp = &_index.fptdata[0]; |
220 |
register unsigned int* p; |
221 |
register unsigned int i; |
222 |
for(i=0;i<dataSize; i++) //speed critical section |
223 |
{ |
224 |
p=nextp; |
225 |
nextp += words; |
226 |
double tani = OBFingerprint::Tanimoto(targetfp,p); |
227 |
if(tani>SeekposMap.begin()->first) |
228 |
{ |
229 |
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
230 |
SeekposMap.erase(SeekposMap.begin()); |
231 |
} |
232 |
} |
233 |
return true; |
234 |
} |
235 |
|
236 |
///////////////////////////////////////////////////////// |
237 |
string FastSearch::ReadIndex(istream* pIndexstream) |
238 |
{ |
239 |
//Reads fs index from istream into member variables |
240 |
_index.Read(pIndexstream); |
241 |
|
242 |
_pFP = _index.CheckFP(); |
243 |
if(!_pFP) |
244 |
*(_index.header.datafilename) = '\0'; |
245 |
|
246 |
return _index.header.datafilename; //will be empty on error |
247 |
} |
248 |
|
249 |
////////////////////////////////////////////////////////// |
250 |
bool FptIndex::Read(istream* pIndexstream) |
251 |
{ |
252 |
pIndexstream->read((char*)&(header), sizeof(FptIndexHeader)); |
253 |
pIndexstream->seekg(header.headerlength);//allows header length to be changed |
254 |
if(pIndexstream->fail() || header.headerlength != sizeof(FptIndexHeader)) |
255 |
{ |
256 |
*(header.datafilename) = '\0'; |
257 |
return false; |
258 |
} |
259 |
|
260 |
unsigned int nwords = header.nEntries * header.words; |
261 |
fptdata.resize(nwords); |
262 |
seekdata.resize(header.nEntries); |
263 |
|
264 |
pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords); |
265 |
pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned int) * header.nEntries); |
266 |
|
267 |
if(pIndexstream->fail()) |
268 |
{ |
269 |
*(header.datafilename) = '\0'; |
270 |
return false; |
271 |
} |
272 |
return true; |
273 |
} |
274 |
|
275 |
////////////////////////////////////////////////////////// |
276 |
OBFingerprint* FptIndex::CheckFP() |
277 |
{ |
278 |
//check that fingerprint type is available |
279 |
string tempFP(header.fpid); |
280 |
OBFingerprint* pFP = OBFingerprint::FindFingerprint(tempFP); |
281 |
if(!pFP) |
282 |
{ |
283 |
#ifdef HAVE_SSTREAM |
284 |
stringstream errorMsg; |
285 |
#else |
286 |
strstream errorMsg; |
287 |
#endif |
288 |
errorMsg << "Index has Fingerprints of type '" << header.fpid |
289 |
<< " which is not currently loaded." << endl; |
290 |
obErrorLog.ThrowError(__func__, errorMsg.str(), obError); |
291 |
} |
292 |
return pFP; //NULL if not available |
293 |
} |
294 |
|
295 |
//******************************************************* |
296 |
FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os, |
297 |
std::string& fpid, int FptBits) |
298 |
{ |
299 |
///Starts indexing process |
300 |
_indexstream = os; |
301 |
_nbits=FptBits; |
302 |
_pindex= new FptIndex; |
303 |
_pindex->header.headerlength = sizeof(FptIndexHeader); |
304 |
strncpy(_pindex->header.fpid,fpid.c_str(),15); |
305 |
strncpy(_pindex->header.datafilename, datafilename.c_str(), 255); |
306 |
|
307 |
//check that fingerprint type is available |
308 |
_pFP = _pindex->CheckFP(); |
309 |
} |
310 |
|
311 |
///////////////////////////////////////////////////////////// |
312 |
FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os) |
313 |
{ |
314 |
//Uses existing index |
315 |
_indexstream = os; |
316 |
_pindex = pindex; |
317 |
_nbits = _pindex->header.words * OBFingerprint::Getbitsperint(); |
318 |
|
319 |
//check that fingerprint type is available |
320 |
_pFP = _pindex->CheckFP(); |
321 |
} |
322 |
|
323 |
///////////////////////////////////////////////////////////// |
324 |
FastSearchIndexer::~FastSearchIndexer() |
325 |
{ |
326 |
///Saves index file |
327 |
_pindex->header.nEntries = _pindex->seekdata.size(); |
328 |
_indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader)); |
329 |
_indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int)); |
330 |
_indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int)); |
331 |
if(!_indexstream) |
332 |
obErrorLog.ThrowError(__func__, |
333 |
"Difficulty writing index", obWarning); |
334 |
delete _pindex; |
335 |
} |
336 |
|
337 |
/////////////////////////////////////////////////////////////// |
338 |
bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos) |
339 |
{ |
340 |
///Adds a fingerprint |
341 |
|
342 |
vector<unsigned int> vecwords; |
343 |
if(!_pFP) |
344 |
return false; |
345 |
if(_pFP->GetFingerprint(pOb, vecwords, _nbits)) |
346 |
{ |
347 |
_pindex->header.words = vecwords.size(); //Use size as returned from fingerprint |
348 |
for(unsigned int i=0;i<_pindex->header.words;i++) |
349 |
_pindex->fptdata.push_back(vecwords[i]); |
350 |
_pindex->seekdata.push_back(seekpos); |
351 |
return true; |
352 |
} |
353 |
obErrorLog.ThrowError(__func__, "Failed to make a fingerprint", obWarning); |
354 |
return false; |
355 |
} |
356 |
|
357 |
/*! |
358 |
\class OBFingerprint |
359 |
These fingerprints are condensed representation of molecules (or other objects) |
360 |
as a list of boolean values (actually bits in a vector<unsigned>) with length |
361 |
of a power of 2. The main motivation is for fast searching of data sources |
362 |
containing large numbers of molecules (up to several million). OpenBabel |
363 |
provides some routines which can search text files containing lists of molecules |
364 |
in any format. See the documentation on the class FastSearch. |
365 |
|
366 |
There are descriptions of molecular fingerprints at <br> |
367 |
http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br> |
368 |
http://www.mesaac.com/Fingerprint.htm <br> |
369 |
Many methods of preparing fingerprints have been described, but the type supported |
370 |
currently in OpenBabel has each bit representing a substructure (or other |
371 |
molecular property). If a substructure is present in the molecule, then a |
372 |
particular bit is set to 1. But because the hashing method may also map other |
373 |
substructures to the same bit, a match does not guarantee that a particular |
374 |
substructure is present; there may be false positives. However, with proper design, |
375 |
a large fraction of irrelevant molecules in a data set can be eliminated in a |
376 |
fast search with boolean methods on the fingerprints. |
377 |
It then becomes feasible to make a definitive substructure search by |
378 |
conventional methods on this reduced list even if it is slow. |
379 |
|
380 |
OpenBabel provides a framework for applying new types of fingerprints without |
381 |
changing any existing code. They are derived from OBFingerprint and the |
382 |
source file is just compiled with the rest of OpenBabel. Alternatively, |
383 |
they can be separately compiled as a DLL or shared library and discovered |
384 |
when OpenBabel runs. |
385 |
|
386 |
Fingerprints derived from this abstract base class OBFingerprint can be for any |
387 |
object derived from OBBase (not just for OBMol). |
388 |
Each derived class provides an ID as a string and OBFingerprint keeps a map of |
389 |
these to provides a pointer to the class when requested in FindFingerprint. |
390 |
|
391 |
<h4>-- To define a fingerprint type --</h4> |
392 |
The classes derived form OBFingerprint are required to provide |
393 |
a GetFingerprint() routine and a Description() routine |
394 |
\code |
395 |
class MyFpType : OBFingerprint |
396 |
{ |
397 |
MyFpType(const char* id) : OBFingerprint(id){}; |
398 |
virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits) |
399 |
{ |
400 |
//Convert pOb to the required type, usually OBMol |
401 |
OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
402 |
fp.resize(required_number_of_words); |
403 |
... |
404 |
use SetBit(fp,n); to set the nth bit |
405 |
|
406 |
if(nbits) |
407 |
Fold(fp, nbits); |
408 |
} |
409 |
virtual const char* Description(){ return "Some descriptive text";} |
410 |
... |
411 |
}; |
412 |
\endcode |
413 |
//Declare a global instance with the ID you will use in -f options to specify its use. |
414 |
\code |
415 |
MyFpType theMyFpType("myfpID"); |
416 |
\endcode |
417 |
|
418 |
<h4>-- To obtain a fingerprint --</h4> |
419 |
\code |
420 |
OBMol mol; |
421 |
... |
422 |
vector<unsigned int> fp; |
423 |
OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint |
424 |
\endcode |
425 |
or |
426 |
\code |
427 |
vector<unsigned int> fp; |
428 |
OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID"); |
429 |
...and maybe... |
430 |
pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger |
431 |
\endcode |
432 |
|
433 |
<h4>-- To print a list of available fingerprint types --</h4> |
434 |
\code |
435 |
std::string id; |
436 |
OBFingerPrint* pFPrt=NULL; |
437 |
while(OBFingerprint::GetNextFPrt(id, pFPrt)) |
438 |
{ |
439 |
cout << id << " -- " << pFPrt->Description() << endl; |
440 |
} |
441 |
\endcode |
442 |
|
443 |
|
444 |
Fingerprints are handled as vector<unsigned int> so that the number of bits |
445 |
in this vector and their order will be platform and compiler |
446 |
dependent, because of size of int and endian differences. Use fingerprints |
447 |
(and fastsearch indexes containing them) only for comparing with other |
448 |
fingerprints prepared on the same machine. |
449 |
|
450 |
The FingerprintFormat class is an output format which displays fingerprints |
451 |
as hexadecimal. When multiple molecules are supplied it will calculate the |
452 |
Tanimoto coefficient from the first molecule to each of the others. It also |
453 |
shows whether the first molecule is a possible substructure to all the others, |
454 |
i.e. whether all the bits set in the fingerprint for the first molecule are |
455 |
set in the fingerprint of the others. To display hexadecimal information when |
456 |
multiple molecules are provided it is necessay to use the -xh option. |
457 |
|
458 |
To see a list of available format types, type babel -F on the command line. |
459 |
The -xF option of the FingerprintFormat class also provides this output, but due |
460 |
to a quirk in the way the program works, it is necessary to have a valid input |
461 |
molecule for this option to work. |
462 |
*/ |
463 |
|
464 |
/*! \class FastSearch |
465 |
The FastSearch class searches an index to a datafile containing a list of molecules |
466 |
(or other objects) prepared by FastSearchIndexer. |
467 |
|
468 |
OpenBabel can also search files for molecules containing a substructure specified |
469 |
by a SMARTS string, using OBSmartsPattern or from the command line: |
470 |
\code |
471 |
babel datafile.xxx -outfile.yyy -sSMARTS |
472 |
\endcode |
473 |
But when the data file contains more than about 10,000 molecules this becomes |
474 |
rather too slow to be used interactively. To do it more quickly, an index |
475 |
of the molecules containing their fingerprints (see OBFingerprint) is prepared using |
476 |
FastSearchIndexer. The indexing process may take a long time but only has to |
477 |
be done once. The index can be searched very quickly with FastSearch. Because |
478 |
of the nature of fingerprints a match to a bit does not guarantee |
479 |
the presence of a particular substructure or other molecular property, so that |
480 |
a definitive answer may require a subsequent search of the (much reduced) list |
481 |
of candidates by another method (like OBSmartsPattern). |
482 |
|
483 |
Note that the index files are not portable. They need to be prepared on the |
484 |
computer that will acess them. |
485 |
|
486 |
<h4>Using FastSearch and FastSearchIndexer in a program</h4> |
487 |
|
488 |
The index has two tables: |
489 |
- an array of fingerprints of the molecules |
490 |
- an array of the seek positions in the datasource file of all the molecules |
491 |
|
492 |
<h4>To prepare an fastsearch index file:</h4> |
493 |
- Open an ostream to the index file. |
494 |
- Make a FastSearchIndexer object on the heap or the stack, passing in as parameters: |
495 |
- the datafile name, the indexfile stream, |
496 |
- the id of the fingerprint type to be used, |
497 |
- the number of bits the fingerprint is to be folded down to, If it is to be left |
498 |
unfolded, set fpid to 0 or do not specify it. |
499 |
. |
500 |
- For each molecule, call Add() with its pointer and its position in the datafile.<br> |
501 |
Currently the std::streampos value is implicitly cast to unsigned int so that |
502 |
for 32bit machines the datafile has to be no longer than about 2Gbyte. |
503 |
- The index file is written when the FastSearchIndexer object is deleted or goes out |
504 |
of scope. |
505 |
|
506 |
<h4>To search in a fastsearch index file</h4> |
507 |
|
508 |
- Open a std::istream to the indexfile (in binary mode on some systems) |
509 |
- Make a FastSearch object, read the index and open the datafile whose |
510 |
name it provides |
511 |
\code |
512 |
ifstream ifs(indexname,ios::binary); |
513 |
FastSearch fs; |
514 |
string datafilename = fs.ReadIndex(&ifs); |
515 |
if(datafilename.empty()/endcode |
516 |
return false; |
517 |
ifstream datastream(datafilename); |
518 |
if(!datastream) |
519 |
return false; |
520 |
\endcode |
521 |
|
522 |
<strong>To do a search for molecules which have all the substructure bits the |
523 |
OBMol object, patternMol</strong> |
524 |
\code |
525 |
vector<unsigned int>& SeekPositions; |
526 |
if(!fs.Find(patternMol, SeekPositions, MaxCandidates)) |
527 |
|
528 |
for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++) |
529 |
{ |
530 |
datastream.seekg(*itr); |
531 |
... read the candidate molecule |
532 |
and subject to more rigorous test if necessary |
533 |
} |
534 |
\endcode |
535 |
|
536 |
<strong>To do a similarity search based on the Tanimoto coefficient</strong> |
537 |
This is defined as: <br> |
538 |
<em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br> |
539 |
where the boolean operations between the fingerprints are bitwise<br> |
540 |
The Tanimoto coefficient has no absolute meaning and depends on |
541 |
the design of the fingerprint. |
542 |
\code |
543 |
multimap<double, unsigned int> SeekposMap; |
544 |
// to find n best molecules |
545 |
fs.FindSimilar(&patternMol, SeekposMap, n); |
546 |
...or |
547 |
// to find molecules with Tanimoto coefficient > MinTani |
548 |
fs.FindSimilar(&patternMol, SeekposMap, MinTani); |
549 |
|
550 |
multimap<double, unsigned int>::reverse_iterator itr; |
551 |
for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) |
552 |
{ |
553 |
datastream.seekg(itr->second); |
554 |
... read the candidate molecule |
555 |
double tani = itr->first; |
556 |
} |
557 |
\endcode |
558 |
|
559 |
The FastSearchFormat class facilitates the use of these routine from the |
560 |
command line or other front end program. For instance: |
561 |
|
562 |
<strong>Prepare an index:</strong> |
563 |
\code |
564 |
babel datafile.xxx index.fs |
565 |
\endcode |
566 |
or |
567 |
\code |
568 |
babel datafile.xxx -ofs namedindex |
569 |
\endcode |
570 |
With options you can specify: |
571 |
- which type of fingerprint to be used, e.g. -xfFP2, |
572 |
- whether it is folded to a specified number of bits, e.g. -xn128 |
573 |
(which should be a power of2) |
574 |
- whether to pre-select the molecules which are indexed: |
575 |
- by structure e.g only ethers and esters, -sCOC |
576 |
- by excluding molecules with bezene rings, -vc1ccccc1 |
577 |
- by position in the datafile e.g. only mols 10 to 90, -f10 -l90 |
578 |
. |
579 |
<strong>Substructure search</strong> in a previously prepared index file |
580 |
\code |
581 |
babel index.fs outfile.yyy -sSMILES |
582 |
\endcode |
583 |
The datafile can also be used as the input file, provided the input format is specified as fs |
584 |
\code |
585 |
babel datafile.xxx outfile.yyy -sSMILES -ifs |
586 |
\endcode |
587 |
A "slow" search not using fingerprints would be done on the same data by omitting -ifs. |
588 |
A "slow" search can use SMARTS strings, but the fast search is restricted |
589 |
to the subset which is valid SMILES. |
590 |
|
591 |
With the -S option, the target can be specified as a molecule in a file of any format |
592 |
\code |
593 |
babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs |
594 |
\endcode |
595 |
These searches have two stages: a fingerprint search which produces |
596 |
a number of candidate molecules and a definitive search which selects |
597 |
from these using SMARTS pattern matching. The maximum number of candidates |
598 |
is 4000 by default but you can change this with an option |
599 |
e.g. -al 8000 (Note that you need the space between l and the number.) |
600 |
If the fingerprint search reaches the maximum number it will not |
601 |
look further and will tell you at what stage it stopped. |
602 |
|
603 |
<strong>Similarity search</strong> in a previously prepared index file<br> |
604 |
This rather done (rather than a substructure search) if the -at option is used, |
605 |
\code |
606 |
babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7 |
607 |
\endcode |
608 |
for instance |
609 |
- -at0.7 will recover all molecules with Tanimoto greater than 0.7 |
610 |
- -at15 (no decimal point) will recover the 15 molecules with largest coefficients. |
611 |
- -aa will add the Tanimoto coefficient to the titles of the output molecules. |
612 |
|
613 |
All stages, the indexing, the interpretation of the SMILES string in the -s option, |
614 |
the file in the -S option and the final SMARTS filter convert to OBMol and apply |
615 |
ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O |
616 |
are recognized as chemically identical. |
617 |
*/ |
618 |
|
619 |
}//Openbabel |
620 |
|
621 |
//! \file fingerprint.cpp |
622 |
//! \brief Definitions for OBFingerprint base class and fastsearch classes |