1 |
tim |
741 |
/********************************************************************** |
2 |
|
|
fingerprint.cpp - Implementation of fingerpring base class and fastsearching |
3 |
|
|
|
4 |
|
|
Copyright (C) 2005 by Chris Morley |
5 |
|
|
|
6 |
|
|
This file is part of the Open Babel project. |
7 |
|
|
For more information, see <http://openbabel.sourceforge.net/> |
8 |
|
|
|
9 |
|
|
This program is free software; you can redistribute it and/or modify |
10 |
|
|
it under the terms of the GNU General Public License as published by |
11 |
|
|
the Free Software Foundation version 2 of the License. |
12 |
|
|
|
13 |
|
|
This program is distributed in the hope that it will be useful, |
14 |
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
|
|
GNU General Public License for more details. |
17 |
|
|
***********************************************************************/ |
18 |
|
|
|
19 |
gezelter |
751 |
#include "config.h" |
20 |
tim |
741 |
#include <vector> |
21 |
|
|
#include <algorithm> |
22 |
|
|
|
23 |
|
|
#if HAVE_IOSTREAM |
24 |
|
|
#include <iostream> |
25 |
|
|
#elif HAVE_IOSTREAM_H |
26 |
|
|
#include <iostream.h> |
27 |
|
|
#endif |
28 |
|
|
|
29 |
|
|
#if HAVE_FSTREAM |
30 |
|
|
#include <fstream> |
31 |
|
|
#elif HAVE_FSTREAM_H |
32 |
|
|
#include <fstream.h> |
33 |
|
|
#endif |
34 |
|
|
#include <vector> |
35 |
|
|
|
36 |
|
|
#include "fingerprint.hpp" |
37 |
|
|
#include "oberror.hpp" |
38 |
|
|
|
39 |
|
|
using namespace std; |
40 |
|
|
namespace OpenBabel { |
41 |
|
|
|
42 |
|
|
OBFingerprint* OBFingerprint::_pDefault; //static variable |
43 |
|
|
const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int); |
44 |
|
|
|
45 |
|
|
void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n) |
46 |
|
|
{ |
47 |
|
|
vec[n/bitsperint] |= (1 << (n % bitsperint)); |
48 |
|
|
} |
49 |
|
|
|
50 |
|
|
//////////////////////////////////////// |
51 |
|
|
void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits) |
52 |
|
|
{ |
53 |
|
|
while(vec.size()*bitsperint/2 >= nbits) |
54 |
|
|
vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2, |
55 |
|
|
vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end()); |
56 |
|
|
} |
57 |
|
|
|
58 |
|
|
//////////////////////////////////////// |
59 |
|
|
bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt) |
60 |
|
|
{ |
61 |
|
|
Fptpos iter; |
62 |
|
|
if(id.empty()) |
63 |
|
|
iter=FPtsMap().begin(); |
64 |
|
|
else |
65 |
|
|
{ |
66 |
|
|
iter=FPtsMap().find(id); |
67 |
|
|
if(iter!=FPtsMap().end()) |
68 |
|
|
++iter; |
69 |
|
|
} |
70 |
|
|
if(iter==FPtsMap().end()) |
71 |
|
|
return false; |
72 |
|
|
id = iter->first; |
73 |
|
|
pFPrt = iter->second; |
74 |
|
|
return true; |
75 |
|
|
} |
76 |
|
|
|
77 |
|
|
OBFingerprint* OBFingerprint::FindFingerprint(string& ID) |
78 |
|
|
{ |
79 |
|
|
if(ID.empty()) |
80 |
|
|
return _pDefault; |
81 |
|
|
Fptpos iter = FPtsMap().find(ID); |
82 |
|
|
if(iter==FPtsMap().end()) |
83 |
|
|
return NULL; |
84 |
|
|
else |
85 |
|
|
return iter->second; |
86 |
|
|
} |
87 |
|
|
|
88 |
|
|
double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2) |
89 |
|
|
{ |
90 |
|
|
//Independent of sizeof(unsigned int) |
91 |
|
|
if(vec1.size()!=vec2.size()) |
92 |
|
|
return -1; //different number of bits |
93 |
|
|
int andbits=0, orbits=0; |
94 |
|
|
for (unsigned i=0;i<vec1.size();++i) |
95 |
|
|
{ |
96 |
|
|
int andfp = vec1[i] & vec2[i]; |
97 |
|
|
int orfp = vec1[i] | vec2[i]; |
98 |
|
|
//Count bits |
99 |
|
|
for(;andfp;andfp=andfp<<1) |
100 |
|
|
if(andfp<0) ++andbits; |
101 |
|
|
for(;orfp;orfp=orfp<<1) |
102 |
|
|
if(orfp<0) ++orbits; |
103 |
|
|
} |
104 |
|
|
return((double)andbits/(double)orbits); |
105 |
|
|
} |
106 |
|
|
|
107 |
|
|
//*****************************************************************8 |
108 |
|
|
/*! |
109 |
|
|
|
110 |
|
|
*/ |
111 |
|
|
bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions, |
112 |
|
|
unsigned int MaxCandidates) |
113 |
|
|
{ |
114 |
|
|
///Finds chemical objects in datafilename (which must previously have been indexed) |
115 |
|
|
///that have all the same bits set in their fingerprints as in the fingerprint of |
116 |
|
|
///a pattern object. (Usually a substructure search in molecules.) |
117 |
|
|
///The type of fingerprint and its degree of folding does not have to be specified |
118 |
|
|
///here because the values in the index file are used. |
119 |
|
|
///The positions of the candidate matching molecules in the original datafile are returned. |
120 |
|
|
|
121 |
|
|
vector<unsigned int> vecwords; |
122 |
|
|
_pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::bitsperint); |
123 |
|
|
|
124 |
|
|
vector<unsigned int>candidates; //indices of matches from fingerprint screen |
125 |
|
|
candidates.reserve(MaxCandidates); |
126 |
|
|
|
127 |
|
|
unsigned int dataSize = _index.header.nEntries; |
128 |
|
|
// GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype); |
129 |
|
|
|
130 |
|
|
unsigned int words = _index.header.words; |
131 |
|
|
unsigned int* nextp = &_index.fptdata[0]; |
132 |
|
|
unsigned int* ppat0 = &vecwords[0]; |
133 |
|
|
register unsigned int* p; |
134 |
|
|
register unsigned int* ppat; |
135 |
|
|
register unsigned int a; |
136 |
|
|
unsigned int i; // need address of this, can't be register |
137 |
|
|
for(i=0;i<dataSize; i++) //speed critical section |
138 |
|
|
{ |
139 |
|
|
p=nextp; |
140 |
|
|
nextp += words; |
141 |
|
|
ppat=ppat0; |
142 |
|
|
a=0; |
143 |
|
|
while(p<nextp) |
144 |
|
|
{ |
145 |
|
|
if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break; |
146 |
|
|
} |
147 |
|
|
if(!a) |
148 |
|
|
{ |
149 |
|
|
candidates.push_back(i); |
150 |
|
|
if(candidates.size()>=MaxCandidates) |
151 |
|
|
break; |
152 |
|
|
} |
153 |
|
|
} |
154 |
|
|
|
155 |
|
|
if(i<_index.header.nEntries) //premature end to search |
156 |
|
|
{ |
157 |
|
|
#ifdef HAVE_SSTREAM |
158 |
|
|
stringstream errorMsg; |
159 |
|
|
#else |
160 |
|
|
strstream errorMsg; |
161 |
|
|
#endif |
162 |
|
|
errorMsg << "Stopped looking after " << i << " molecules." << endl; |
163 |
|
|
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obInfo); |
164 |
|
|
} |
165 |
|
|
|
166 |
|
|
vector<unsigned int>::iterator itr; |
167 |
|
|
for(itr=candidates.begin();itr!=candidates.end();itr++) |
168 |
|
|
{ |
169 |
|
|
SeekPositions.push_back(_index.seekdata[*itr]); |
170 |
|
|
} |
171 |
|
|
return true; |
172 |
|
|
} |
173 |
|
|
|
174 |
|
|
///////////////////////////////////////////////////////// |
175 |
|
|
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
176 |
|
|
double MinTani) |
177 |
|
|
{ |
178 |
|
|
vector<unsigned int> targetfp; |
179 |
|
|
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint); |
180 |
|
|
|
181 |
|
|
unsigned int words = _index.header.words; |
182 |
|
|
unsigned int dataSize = _index.header.nEntries; |
183 |
|
|
unsigned int* nextp = &_index.fptdata[0]; |
184 |
|
|
register unsigned int* p; |
185 |
|
|
register unsigned int i; |
186 |
|
|
for(i=0;i<dataSize; i++) //speed critical section |
187 |
|
|
{ |
188 |
|
|
p=nextp; |
189 |
|
|
nextp += words; |
190 |
|
|
double tani = OBFingerprint::Tanimoto(targetfp,p); |
191 |
|
|
if(tani>MinTani) |
192 |
|
|
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
193 |
|
|
} |
194 |
|
|
return true; |
195 |
|
|
} |
196 |
|
|
|
197 |
|
|
///////////////////////////////////////////////////////// |
198 |
|
|
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
199 |
|
|
int nCandidates) |
200 |
|
|
{ |
201 |
|
|
///If nCandidates is zero or omitted the original size of the multimap is used |
202 |
|
|
if(nCandidates) |
203 |
|
|
{ |
204 |
|
|
//initialise the multimap with nCandidate zero entries |
205 |
|
|
SeekposMap.clear(); |
206 |
|
|
int i; |
207 |
|
|
for(i=0;i<nCandidates;++i) |
208 |
|
|
SeekposMap.insert(pair<const double, unsigned int>(0,0)); |
209 |
|
|
} |
210 |
|
|
else if(SeekposMap.size()==0) |
211 |
|
|
return false; |
212 |
|
|
|
213 |
|
|
vector<unsigned int> targetfp; |
214 |
|
|
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::bitsperint); |
215 |
|
|
|
216 |
|
|
unsigned int words = _index.header.words; |
217 |
|
|
unsigned int dataSize = _index.header.nEntries; |
218 |
|
|
unsigned int* nextp = &_index.fptdata[0]; |
219 |
|
|
register unsigned int* p; |
220 |
|
|
register unsigned int i; |
221 |
|
|
for(i=0;i<dataSize; i++) //speed critical section |
222 |
|
|
{ |
223 |
|
|
p=nextp; |
224 |
|
|
nextp += words; |
225 |
|
|
double tani = OBFingerprint::Tanimoto(targetfp,p); |
226 |
|
|
if(tani>SeekposMap.begin()->first) |
227 |
|
|
{ |
228 |
|
|
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
229 |
|
|
SeekposMap.erase(SeekposMap.begin()); |
230 |
|
|
} |
231 |
|
|
} |
232 |
|
|
return true; |
233 |
|
|
}///////////////////////////////////////////////////////// |
234 |
|
|
string FastSearch::ReadIndex(istream* pIndexstream) |
235 |
|
|
{ |
236 |
|
|
//Reads fs index from istream into member variables |
237 |
|
|
// but first checks whether it is already loaded |
238 |
|
|
FptIndexHeader headercopy = _index.header; |
239 |
|
|
pIndexstream->read((char*)&(_index.header), sizeof(FptIndexHeader)); |
240 |
|
|
|
241 |
|
|
if(memcmp(&headercopy,&(_index.header),sizeof(FptIndexHeader))) |
242 |
|
|
{ |
243 |
|
|
pIndexstream->seekg(_index.header.headerlength);//allows header length to be changed |
244 |
|
|
|
245 |
|
|
unsigned int nwords = _index.header.nEntries * _index.header.words; |
246 |
|
|
_index.fptdata.resize(nwords); |
247 |
|
|
_index.seekdata.resize(_index.header.nEntries); |
248 |
|
|
|
249 |
|
|
pIndexstream->read((char*)&(_index.fptdata[0]), sizeof(unsigned int) * nwords); |
250 |
|
|
pIndexstream->read((char*)&(_index.seekdata[0]), sizeof(unsigned int) * _index.header.nEntries); |
251 |
|
|
|
252 |
|
|
if(pIndexstream->fail()) |
253 |
|
|
*(_index.header.datafilename) = '\0'; |
254 |
|
|
|
255 |
|
|
string tempFP(_index.header.fpid); |
256 |
|
|
_pFP = OBFingerprint::FindFingerprint(tempFP); |
257 |
|
|
if(!_pFP) |
258 |
|
|
{ |
259 |
|
|
#ifdef HAVE_SSTREAM |
260 |
|
|
stringstream errorMsg; |
261 |
|
|
#else |
262 |
|
|
strstream errorMsg; |
263 |
|
|
#endif |
264 |
|
|
errorMsg << "Index has Fingerprints of type '" << _index.header.fpid |
265 |
|
|
<< " which is not currently loaded." << endl; |
266 |
|
|
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning); |
267 |
|
|
*(_index.header.datafilename) = '\0'; |
268 |
|
|
} |
269 |
|
|
|
270 |
|
|
} |
271 |
|
|
return _index.header.datafilename; |
272 |
|
|
} |
273 |
|
|
|
274 |
|
|
//******************************************************* |
275 |
|
|
FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os, |
276 |
|
|
std::string& fpid, int FptBits) |
277 |
|
|
{ |
278 |
|
|
///Starts indexing process |
279 |
|
|
_indexstream = os; |
280 |
|
|
_pFP = OBFingerprint::FindFingerprint(fpid); |
281 |
|
|
if(!_pFP) |
282 |
|
|
{ |
283 |
|
|
#ifdef HAVE_SSTREAM |
284 |
|
|
stringstream errorMsg; |
285 |
|
|
#else |
286 |
|
|
strstream errorMsg; |
287 |
|
|
#endif |
288 |
|
|
errorMsg << "Fingerprint type '" << fpid << "' not available" << endl; |
289 |
|
|
obErrorLog.ThrowError(__FUNCTION__, errorMsg.str(), obWarning); |
290 |
|
|
} |
291 |
|
|
|
292 |
|
|
_nbits=FptBits; |
293 |
|
|
_pindex= new FptIndex; |
294 |
|
|
_pindex->header.headerlength = sizeof(FptIndexHeader); |
295 |
|
|
strncpy(_pindex->header.fpid,fpid.c_str(),15); |
296 |
|
|
strncpy(_pindex->header.datafilename, datafilename.c_str(), 255); |
297 |
|
|
} |
298 |
|
|
|
299 |
|
|
///////////////////////////////////////////////////////////// |
300 |
|
|
FastSearchIndexer::~FastSearchIndexer() |
301 |
|
|
{ |
302 |
|
|
///Saves index file |
303 |
|
|
_pindex->header.nEntries = _pindex->seekdata.size(); |
304 |
|
|
_indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader)); |
305 |
|
|
_indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int)); |
306 |
|
|
_indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int)); |
307 |
|
|
if(!_indexstream) |
308 |
|
|
obErrorLog.ThrowError(__FUNCTION__, |
309 |
|
|
"Difficulty writing index", obWarning); |
310 |
|
|
delete _pindex; |
311 |
|
|
} |
312 |
|
|
|
313 |
|
|
/////////////////////////////////////////////////////////////// |
314 |
|
|
bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos) |
315 |
|
|
{ |
316 |
|
|
///Adds a fingerprint |
317 |
|
|
|
318 |
|
|
vector<unsigned int> vecwords; |
319 |
|
|
if(!_pFP) |
320 |
|
|
return false; |
321 |
|
|
if(_pFP->GetFingerprint(pOb, vecwords, _nbits)) |
322 |
|
|
{ |
323 |
|
|
_pindex->header.words = vecwords.size(); //Use size as returned from fingerprint |
324 |
|
|
for(unsigned int i=0;i<_pindex->header.words;i++) |
325 |
|
|
_pindex->fptdata.push_back(vecwords[i]); |
326 |
|
|
_pindex->seekdata.push_back(seekpos); |
327 |
|
|
return true; |
328 |
|
|
} |
329 |
|
|
obErrorLog.ThrowError(__FUNCTION__, "Failed to make a fingerprint", obWarning); |
330 |
|
|
return false; |
331 |
|
|
|
332 |
|
|
} |
333 |
|
|
|
334 |
|
|
/*! |
335 |
|
|
\class OBFingerprint |
336 |
|
|
These fingerprints are condensed representation of molecules (or other objects) |
337 |
|
|
as a list of boolean values (actually bits in a vector<unsigned>) with length |
338 |
|
|
of a power of 2. The main motivation is for fast searching of data sources |
339 |
|
|
containing large numbers of molecules (up to several million). OpenBabel |
340 |
|
|
provides some routines which can search text files containing lists of molecules |
341 |
|
|
in any format. See the documentation on the class FastSearch. |
342 |
|
|
|
343 |
|
|
There are descriptions of molecular fingerprints at <br> |
344 |
|
|
http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br> |
345 |
|
|
http://www.mesaac.com/Fingerprint.htm <br> |
346 |
|
|
Many methods of preparing fingerprints have been described, but the type supported |
347 |
|
|
currently in OpenBabel has each bit representing a substructure (or other |
348 |
|
|
molecular property). If a substructure is present in the molecule, then a |
349 |
|
|
particular bit is set to 1. But because the hashing method may also map other |
350 |
|
|
substructures to the same bit, a match does not guarantee that a particular |
351 |
|
|
substructure is present; there may be false positives. However, with proper design, |
352 |
|
|
a large fraction of irrelevant molecules in a data set can be eliminated in a |
353 |
|
|
fast search with boolean methods on the fingerprints. |
354 |
|
|
It then becomes feasible to make a definitive substructure search by |
355 |
|
|
conventional methods on this reduced list even if it is slow. |
356 |
|
|
|
357 |
|
|
OpenBabel provides a framework for applying new types of fingerprints without |
358 |
|
|
changing any existing code. They are derived from OBFingerprint and the |
359 |
|
|
source file is just compiled with the rest of OpenBabel. Alternatively, |
360 |
|
|
they can be separately compiled as a DLL or shared library and discovered |
361 |
|
|
when OpenBabel runs. |
362 |
|
|
|
363 |
|
|
Fingerprints derived from this abstract base class OBFingerprint can be for any |
364 |
|
|
object derived from OBBase (not just for OBMol). |
365 |
|
|
Each derived class provides an ID as a string and OBFingerprint keeps a map of |
366 |
|
|
these to provides a pointer to the class when requested in FindFingerprint. |
367 |
|
|
|
368 |
|
|
<h4>-- To define a fingerprint type --</h4> |
369 |
|
|
The classes derived form OBFingerprint are required to provide |
370 |
|
|
a GetFingerprint() routine and a Description() routine |
371 |
|
|
\code |
372 |
|
|
class MyFpType : OBFingerprint |
373 |
|
|
{ |
374 |
|
|
MyFpType(const char* id) : OBFingerprint(id){}; |
375 |
|
|
virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits) |
376 |
|
|
{ |
377 |
|
|
//Convert pOb to the required type, usually OBMol |
378 |
|
|
OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
379 |
|
|
fp.resize(required_number_of_words); |
380 |
|
|
... |
381 |
|
|
use SetBit(fp,n); to set the nth bit |
382 |
|
|
|
383 |
|
|
if(nbits) |
384 |
|
|
Fold(fp, nbits); |
385 |
|
|
} |
386 |
|
|
virtual const char* Description(){ return "Some descriptive text";} |
387 |
|
|
... |
388 |
|
|
}; |
389 |
|
|
\endcode |
390 |
|
|
//Declare a global instance with the ID you will use in -f options to specify its use. |
391 |
|
|
\code |
392 |
|
|
MyFpType theMyFpType("myfpID"); |
393 |
|
|
\endcode |
394 |
|
|
|
395 |
|
|
<h4>-- To obtain a fingerprint --</h4> |
396 |
|
|
\code |
397 |
|
|
OBMol mol; |
398 |
|
|
... |
399 |
|
|
vector<unsigned int> fp; |
400 |
|
|
OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint |
401 |
|
|
\endcode |
402 |
|
|
or |
403 |
|
|
\code |
404 |
|
|
vector<unsigned int> fp; |
405 |
|
|
OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID"); |
406 |
|
|
...and maybe... |
407 |
|
|
pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger |
408 |
|
|
\endcode |
409 |
|
|
|
410 |
|
|
<h4>-- To print a list of available fingerprint types --</h4> |
411 |
|
|
\code |
412 |
|
|
std::string id; |
413 |
|
|
OBFingerPrint* pFPrt=NULL; |
414 |
|
|
while(OBFingerprint::GetNextFPrt(id, pFPrt)) |
415 |
|
|
{ |
416 |
|
|
cout << id << " -- " << pFPrt->Description() << endl; |
417 |
|
|
} |
418 |
|
|
\endcode |
419 |
|
|
|
420 |
|
|
|
421 |
|
|
Fingerprints are handled as vector<unsigned int> so that the number of bits |
422 |
|
|
in this vector and their order will be platform and compiler |
423 |
|
|
dependent, because of size of int and endian differences. Use fingerprints |
424 |
|
|
(and fastsearch indexes containing them) only for comparing with other |
425 |
|
|
fingerprints prepared on the same machine. |
426 |
|
|
|
427 |
|
|
The FingerprintFormat class is an output format which displays fingerprints |
428 |
|
|
as hexadecimal. When multiple molecules are supplied it will calculate the |
429 |
|
|
Tanimoto coefficient from the first molecule to each of the others. It also |
430 |
|
|
shows whether the first molecule is a possible substructure to all the others, |
431 |
|
|
i.e. whether all the bits set in the fingerprint for the first molecule are |
432 |
|
|
set in the fingerprint of the others. To display hexadecimal information when |
433 |
|
|
multiple molecules are provided it is necessay to use the -xh option. |
434 |
|
|
|
435 |
|
|
To see a list of available format types, type babel -F on the command line. |
436 |
|
|
The -xF option of the FingerprintFormat class also provides this output, but due |
437 |
|
|
to a quirk in the way the program works, it is necessary to have a valid input |
438 |
|
|
molecule for this option to work. |
439 |
|
|
*/ |
440 |
|
|
|
441 |
|
|
/*! \class FastSearch |
442 |
|
|
The FastSearch class searches an index to a datafile containing a list of molecules |
443 |
|
|
(or other objects) prepared by FastSearchIndexer. |
444 |
|
|
|
445 |
|
|
OpenBabel can also search files for molecules containing a substructure specified |
446 |
|
|
by a SMARTS string, using OBSmartsPattern or from the command line: |
447 |
|
|
\code |
448 |
|
|
babel datafile.xxx -outfile.yyy -sSMARTS |
449 |
|
|
\endcode |
450 |
|
|
But when the data file contains more than about 10,000 molecules this becomes |
451 |
|
|
rather too slow to be used interactively. To do it more quickly, an index |
452 |
|
|
of the molecules containing their fingerprints (see OBFingerprint) is prepared using |
453 |
|
|
FastSearchIndexer. The indexing process may take a long time but only has to |
454 |
|
|
be done once. The index can be searched very quickly with FastSearch. Because |
455 |
|
|
of the nature of fingerprints a match to a bit does not guarantee |
456 |
|
|
the presence of a particular substructure or other molecular property, so that |
457 |
|
|
a definitive answer may require a subsequent search of the (much reduced) list |
458 |
|
|
of candidates by another method (like OBSmartsPattern). |
459 |
|
|
|
460 |
|
|
Note that the index files are not portable. They need to be prepared on the |
461 |
|
|
computer that will acess them. |
462 |
|
|
|
463 |
|
|
<h4>Using FastSearch and FastSearchIndexer in a program</h4> |
464 |
|
|
|
465 |
|
|
The index has two tables: |
466 |
|
|
- an array of fingerprints of the molecules |
467 |
|
|
- an array of the seek positions in the datasource file of all the molecules |
468 |
|
|
|
469 |
|
|
<h4>To prepare an fastsearch index file:</h4> |
470 |
|
|
- Open an ostream to the index file. |
471 |
|
|
- Make a FastSearchIndexer object on the heap or the stack, passing in as parameters: |
472 |
|
|
- the datafile name, the indexfile stream, |
473 |
|
|
- the id of the fingerprint type to be used, |
474 |
|
|
- the number of bits the fingerprint is to be folded down to, If it is to be left |
475 |
|
|
unfolded, set fpid to 0 or do not specify it. |
476 |
|
|
. |
477 |
|
|
- For each molecule, call Add() with its pointer and its position in the datafile.<br> |
478 |
|
|
Currently the std::streampos value is implicitly cast to unsigned int so that |
479 |
|
|
for 32bit machines the datafile has to be no longer than about 2Gbyte. |
480 |
|
|
- The index file is written when the FastSearchIndexer object is deleted or goes out |
481 |
|
|
of scope. |
482 |
|
|
|
483 |
|
|
<h4>To search in a fastsearch index file</h4> |
484 |
|
|
|
485 |
|
|
- Open a std::istream to the indexfile (in binary mode on some systems) |
486 |
|
|
- Make a FastSearch object, read the index and open the datafile whose |
487 |
|
|
name it provides |
488 |
|
|
\code |
489 |
|
|
ifstream ifs(indexname,ios::binary); |
490 |
|
|
FastSearch fs; |
491 |
|
|
string datafilename = fs.ReadIndex(&ifs); |
492 |
|
|
if(datafilename.empty()/endcode |
493 |
|
|
return false; |
494 |
|
|
ifstream datastream(datafilename); |
495 |
|
|
if(!datastream) |
496 |
|
|
return false; |
497 |
|
|
\endcode |
498 |
|
|
|
499 |
|
|
<strong>To do a search for molecules which have all the substructure bits the |
500 |
|
|
OBMol object, patternMol</strong> |
501 |
|
|
\code |
502 |
|
|
vector<unsigned int>& SeekPositions; |
503 |
|
|
if(!fs.Find(patternMol, SeekPositions, MaxCandidates)) |
504 |
|
|
|
505 |
|
|
for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++) |
506 |
|
|
{ |
507 |
|
|
datastream.seekg(*itr); |
508 |
|
|
... read the candidate molecule |
509 |
|
|
and subject to more rigorous test if necessary |
510 |
|
|
} |
511 |
|
|
\endcode |
512 |
|
|
|
513 |
|
|
<strong>To do a similarity search based on the Tanimoto coefficient</strong> |
514 |
|
|
This is defined as: <br> |
515 |
|
|
<em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br> |
516 |
|
|
where the boolean operations between the fingerprints are bitwise<br> |
517 |
|
|
The Tanimoto coefficient has no absolute meaning and depends on |
518 |
|
|
the design of the fingerprint. |
519 |
|
|
\code |
520 |
|
|
multimap<double, unsigned int> SeekposMap; |
521 |
|
|
// to find n best molecules |
522 |
|
|
fs.FindSimilar(&patternMol, SeekposMap, n); |
523 |
|
|
...or |
524 |
|
|
// to find molecules with Tanimoto coefficient > MinTani |
525 |
|
|
fs.FindSimilar(&patternMol, SeekposMap, MinTani); |
526 |
|
|
|
527 |
|
|
multimap<double, unsigned int>::reverse_iterator itr; |
528 |
|
|
for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) |
529 |
|
|
{ |
530 |
|
|
datastream.seekg(itr->second); |
531 |
|
|
... read the candidate molecule |
532 |
|
|
double tani = itr->first; |
533 |
|
|
} |
534 |
|
|
\endcode |
535 |
|
|
|
536 |
|
|
The FastSearchFormat class facilitates the use of these routine from the |
537 |
|
|
command line or other front end program. For instance: |
538 |
|
|
|
539 |
|
|
<strong>Prepare an index:</strong> |
540 |
|
|
\code |
541 |
|
|
babel datafile.xxx index.fs |
542 |
|
|
\endcode |
543 |
|
|
or |
544 |
|
|
\code |
545 |
|
|
babel datafile.xxx -ofs namedindex |
546 |
|
|
\endcode |
547 |
|
|
With options you can specify: |
548 |
|
|
- which type of fingerprint to be used, e.g. -xfFP2, |
549 |
|
|
- whether it is folded to a specified number of bits, e.g. -xn128 |
550 |
|
|
(which should be a power of2) |
551 |
|
|
- whether to pre-select the molecules which are indexed: |
552 |
|
|
- by structure e.g only ethers and esters, -sCOC |
553 |
|
|
- by excluding molecules with bezene rings, -vc1ccccc1 |
554 |
|
|
- by position in the datafile e.g. only mols 10 to 90, -f10 -l90 |
555 |
|
|
. |
556 |
|
|
<strong>Substructure search</strong> in a previously prepared index file |
557 |
|
|
\code |
558 |
|
|
babel index.fs outfile.yyy -sSMILES |
559 |
|
|
\endcode |
560 |
|
|
The datafile can also be used as the input file, provided the input format is specified as fs |
561 |
|
|
\code |
562 |
|
|
babel datafile.xxx outfile.yyy -sSMILES -ifs |
563 |
|
|
\endcode |
564 |
|
|
A "slow" search not using fingerprints would be done on the same data by omitting -ifs. |
565 |
|
|
A "slow" search can use SMARTS strings, but the fast search is restricted |
566 |
|
|
to the subset which is valid SMILES. |
567 |
|
|
|
568 |
|
|
With the -S option, the target can be specified as a molecule in a file of any format |
569 |
|
|
\code |
570 |
|
|
babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs |
571 |
|
|
\endcode |
572 |
|
|
These searches have two stages: a fingerprint search which produces |
573 |
|
|
a number of candidate molecules and a definitive search which selects |
574 |
|
|
from these using SMARTS pattern matching. The maximum number of candidates |
575 |
|
|
is 4000 by default but you can change this with an option |
576 |
|
|
e.g. -al 8000 (Note that you need the space between l and the number.) |
577 |
|
|
If the fingerprint search reaches the maximum number it will not |
578 |
|
|
look further and will tell you at what stage it stopped. |
579 |
|
|
|
580 |
|
|
<strong>Similarity search</strong> in a previously prepared index file<br> |
581 |
|
|
This rather done (rather than a substructure search) if the -at option is used, |
582 |
|
|
\code |
583 |
|
|
babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7 |
584 |
|
|
\endcode |
585 |
|
|
for instance |
586 |
|
|
- -at0.7 will recover all molecules with Tanimoto greater than 0.7 |
587 |
|
|
- -at15 (no decimal point) will recover the 15 molecules with largest coefficients. |
588 |
|
|
- -aa will add the Tanimoto coefficient to the titles of the output molecules. |
589 |
|
|
|
590 |
|
|
All stages, the indexing, the interpretation of the SMILES string in the -s option, |
591 |
|
|
the file in the -S option and the final SMARTS filter convert to OBMol and apply |
592 |
|
|
ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O |
593 |
|
|
are recognized as chemically identical. |
594 |
|
|
*/ |
595 |
|
|
|
596 |
|
|
}//Openbabel |
597 |
|
|
|
598 |
|
|
//! \file fingerprint.cpp |
599 |
|
|
//! \brief Definitions for OBFingerprint base class and fastsearch classes |