1 |
tim |
741 |
/********************************************************************** |
2 |
|
|
fingerprint.cpp - Implementation of fingerpring base class and fastsearching |
3 |
|
|
|
4 |
|
|
Copyright (C) 2005 by Chris Morley |
5 |
|
|
|
6 |
|
|
This file is part of the Open Babel project. |
7 |
|
|
For more information, see <http://openbabel.sourceforge.net/> |
8 |
|
|
|
9 |
|
|
This program is free software; you can redistribute it and/or modify |
10 |
|
|
it under the terms of the GNU General Public License as published by |
11 |
|
|
the Free Software Foundation version 2 of the License. |
12 |
|
|
|
13 |
|
|
This program is distributed in the hope that it will be useful, |
14 |
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 |
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 |
|
|
GNU General Public License for more details. |
17 |
|
|
***********************************************************************/ |
18 |
|
|
|
19 |
gezelter |
751 |
#include "config.h" |
20 |
tim |
741 |
#include <vector> |
21 |
|
|
#include <algorithm> |
22 |
|
|
|
23 |
|
|
#if HAVE_IOSTREAM |
24 |
|
|
#include <iostream> |
25 |
|
|
#elif HAVE_IOSTREAM_H |
26 |
|
|
#include <iostream.h> |
27 |
|
|
#endif |
28 |
|
|
|
29 |
|
|
#if HAVE_FSTREAM |
30 |
|
|
#include <fstream> |
31 |
|
|
#elif HAVE_FSTREAM_H |
32 |
|
|
#include <fstream.h> |
33 |
|
|
#endif |
34 |
|
|
#include <vector> |
35 |
|
|
|
36 |
|
|
#include "fingerprint.hpp" |
37 |
|
|
#include "oberror.hpp" |
38 |
|
|
|
39 |
|
|
using namespace std; |
40 |
|
|
namespace OpenBabel { |
41 |
|
|
|
42 |
|
|
OBFingerprint* OBFingerprint::_pDefault; //static variable |
43 |
|
|
const unsigned int OBFingerprint::bitsperint = 8 * sizeof(unsigned int); |
44 |
gezelter |
1081 |
int OBFingerprint::rubbish = 666; |
45 |
tim |
741 |
|
46 |
|
|
void OBFingerprint::SetBit(vector<unsigned int>& vec, unsigned int n) |
47 |
|
|
{ |
48 |
gezelter |
1081 |
vec[n/Getbitsperint()] |= (1 << (n % Getbitsperint())); |
49 |
tim |
741 |
} |
50 |
|
|
|
51 |
|
|
//////////////////////////////////////// |
52 |
|
|
void OBFingerprint::Fold(vector<unsigned int>& vec, unsigned int nbits) |
53 |
|
|
{ |
54 |
gezelter |
1081 |
while(vec.size()*Getbitsperint()/2 >= nbits) |
55 |
tim |
741 |
vec.erase(transform(vec.begin(),vec.begin()+vec.size()/2, |
56 |
|
|
vec.begin()+vec.size()/2, vec.begin(), bit_or()), vec.end()); |
57 |
|
|
} |
58 |
|
|
|
59 |
|
|
//////////////////////////////////////// |
60 |
|
|
bool OBFingerprint::GetNextFPrt(std::string& id, OBFingerprint*& pFPrt) |
61 |
|
|
{ |
62 |
|
|
Fptpos iter; |
63 |
|
|
if(id.empty()) |
64 |
|
|
iter=FPtsMap().begin(); |
65 |
|
|
else |
66 |
|
|
{ |
67 |
|
|
iter=FPtsMap().find(id); |
68 |
|
|
if(iter!=FPtsMap().end()) |
69 |
|
|
++iter; |
70 |
|
|
} |
71 |
|
|
if(iter==FPtsMap().end()) |
72 |
|
|
return false; |
73 |
|
|
id = iter->first; |
74 |
|
|
pFPrt = iter->second; |
75 |
|
|
return true; |
76 |
|
|
} |
77 |
|
|
|
78 |
|
|
OBFingerprint* OBFingerprint::FindFingerprint(string& ID) |
79 |
|
|
{ |
80 |
|
|
if(ID.empty()) |
81 |
|
|
return _pDefault; |
82 |
|
|
Fptpos iter = FPtsMap().find(ID); |
83 |
|
|
if(iter==FPtsMap().end()) |
84 |
|
|
return NULL; |
85 |
|
|
else |
86 |
|
|
return iter->second; |
87 |
|
|
} |
88 |
|
|
|
89 |
|
|
double OBFingerprint::Tanimoto(const vector<unsigned int>& vec1, const vector<unsigned int>& vec2) |
90 |
|
|
{ |
91 |
|
|
//Independent of sizeof(unsigned int) |
92 |
|
|
if(vec1.size()!=vec2.size()) |
93 |
|
|
return -1; //different number of bits |
94 |
|
|
int andbits=0, orbits=0; |
95 |
|
|
for (unsigned i=0;i<vec1.size();++i) |
96 |
|
|
{ |
97 |
|
|
int andfp = vec1[i] & vec2[i]; |
98 |
|
|
int orfp = vec1[i] | vec2[i]; |
99 |
|
|
//Count bits |
100 |
|
|
for(;andfp;andfp=andfp<<1) |
101 |
|
|
if(andfp<0) ++andbits; |
102 |
|
|
for(;orfp;orfp=orfp<<1) |
103 |
|
|
if(orfp<0) ++orbits; |
104 |
|
|
} |
105 |
|
|
return((double)andbits/(double)orbits); |
106 |
|
|
} |
107 |
|
|
|
108 |
|
|
//*****************************************************************8 |
109 |
|
|
/*! |
110 |
|
|
|
111 |
|
|
*/ |
112 |
|
|
bool FastSearch::Find(OBBase* pOb, vector<unsigned int>& SeekPositions, |
113 |
|
|
unsigned int MaxCandidates) |
114 |
|
|
{ |
115 |
|
|
///Finds chemical objects in datafilename (which must previously have been indexed) |
116 |
|
|
///that have all the same bits set in their fingerprints as in the fingerprint of |
117 |
|
|
///a pattern object. (Usually a substructure search in molecules.) |
118 |
|
|
///The type of fingerprint and its degree of folding does not have to be specified |
119 |
|
|
///here because the values in the index file are used. |
120 |
|
|
///The positions of the candidate matching molecules in the original datafile are returned. |
121 |
|
|
|
122 |
|
|
vector<unsigned int> vecwords; |
123 |
gezelter |
1081 |
_pFP->GetFingerprint(pOb,vecwords, _index.header.words * OBFingerprint::Getbitsperint()); |
124 |
tim |
741 |
|
125 |
|
|
vector<unsigned int>candidates; //indices of matches from fingerprint screen |
126 |
|
|
candidates.reserve(MaxCandidates); |
127 |
|
|
|
128 |
|
|
unsigned int dataSize = _index.header.nEntries; |
129 |
|
|
// GetFingerprint(mol, vecwords, _index.header.words, _index.header.fptype); |
130 |
|
|
|
131 |
|
|
unsigned int words = _index.header.words; |
132 |
|
|
unsigned int* nextp = &_index.fptdata[0]; |
133 |
|
|
unsigned int* ppat0 = &vecwords[0]; |
134 |
|
|
register unsigned int* p; |
135 |
|
|
register unsigned int* ppat; |
136 |
|
|
register unsigned int a; |
137 |
|
|
unsigned int i; // need address of this, can't be register |
138 |
|
|
for(i=0;i<dataSize; i++) //speed critical section |
139 |
|
|
{ |
140 |
|
|
p=nextp; |
141 |
|
|
nextp += words; |
142 |
|
|
ppat=ppat0; |
143 |
|
|
a=0; |
144 |
|
|
while(p<nextp) |
145 |
|
|
{ |
146 |
|
|
if ( (a=((*ppat) & (*p++)) ^ (*ppat++)) ) break; |
147 |
|
|
} |
148 |
|
|
if(!a) |
149 |
|
|
{ |
150 |
|
|
candidates.push_back(i); |
151 |
|
|
if(candidates.size()>=MaxCandidates) |
152 |
|
|
break; |
153 |
|
|
} |
154 |
|
|
} |
155 |
|
|
|
156 |
|
|
if(i<_index.header.nEntries) //premature end to search |
157 |
|
|
{ |
158 |
|
|
#ifdef HAVE_SSTREAM |
159 |
|
|
stringstream errorMsg; |
160 |
|
|
#else |
161 |
|
|
strstream errorMsg; |
162 |
|
|
#endif |
163 |
|
|
errorMsg << "Stopped looking after " << i << " molecules." << endl; |
164 |
gezelter |
1081 |
obErrorLog.ThrowError(__func__, errorMsg.str(), obWarning); |
165 |
tim |
741 |
} |
166 |
|
|
|
167 |
|
|
vector<unsigned int>::iterator itr; |
168 |
|
|
for(itr=candidates.begin();itr!=candidates.end();itr++) |
169 |
|
|
{ |
170 |
|
|
SeekPositions.push_back(_index.seekdata[*itr]); |
171 |
|
|
} |
172 |
|
|
return true; |
173 |
|
|
} |
174 |
|
|
|
175 |
|
|
///////////////////////////////////////////////////////// |
176 |
|
|
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
177 |
|
|
double MinTani) |
178 |
|
|
{ |
179 |
|
|
vector<unsigned int> targetfp; |
180 |
gezelter |
1081 |
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); |
181 |
tim |
741 |
|
182 |
|
|
unsigned int words = _index.header.words; |
183 |
|
|
unsigned int dataSize = _index.header.nEntries; |
184 |
|
|
unsigned int* nextp = &_index.fptdata[0]; |
185 |
|
|
register unsigned int* p; |
186 |
|
|
register unsigned int i; |
187 |
|
|
for(i=0;i<dataSize; i++) //speed critical section |
188 |
|
|
{ |
189 |
|
|
p=nextp; |
190 |
|
|
nextp += words; |
191 |
|
|
double tani = OBFingerprint::Tanimoto(targetfp,p); |
192 |
|
|
if(tani>MinTani) |
193 |
|
|
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
194 |
|
|
} |
195 |
|
|
return true; |
196 |
|
|
} |
197 |
|
|
|
198 |
|
|
///////////////////////////////////////////////////////// |
199 |
|
|
bool FastSearch::FindSimilar(OBBase* pOb, multimap<double, unsigned int>& SeekposMap, |
200 |
|
|
int nCandidates) |
201 |
|
|
{ |
202 |
|
|
///If nCandidates is zero or omitted the original size of the multimap is used |
203 |
|
|
if(nCandidates) |
204 |
|
|
{ |
205 |
|
|
//initialise the multimap with nCandidate zero entries |
206 |
|
|
SeekposMap.clear(); |
207 |
|
|
int i; |
208 |
|
|
for(i=0;i<nCandidates;++i) |
209 |
|
|
SeekposMap.insert(pair<const double, unsigned int>(0,0)); |
210 |
|
|
} |
211 |
|
|
else if(SeekposMap.size()==0) |
212 |
|
|
return false; |
213 |
|
|
|
214 |
|
|
vector<unsigned int> targetfp; |
215 |
gezelter |
1081 |
_pFP->GetFingerprint(pOb,targetfp, _index.header.words * OBFingerprint::Getbitsperint()); |
216 |
tim |
741 |
|
217 |
|
|
unsigned int words = _index.header.words; |
218 |
|
|
unsigned int dataSize = _index.header.nEntries; |
219 |
|
|
unsigned int* nextp = &_index.fptdata[0]; |
220 |
|
|
register unsigned int* p; |
221 |
|
|
register unsigned int i; |
222 |
|
|
for(i=0;i<dataSize; i++) //speed critical section |
223 |
|
|
{ |
224 |
|
|
p=nextp; |
225 |
|
|
nextp += words; |
226 |
|
|
double tani = OBFingerprint::Tanimoto(targetfp,p); |
227 |
|
|
if(tani>SeekposMap.begin()->first) |
228 |
|
|
{ |
229 |
|
|
SeekposMap.insert(pair<const double, unsigned int>(tani,_index.seekdata[i])); |
230 |
|
|
SeekposMap.erase(SeekposMap.begin()); |
231 |
|
|
} |
232 |
|
|
} |
233 |
|
|
return true; |
234 |
gezelter |
1081 |
} |
235 |
|
|
|
236 |
|
|
///////////////////////////////////////////////////////// |
237 |
tim |
741 |
string FastSearch::ReadIndex(istream* pIndexstream) |
238 |
|
|
{ |
239 |
|
|
//Reads fs index from istream into member variables |
240 |
gezelter |
1081 |
_index.Read(pIndexstream); |
241 |
tim |
741 |
|
242 |
gezelter |
1081 |
_pFP = _index.CheckFP(); |
243 |
|
|
if(!_pFP) |
244 |
|
|
*(_index.header.datafilename) = '\0'; |
245 |
|
|
|
246 |
|
|
return _index.header.datafilename; //will be empty on error |
247 |
|
|
} |
248 |
|
|
|
249 |
|
|
////////////////////////////////////////////////////////// |
250 |
|
|
bool FptIndex::Read(istream* pIndexstream) |
251 |
|
|
{ |
252 |
|
|
pIndexstream->read((char*)&(header), sizeof(FptIndexHeader)); |
253 |
|
|
pIndexstream->seekg(header.headerlength);//allows header length to be changed |
254 |
|
|
if(pIndexstream->fail() || header.headerlength != sizeof(FptIndexHeader)) |
255 |
tim |
741 |
{ |
256 |
gezelter |
1081 |
*(header.datafilename) = '\0'; |
257 |
|
|
return false; |
258 |
|
|
} |
259 |
tim |
741 |
|
260 |
gezelter |
1081 |
unsigned int nwords = header.nEntries * header.words; |
261 |
|
|
fptdata.resize(nwords); |
262 |
|
|
seekdata.resize(header.nEntries); |
263 |
tim |
741 |
|
264 |
gezelter |
1081 |
pIndexstream->read((char*)&(fptdata[0]), sizeof(unsigned int) * nwords); |
265 |
|
|
pIndexstream->read((char*)&(seekdata[0]), sizeof(unsigned int) * header.nEntries); |
266 |
|
|
|
267 |
|
|
if(pIndexstream->fail()) |
268 |
|
|
{ |
269 |
|
|
*(header.datafilename) = '\0'; |
270 |
|
|
return false; |
271 |
|
|
} |
272 |
|
|
return true; |
273 |
|
|
} |
274 |
tim |
741 |
|
275 |
gezelter |
1081 |
////////////////////////////////////////////////////////// |
276 |
|
|
OBFingerprint* FptIndex::CheckFP() |
277 |
|
|
{ |
278 |
|
|
//check that fingerprint type is available |
279 |
|
|
string tempFP(header.fpid); |
280 |
|
|
OBFingerprint* pFP = OBFingerprint::FindFingerprint(tempFP); |
281 |
|
|
if(!pFP) |
282 |
|
|
{ |
283 |
|
|
#ifdef HAVE_SSTREAM |
284 |
|
|
stringstream errorMsg; |
285 |
|
|
#else |
286 |
|
|
strstream errorMsg; |
287 |
|
|
#endif |
288 |
|
|
errorMsg << "Index has Fingerprints of type '" << header.fpid |
289 |
|
|
<< " which is not currently loaded." << endl; |
290 |
|
|
obErrorLog.ThrowError(__func__, errorMsg.str(), obError); |
291 |
tim |
741 |
} |
292 |
gezelter |
1081 |
return pFP; //NULL if not available |
293 |
tim |
741 |
} |
294 |
|
|
|
295 |
|
|
//******************************************************* |
296 |
|
|
FastSearchIndexer::FastSearchIndexer(string& datafilename, ostream* os, |
297 |
|
|
std::string& fpid, int FptBits) |
298 |
|
|
{ |
299 |
|
|
///Starts indexing process |
300 |
|
|
_indexstream = os; |
301 |
|
|
_nbits=FptBits; |
302 |
|
|
_pindex= new FptIndex; |
303 |
|
|
_pindex->header.headerlength = sizeof(FptIndexHeader); |
304 |
|
|
strncpy(_pindex->header.fpid,fpid.c_str(),15); |
305 |
|
|
strncpy(_pindex->header.datafilename, datafilename.c_str(), 255); |
306 |
gezelter |
1081 |
|
307 |
|
|
//check that fingerprint type is available |
308 |
|
|
_pFP = _pindex->CheckFP(); |
309 |
tim |
741 |
} |
310 |
|
|
|
311 |
|
|
///////////////////////////////////////////////////////////// |
312 |
gezelter |
1081 |
FastSearchIndexer::FastSearchIndexer(FptIndex* pindex, std::ostream* os) |
313 |
|
|
{ |
314 |
|
|
//Uses existing index |
315 |
|
|
_indexstream = os; |
316 |
|
|
_pindex = pindex; |
317 |
|
|
_nbits = _pindex->header.words * OBFingerprint::Getbitsperint(); |
318 |
|
|
|
319 |
|
|
//check that fingerprint type is available |
320 |
|
|
_pFP = _pindex->CheckFP(); |
321 |
|
|
} |
322 |
|
|
|
323 |
|
|
///////////////////////////////////////////////////////////// |
324 |
tim |
741 |
FastSearchIndexer::~FastSearchIndexer() |
325 |
|
|
{ |
326 |
|
|
///Saves index file |
327 |
|
|
_pindex->header.nEntries = _pindex->seekdata.size(); |
328 |
|
|
_indexstream->write((const char*)&_pindex->header, sizeof(FptIndexHeader)); |
329 |
|
|
_indexstream->write((const char*)&_pindex->fptdata[0], _pindex->fptdata.size()*sizeof(unsigned int)); |
330 |
|
|
_indexstream->write((const char*)&_pindex->seekdata[0], _pindex->seekdata.size()*sizeof(unsigned int)); |
331 |
|
|
if(!_indexstream) |
332 |
tim |
819 |
obErrorLog.ThrowError(__func__, |
333 |
tim |
741 |
"Difficulty writing index", obWarning); |
334 |
|
|
delete _pindex; |
335 |
|
|
} |
336 |
|
|
|
337 |
|
|
/////////////////////////////////////////////////////////////// |
338 |
|
|
bool FastSearchIndexer::Add(OBBase* pOb, streampos seekpos) |
339 |
|
|
{ |
340 |
|
|
///Adds a fingerprint |
341 |
|
|
|
342 |
|
|
vector<unsigned int> vecwords; |
343 |
|
|
if(!_pFP) |
344 |
|
|
return false; |
345 |
|
|
if(_pFP->GetFingerprint(pOb, vecwords, _nbits)) |
346 |
|
|
{ |
347 |
|
|
_pindex->header.words = vecwords.size(); //Use size as returned from fingerprint |
348 |
|
|
for(unsigned int i=0;i<_pindex->header.words;i++) |
349 |
|
|
_pindex->fptdata.push_back(vecwords[i]); |
350 |
|
|
_pindex->seekdata.push_back(seekpos); |
351 |
|
|
return true; |
352 |
|
|
} |
353 |
tim |
819 |
obErrorLog.ThrowError(__func__, "Failed to make a fingerprint", obWarning); |
354 |
tim |
741 |
return false; |
355 |
|
|
} |
356 |
|
|
|
357 |
|
|
/*! |
358 |
|
|
\class OBFingerprint |
359 |
|
|
These fingerprints are condensed representation of molecules (or other objects) |
360 |
|
|
as a list of boolean values (actually bits in a vector<unsigned>) with length |
361 |
|
|
of a power of 2. The main motivation is for fast searching of data sources |
362 |
|
|
containing large numbers of molecules (up to several million). OpenBabel |
363 |
|
|
provides some routines which can search text files containing lists of molecules |
364 |
|
|
in any format. See the documentation on the class FastSearch. |
365 |
|
|
|
366 |
|
|
There are descriptions of molecular fingerprints at <br> |
367 |
|
|
http://www.daylight.com/dayhtml/doc/theory/theory.finger.html) and <br> |
368 |
|
|
http://www.mesaac.com/Fingerprint.htm <br> |
369 |
|
|
Many methods of preparing fingerprints have been described, but the type supported |
370 |
|
|
currently in OpenBabel has each bit representing a substructure (or other |
371 |
|
|
molecular property). If a substructure is present in the molecule, then a |
372 |
|
|
particular bit is set to 1. But because the hashing method may also map other |
373 |
|
|
substructures to the same bit, a match does not guarantee that a particular |
374 |
|
|
substructure is present; there may be false positives. However, with proper design, |
375 |
|
|
a large fraction of irrelevant molecules in a data set can be eliminated in a |
376 |
|
|
fast search with boolean methods on the fingerprints. |
377 |
|
|
It then becomes feasible to make a definitive substructure search by |
378 |
|
|
conventional methods on this reduced list even if it is slow. |
379 |
|
|
|
380 |
|
|
OpenBabel provides a framework for applying new types of fingerprints without |
381 |
|
|
changing any existing code. They are derived from OBFingerprint and the |
382 |
|
|
source file is just compiled with the rest of OpenBabel. Alternatively, |
383 |
|
|
they can be separately compiled as a DLL or shared library and discovered |
384 |
|
|
when OpenBabel runs. |
385 |
|
|
|
386 |
|
|
Fingerprints derived from this abstract base class OBFingerprint can be for any |
387 |
|
|
object derived from OBBase (not just for OBMol). |
388 |
|
|
Each derived class provides an ID as a string and OBFingerprint keeps a map of |
389 |
|
|
these to provides a pointer to the class when requested in FindFingerprint. |
390 |
|
|
|
391 |
|
|
<h4>-- To define a fingerprint type --</h4> |
392 |
|
|
The classes derived form OBFingerprint are required to provide |
393 |
|
|
a GetFingerprint() routine and a Description() routine |
394 |
|
|
\code |
395 |
|
|
class MyFpType : OBFingerprint |
396 |
|
|
{ |
397 |
|
|
MyFpType(const char* id) : OBFingerprint(id){}; |
398 |
|
|
virtual bool GetFingerprint(OBBase* pOb, vector<unsigned int>& fp, int nbits) |
399 |
|
|
{ |
400 |
|
|
//Convert pOb to the required type, usually OBMol |
401 |
|
|
OBMol* pmol = dynamic_cast<OBMol*>(pOb); |
402 |
|
|
fp.resize(required_number_of_words); |
403 |
|
|
... |
404 |
|
|
use SetBit(fp,n); to set the nth bit |
405 |
|
|
|
406 |
|
|
if(nbits) |
407 |
|
|
Fold(fp, nbits); |
408 |
|
|
} |
409 |
|
|
virtual const char* Description(){ return "Some descriptive text";} |
410 |
|
|
... |
411 |
|
|
}; |
412 |
|
|
\endcode |
413 |
|
|
//Declare a global instance with the ID you will use in -f options to specify its use. |
414 |
|
|
\code |
415 |
|
|
MyFpType theMyFpType("myfpID"); |
416 |
|
|
\endcode |
417 |
|
|
|
418 |
|
|
<h4>-- To obtain a fingerprint --</h4> |
419 |
|
|
\code |
420 |
|
|
OBMol mol; |
421 |
|
|
... |
422 |
|
|
vector<unsigned int> fp; |
423 |
|
|
OBFingerprint::GetDefault()->GetFingerprint(&mol, fp); //gets default size of fingerprint |
424 |
|
|
\endcode |
425 |
|
|
or |
426 |
|
|
\code |
427 |
|
|
vector<unsigned int> fp; |
428 |
|
|
OBFingerPrint* pFP = OBFingerprint::FindFingerprint("myfpID"); |
429 |
|
|
...and maybe... |
430 |
|
|
pFP->GetFingerprint(&mol,fp, 128); //fold down to 128bits if was originally larger |
431 |
|
|
\endcode |
432 |
|
|
|
433 |
|
|
<h4>-- To print a list of available fingerprint types --</h4> |
434 |
|
|
\code |
435 |
|
|
std::string id; |
436 |
|
|
OBFingerPrint* pFPrt=NULL; |
437 |
|
|
while(OBFingerprint::GetNextFPrt(id, pFPrt)) |
438 |
|
|
{ |
439 |
|
|
cout << id << " -- " << pFPrt->Description() << endl; |
440 |
|
|
} |
441 |
|
|
\endcode |
442 |
|
|
|
443 |
|
|
|
444 |
|
|
Fingerprints are handled as vector<unsigned int> so that the number of bits |
445 |
|
|
in this vector and their order will be platform and compiler |
446 |
|
|
dependent, because of size of int and endian differences. Use fingerprints |
447 |
|
|
(and fastsearch indexes containing them) only for comparing with other |
448 |
|
|
fingerprints prepared on the same machine. |
449 |
|
|
|
450 |
|
|
The FingerprintFormat class is an output format which displays fingerprints |
451 |
|
|
as hexadecimal. When multiple molecules are supplied it will calculate the |
452 |
|
|
Tanimoto coefficient from the first molecule to each of the others. It also |
453 |
|
|
shows whether the first molecule is a possible substructure to all the others, |
454 |
|
|
i.e. whether all the bits set in the fingerprint for the first molecule are |
455 |
|
|
set in the fingerprint of the others. To display hexadecimal information when |
456 |
|
|
multiple molecules are provided it is necessay to use the -xh option. |
457 |
|
|
|
458 |
|
|
To see a list of available format types, type babel -F on the command line. |
459 |
|
|
The -xF option of the FingerprintFormat class also provides this output, but due |
460 |
|
|
to a quirk in the way the program works, it is necessary to have a valid input |
461 |
|
|
molecule for this option to work. |
462 |
|
|
*/ |
463 |
|
|
|
464 |
|
|
/*! \class FastSearch |
465 |
|
|
The FastSearch class searches an index to a datafile containing a list of molecules |
466 |
|
|
(or other objects) prepared by FastSearchIndexer. |
467 |
|
|
|
468 |
|
|
OpenBabel can also search files for molecules containing a substructure specified |
469 |
|
|
by a SMARTS string, using OBSmartsPattern or from the command line: |
470 |
|
|
\code |
471 |
|
|
babel datafile.xxx -outfile.yyy -sSMARTS |
472 |
|
|
\endcode |
473 |
|
|
But when the data file contains more than about 10,000 molecules this becomes |
474 |
|
|
rather too slow to be used interactively. To do it more quickly, an index |
475 |
|
|
of the molecules containing their fingerprints (see OBFingerprint) is prepared using |
476 |
|
|
FastSearchIndexer. The indexing process may take a long time but only has to |
477 |
|
|
be done once. The index can be searched very quickly with FastSearch. Because |
478 |
|
|
of the nature of fingerprints a match to a bit does not guarantee |
479 |
|
|
the presence of a particular substructure or other molecular property, so that |
480 |
|
|
a definitive answer may require a subsequent search of the (much reduced) list |
481 |
|
|
of candidates by another method (like OBSmartsPattern). |
482 |
|
|
|
483 |
|
|
Note that the index files are not portable. They need to be prepared on the |
484 |
|
|
computer that will acess them. |
485 |
|
|
|
486 |
|
|
<h4>Using FastSearch and FastSearchIndexer in a program</h4> |
487 |
|
|
|
488 |
|
|
The index has two tables: |
489 |
|
|
- an array of fingerprints of the molecules |
490 |
|
|
- an array of the seek positions in the datasource file of all the molecules |
491 |
|
|
|
492 |
|
|
<h4>To prepare an fastsearch index file:</h4> |
493 |
|
|
- Open an ostream to the index file. |
494 |
|
|
- Make a FastSearchIndexer object on the heap or the stack, passing in as parameters: |
495 |
|
|
- the datafile name, the indexfile stream, |
496 |
|
|
- the id of the fingerprint type to be used, |
497 |
|
|
- the number of bits the fingerprint is to be folded down to, If it is to be left |
498 |
|
|
unfolded, set fpid to 0 or do not specify it. |
499 |
|
|
. |
500 |
|
|
- For each molecule, call Add() with its pointer and its position in the datafile.<br> |
501 |
|
|
Currently the std::streampos value is implicitly cast to unsigned int so that |
502 |
|
|
for 32bit machines the datafile has to be no longer than about 2Gbyte. |
503 |
|
|
- The index file is written when the FastSearchIndexer object is deleted or goes out |
504 |
|
|
of scope. |
505 |
|
|
|
506 |
|
|
<h4>To search in a fastsearch index file</h4> |
507 |
|
|
|
508 |
|
|
- Open a std::istream to the indexfile (in binary mode on some systems) |
509 |
|
|
- Make a FastSearch object, read the index and open the datafile whose |
510 |
|
|
name it provides |
511 |
|
|
\code |
512 |
|
|
ifstream ifs(indexname,ios::binary); |
513 |
|
|
FastSearch fs; |
514 |
|
|
string datafilename = fs.ReadIndex(&ifs); |
515 |
|
|
if(datafilename.empty()/endcode |
516 |
|
|
return false; |
517 |
|
|
ifstream datastream(datafilename); |
518 |
|
|
if(!datastream) |
519 |
|
|
return false; |
520 |
|
|
\endcode |
521 |
|
|
|
522 |
|
|
<strong>To do a search for molecules which have all the substructure bits the |
523 |
|
|
OBMol object, patternMol</strong> |
524 |
|
|
\code |
525 |
|
|
vector<unsigned int>& SeekPositions; |
526 |
|
|
if(!fs.Find(patternMol, SeekPositions, MaxCandidates)) |
527 |
|
|
|
528 |
|
|
for(itr=SeekPositions.begin();itr!=SeekPositions.end();itr++) |
529 |
|
|
{ |
530 |
|
|
datastream.seekg(*itr); |
531 |
|
|
... read the candidate molecule |
532 |
|
|
and subject to more rigorous test if necessary |
533 |
|
|
} |
534 |
|
|
\endcode |
535 |
|
|
|
536 |
|
|
<strong>To do a similarity search based on the Tanimoto coefficient</strong> |
537 |
|
|
This is defined as: <br> |
538 |
|
|
<em>Number of bits set in (patternFP & targetFP)/Number of bits in (patternFP | targetFP)</em><br> |
539 |
|
|
where the boolean operations between the fingerprints are bitwise<br> |
540 |
|
|
The Tanimoto coefficient has no absolute meaning and depends on |
541 |
|
|
the design of the fingerprint. |
542 |
|
|
\code |
543 |
|
|
multimap<double, unsigned int> SeekposMap; |
544 |
|
|
// to find n best molecules |
545 |
|
|
fs.FindSimilar(&patternMol, SeekposMap, n); |
546 |
|
|
...or |
547 |
|
|
// to find molecules with Tanimoto coefficient > MinTani |
548 |
|
|
fs.FindSimilar(&patternMol, SeekposMap, MinTani); |
549 |
|
|
|
550 |
|
|
multimap<double, unsigned int>::reverse_iterator itr; |
551 |
|
|
for(itr=SeekposMap.rbegin();itr!=SeekposMap.rend();++itr) |
552 |
|
|
{ |
553 |
|
|
datastream.seekg(itr->second); |
554 |
|
|
... read the candidate molecule |
555 |
|
|
double tani = itr->first; |
556 |
|
|
} |
557 |
|
|
\endcode |
558 |
|
|
|
559 |
|
|
The FastSearchFormat class facilitates the use of these routine from the |
560 |
|
|
command line or other front end program. For instance: |
561 |
|
|
|
562 |
|
|
<strong>Prepare an index:</strong> |
563 |
|
|
\code |
564 |
|
|
babel datafile.xxx index.fs |
565 |
|
|
\endcode |
566 |
|
|
or |
567 |
|
|
\code |
568 |
|
|
babel datafile.xxx -ofs namedindex |
569 |
|
|
\endcode |
570 |
|
|
With options you can specify: |
571 |
|
|
- which type of fingerprint to be used, e.g. -xfFP2, |
572 |
|
|
- whether it is folded to a specified number of bits, e.g. -xn128 |
573 |
|
|
(which should be a power of2) |
574 |
|
|
- whether to pre-select the molecules which are indexed: |
575 |
|
|
- by structure e.g only ethers and esters, -sCOC |
576 |
|
|
- by excluding molecules with bezene rings, -vc1ccccc1 |
577 |
|
|
- by position in the datafile e.g. only mols 10 to 90, -f10 -l90 |
578 |
|
|
. |
579 |
|
|
<strong>Substructure search</strong> in a previously prepared index file |
580 |
|
|
\code |
581 |
|
|
babel index.fs outfile.yyy -sSMILES |
582 |
|
|
\endcode |
583 |
|
|
The datafile can also be used as the input file, provided the input format is specified as fs |
584 |
|
|
\code |
585 |
|
|
babel datafile.xxx outfile.yyy -sSMILES -ifs |
586 |
|
|
\endcode |
587 |
|
|
A "slow" search not using fingerprints would be done on the same data by omitting -ifs. |
588 |
|
|
A "slow" search can use SMARTS strings, but the fast search is restricted |
589 |
|
|
to the subset which is valid SMILES. |
590 |
|
|
|
591 |
|
|
With the -S option, the target can be specified as a molecule in a file of any format |
592 |
|
|
\code |
593 |
|
|
babel datafile.xxx outfile.yyy -Spattern_mol.zzz -ifs |
594 |
|
|
\endcode |
595 |
|
|
These searches have two stages: a fingerprint search which produces |
596 |
|
|
a number of candidate molecules and a definitive search which selects |
597 |
|
|
from these using SMARTS pattern matching. The maximum number of candidates |
598 |
|
|
is 4000 by default but you can change this with an option |
599 |
|
|
e.g. -al 8000 (Note that you need the space between l and the number.) |
600 |
|
|
If the fingerprint search reaches the maximum number it will not |
601 |
|
|
look further and will tell you at what stage it stopped. |
602 |
|
|
|
603 |
|
|
<strong>Similarity search</strong> in a previously prepared index file<br> |
604 |
|
|
This rather done (rather than a substructure search) if the -at option is used, |
605 |
|
|
\code |
606 |
|
|
babel datafile.xxx outfile.yyy -sSMILES -ifs -at0.7 |
607 |
|
|
\endcode |
608 |
|
|
for instance |
609 |
|
|
- -at0.7 will recover all molecules with Tanimoto greater than 0.7 |
610 |
|
|
- -at15 (no decimal point) will recover the 15 molecules with largest coefficients. |
611 |
|
|
- -aa will add the Tanimoto coefficient to the titles of the output molecules. |
612 |
|
|
|
613 |
|
|
All stages, the indexing, the interpretation of the SMILES string in the -s option, |
614 |
|
|
the file in the -S option and the final SMARTS filter convert to OBMol and apply |
615 |
|
|
ConvertDativeBonds(). This ensures thatforms such as[N+]([O-])=O and N(=O)=O |
616 |
|
|
are recognized as chemically identical. |
617 |
|
|
*/ |
618 |
|
|
|
619 |
|
|
}//Openbabel |
620 |
|
|
|
621 |
|
|
//! \file fingerprint.cpp |
622 |
|
|
//! \brief Definitions for OBFingerprint base class and fastsearch classes |