% Preamble: macro file, sans-serif font, running-head text and picture macros.
%
% \input makrzhi reads the macro file `makrzhi', which is not part of this
% section. \scaledpicture, used by every picture macro below, is not defined
% here and presumably comes from that file -- TODO confirm.
% \sf is bound to the font cmss8.
% \chapter starts out empty; it is redefined at the start of each chapter and
% expanded inside \headline.
%
% Picture macros: \Gusymbol and every \bild<Swedish numeral> macro expand to
%     \scaledpicture <dimen> in by <dimen> in (<picture name> scaled <factor>)
% The two dimensions are presumably the width and height of the picture box --
% TODO confirm against the definition of \scaledpicture. The scale factor is
% 500 for \Gusymbol and 200 for all \bild... macros in this block.
% NOTE(review): \bildatta names its picture `Bildatta' with a capital B while
% every other macro uses an all-lowercase name -- confirm against the actual
% file name (matters on case-sensitive file systems).
\input makrzhi \font\sf = cmss8 \def\chapter{} \def\Gusymbol{\scaledpicture 4.33 in by 4.35 in (Gusymbol scaled 500)} \def\bildett{\scaledpicture 5.01 in by 3.46 in (bildett scaled 200)} \def\bildtva{\scaledpicture 6.33 in by 3.67 in (bildtva scaled 200)} \def\bildtre{\scaledpicture 4.22 in by 2.99 in (bildtre scaled 200)} \def\bildfyra{\scaledpicture 3.06 in by 2.21 in (bildfyra scaled 200)} \def\bildfem{\scaledpicture 3.39 in by 2.04 in (bildfem scaled 200)} \def\bildsex{\scaledpicture 2.26 in by 1.74 in (bildsex scaled 200)} \def\bildsju{\scaledpicture 3.04 in by 2.24 in (bildsju scaled 200)} \def\bildatta{\scaledpicture 4.99 in by 2.19 in (Bildatta scaled 200)} \def\bildnio{\scaledpicture 3.32 in by 2.01 in (bildnio scaled 200)} \def\bildtio{\scaledpicture 6.50 in by 4.72 in (bildtio scaled 200)} \def\bildelva{\scaledpicture 3.64 in by 1.82 in (bildelva scaled 200)} \def\bildtolv{\scaledpicture 3.04 in by 2.15 in (bildtolv scaled 200)} \def\bildtretton{\scaledpicture 6.06 in by 3.24 in (bildtretton scaled 200)} \def\bildfjorton{\scaledpicture 4.15 in by 2.83 in (bildfjorton scaled 200)} \def\bildfemton{\scaledpicture 3.88 in by 3.51 in (bildfemton scaled 200)} \def\bildsexton{\scaledpicture 3.89 in by 4.28 in (bildsexton scaled 200)} \def\bildsjutton{\scaledpicture 4.65 in by 2.93 in (bildsjutton scaled 200)} \def\bildarton{\scaledpicture 3.29 in by 3.88 in (bildarton scaled 200)} \def\bildnitton{\scaledpicture 2.24 in by 2.63 in (bildnitton scaled 200)} \def\bildtjugo{\scaledpicture 5.44 in by 2.39 in (bildtjugo scaled 200)} \def\bildtjugoett{\scaledpicture 5.94 in by 3.38 in (bildtjugoett scaled 200)} \def\bildtjugotva{\scaledpicture 8.19 in by 4.10 in (bildtjugotva scaled 200)} \def\bildtjugotre{\scaledpicture 4.01 in by 2.86 in (bildtjugotre scaled 200)} \def\bildtjugofyra{\scaledpicture 4.86 in by 3.28 in (bildtjugofyra scaled 200)} \def\bildtjugofem{\scaledpicture 6.10 in by 4.06 in (bildtjugofem scaled 200)} \def\bildtjugosex{\scaledpicture 2.19 in by 
2.58 in (bildtjugosex scaled 200)} \def\bildtjugosju{\scaledpicture 2.94 in by 2.39 in (bildtjugosju scaled 200)} \def\bildtjugoatta{\scaledpicture 2.92 in by 2.22 in (bildtjugoatta scaled 200)} \def\bildtjugonio{\scaledpicture 1.64 in by 2.61 in (bildtjugonio scaled 200)} \def\bildtrettio{\scaledpicture 2.82 in by 1.86 in (bildtrettio scaled 200)} \def\bildtrettioett{\scaledpicture 4.63 in by 1.60 in (bildtrettioett scaled 200)} \def\bildtrettiotva{\scaledpicture 4.00 in by 1.79 in (bildtrettiotva scaled 200)} \def\bildtrettiotre{\scaledpicture 2.10 in by 2.25 in (bildtrettiotre scaled 200)} \def\bildtrettiofyra{\scaledpicture 2.93 in by 1.69 in (bildtrettiofyra scaled 200)} \def\bildtrettiofem{\scaledpicture 2.29 in by 2.90 in (bildtrettiofem scaled 200)} \def\bildtrettiosex{\scaledpicture 2.96 in by 2.96 in (bildtrettiosex scaled 200)} \def\bildtrettiosju{\scaledpicture 3.19 in by 2.00 in (bildtrettiosju scaled 200)} \def\bildtrettioatta{\scaledpicture 3.19 in by 2.00 in (bildtrettioatta scaled 200)} \def\bildtrettionio{\scaledpicture 3.19 in by 2.00 in (bildtrettionio scaled 200)} \def\bildfyrtio{\scaledpicture 3.18 in by 2.86 in (bildfyrtio scaled 200)} \def\bildfyrtioett{\scaledpicture 4.04 in by 4.04 in (bildfyrtioett scaled 200)} \def\bildfyrtiotva{\scaledpicture 2.26 in by 2.43 in (bildfyrtiotva scaled 200)} \def\bildfyrtiotre{\scaledpicture 10.06 in by 0.81 in (bildfyrtiotre scaled 200)} \def\bildfyrtiofyra{\scaledpicture 2.90 in by 3.14 in (bildfyrtiofyra scaled 200)} \def\bildfyrtiofem{\scaledpicture 3.88 in by 3.72 in (bildfyrtiofem scaled 200)} \def\bildfyrtiosex{\scaledpicture 2.47 in by 2.28 in (bildfyrtiosex scaled 200)} \def\bildfyrtiosju{\scaledpicture 2.74 in by 2.54 in (bildfyrtiosju scaled 200)} \def\bildfyrtioatta{\scaledpicture 4.17 in by 1.93 in (bildfyrtioatta scaled 200)} \def\bildfyrtionio{\scaledpicture 4.15 in by 1.88 in (bildfyrtionio scaled 200)} \def\bildfemtio{\scaledpicture 2.61 in by 2.03 in (bildfemtio scaled 200)} 
\def\bildfemtioett{\scaledpicture 2.68 in by 2.06 in (bildfemtioett scaled 200)} \def\bildfemtiotva{\scaledpicture 3.08 in by 2.42 in (bildfemtiotva scaled 200)} \def\bildfemtiotre{\scaledpicture 3.08 in by 2.42 in (bildfemtiotre scaled 200)} \def\bildfemtiofyra{\scaledpicture 4.50 in by 2.68 in (bildfemtiofyra scaled 200)} \def\bildfemtiofem{\scaledpicture 3.14 in by 3.18 in (bildfemtiofem scaled 200)} \def\bildfemtiosex{\scaledpicture 2.60 in by 2.33 in (bildfemtiosex scaled 200)} \def\ecgonine{\scaledpicture 2.28 in by 2.22 in (ecgonine scaled 1000)} \def\example{\scaledpicture 6.94 in by 5.49 in (example scaled 700)} \def\ipratropbromine{\scaledpicture 6.94 in by 5.49 in (ipratropbromine scaled 700)} \overfullrule=0pt \parskip = 6pt plus 1pt
% This line is protected by Swedish and international copyright laws
\footline={} \bigskip \bigskip {\bigabf \centerline{Encoding Chemical Information} \vskip 0.5cm \centerline{into Bit-Strings for the} \vskip 0.5cm \centerline{Purpose of Virtual Screening}} \vskip 3cm \centerline{{\bigait K\aa{}re Andersson}} \vskip 3cm \centerline{\Gusymbol} {\bigabf \vskip 3cm \centerline{Department of Theoretical Chemistry} \centerline{G\"oteborg University} \centerline{G\"oteborg, Sweden} \vskip 0.5cm \centerline{2004}} \medskip \vskip 3cm \vfill \eject \footline={} \pagezhi \parskip=6pt plus 1pt {\bf \noindent List of Contents} \bigskip {\parindent = 1cm} \litem{1.} Introduction \dotfill { 1} \litem{1.1} Objectives of this Thesis \dotfill { 3} \bigskip \litem{2.} Virtual Screening \dotfill { 5} \litem{2.1} Tools and Related Methods \dotfill { 6} \litemitem{2.1.1} \quad Methods for classification of ligands \dotfill { 6} \litemitem{2.1.2} \quad Statistical approaches \dotfill { 7} \litemitem{2.1.3}\quad Methods that use target information \dotfill { 7} \litemitem{2.1.4} \quad Methods that use molecular descriptors \dotfill { 7} \litemitem{2.1.5} \quad Molecular fingerprints \dotfill { 7} \litem{2.2} Describing Chemical Structures by 
Bit-Strings \dotfill { 8} \litemitem{2.2.1} \quad Keying \dotfill { 8} \litemitem{2.2.2} \quad Hashing \dotfill { 8} \litemitem{2.2.3} \quad Folding \dotfill { 8} \litem{2.3} Global Parameters for the Screening Process \dotfill { 9} \litem{2.4} The Enrichment Factor of Virtual Screening \dotfill { 10} \bigskip \litem{3.} Molecular Similarity \dotfill { 11} \litem{3.1} Similarity Processing \dotfill { 11} \litem{3.2} Two Dimensional Similarity Searches \dotfill { 12} \litem{3.3} Three Dimensional Similarity Searches \dotfill { 12} \litem{3.4} Extended Use of Similarity \dotfill { 13} \litem{3.5} Limitations of the Similarity Concept \dotfill { 13} \litem{3.6} Molecular Diversity \dotfill { 14} \bigskip \litem{4.} A Virtual Screening Method Based on Structural Searches \dotfill { 16} \litem{4.1} The Approach used in this Work \dotfill { 17} \litemitem{4.1.1} \quad Conversion of structural information \dotfill { 18} \litemitem{4.1.2} \quad Similarity \dotfill { 19} \litemitem{4.1.3} \quad Fuzzy Similarity \dotfill { 21} \litem{4.2} Generation of the Virtual Library \dotfill { 21} \litem{4.3} Description of the Program kol.f90 \dotfill { 26} \litemitem{4.3.1} Unavailable Structural Features \dotfill { 29} \litem{4.4} Program program4.f90 \dotfill { 29} \litem{4.5} Program program5mass.f90 \dotfill { 31} \litem{4.6} Intervals and Distances \dotfill { 32} \litem{4.7} The Functional Unit \dotfill { 34} \bigskip \litem{5.} Results and Discussion \dotfill { 36} \litem{5.1} Results \dotfill { 37} \litem{5.2} Discussion \dotfill { 38} \litem{5.3} Interpretation \dotfill { 39} \bigskip \litem{6.} Conclusions and Outlook \dotfill { 41} \bigskip \litem{7.} Acknowledgments \dotfill { 42} \bigskip \litem{8.} References \dotfill { 43} \bigskip \litem{9.} Appendix A \dotfill { 47} \litem{9.1} Table 9.1 \dotfill { 48} \litem{9.2} Table 9.2 \dotfill {52} \litem{9.3} Table 9.3 \dotfill {54} \litem{9.4} Table 9.4 \dotfill {57} \litem{9.5} Table 9.5 \dotfill {59} \litem{9.6} Table 9.6 
\dotfill {63} \litemitem{9.6.1} \quad Benzatropine \dotfill {63} \litemitem{9.6.2} \quad Budesonide \dotfill {66} \litemitem{9.6.3} \quad Bupivacaine \dotfill {69} \litemitem{9.6.4} \quad 2-Chloro-4-hydroxy-6-amino-1,3,5-triazine \dotfill {72} \litemitem{9.6.5} \quad 2-Hydroxyglutarate \dotfill {74} \litemitem{9.6.6} \quad 2-Chloroethanol \dotfill {76} \litemitem{9.6.7} \quad 2-Hydroxy-4,6-diamino-1,3,5-triazine \dotfill {77} \litemitem{9.6.8} \quad Cocaine \dotfill {79} \litemitem{9.6.9} \quad 4-Methylheptan-3-ol \dotfill {82} \litemitem{9.6.10} \quad Lofepramine \dotfill {84} \litemitem{9.6.11} \quad Amphetamine \dotfill {87} \litemitem{9.6.12} \quad Acetophenone \dotfill {89} \litemitem{9.6.13} \quad Meclizine \dotfill {91} \litemitem{9.6.14} \quad Methadone \dotfill {94} \litemitem{9.6.15} \quad Morphine \dotfill {97} \litemitem{9.6.16} \quad Noscapine \dotfill {100} \litemitem{9.6.17} \quad Oxotremorine \dotfill {103} \litemitem{9.6.18} \quad Atrazine \dotfill {106} \litemitem{9.6.19} \quad Benzyl alcohol \dotfill {108} \litemitem{9.6.20} \quad Procaine \dotfill {110} \litemitem{9.6.21} \quad Simanneal \dotfill {112} \litemitem{9.6.22} \quad Terfenadine \dotfill {115} \litemitem{9.6.23} \quad Tiazotienol \dotfill {119} \litemitem{9.6.24} \quad Trimipramine \dotfill {122} \litemitem{9.6.25} \quad Meropenem \dotfill {125} \litemitem{9.6.26} \quad Benzaldehyde \dotfill {128} \litemitem{9.6.27} \quad 2,4-Dihydroxy-6-amino-1,3,5-triazine \dotfill {130} \litemitem{9.6.28} \quad 2-Chloro-4,6-diamino-1,3,5-triazine \dotfill {132} \litemitem{9.6.29} \quad Spirodioxaundecane \dotfill {134} \litemitem{9.6.30} \quad Santene \dotfill {136} \litemitem{9.6.31} \quad S-7-Methyl-3-nonanone \dotfill {138} \litemitem{9.6.32} \quad R-Sulcatol 
\dotfill {140} \litemitem{9.6.33} \quad Phenol \dotfill {142} \litemitem{9.6.34} \quad R-Seudenol \dotfill {144} \litemitem{9.6.35} \quad Piperonal \dotfill {146} \litemitem{9.6.36} \quad Pentachlorophenol \dotfill {148} \litemitem{9.6.37} \quad Nicotine \dotfill {150} \litemitem{9.6.38} \quad Nicotine \dotfill {152} \litemitem{9.6.39} \quad Nicotine \dotfill {154} \litemitem{9.6.40} \quad myo-Inositol \dotfill {156} \litemitem{9.6.41} \quad Mescaline \dotfill {158} \litemitem{9.6.42} \quad m-Cresol \dotfill {161} \litemitem{9.6.43} \quad Linoleate \dotfill {163} \litemitem{9.6.44} \quad Isopropylammelide \dotfill {166} \litemitem{9.6.45} \quad Ibuprofen \dotfill {168} \litemitem{9.6.46} \quad Frontalin \dotfill {171} \litemitem{9.6.47} \quad exo-Brevicomin \dotfill {173} \litemitem{9.6.48} \quad Epinephrine \dotfill {175} \litemitem{9.6.49} \quad Dopamine \dotfill {177} \litemitem{9.6.50} \quad d-3-Hydroxyproline \dotfill {179} \litemitem{9.6.51} \quad d-Tartrate \dotfill {181} \litemitem{9.6.52} \quad Coniine \dotfill {183} \litemitem{9.6.53} \quad Coniine \dotfill {185} \litemitem{9.6.54} \quad Disulfiram \dotfill {187} \litemitem{9.6.55} \quad $\beta$-D-Galactose \dotfill {190} \litemitem{9.6.56} \quad Benzoic acid \dotfill {192} \litem{9.7} Program kol.f90 \dotfill {194} \litem{9.8} Program program5mass.f90 \dotfill {221} \litem{9.9} Program bearbprogram4.f90 \dotfill {227} \litem{9.10} Program refl.f90 \dotfill {239} \litem{9.11} File kolfil \dotfill {246} \litem{9.12} Definitions kolfil \dotfill {249} \litem{9.13} Help File ``intervalspec'' \dotfill {251} \litem{9.13.1} File ``bioisosterism'' \dotfill {253} \litem{9.14} General Definitions \dotfill {254} \litem{9.15} Example Ipratropbromine \dotfill {257} \bigskip \litem{10.} Appendix B \dotfill { 296} \litem{10.1} Feature Tables \dotfill { 296} \litem{10.2} Similarity Tables \dotfill { 319} \vfill \eject \parindent=20pt \footline={\hss\tenrm\folio\hss} \pageno 1 \def\chapter{1. 
Introduction} \headline={\vbox{ \line{\strut \hss\ninerm\chapter\hss} \hrule} \hss} \def\chapter{1. Introduction} \def\num{1. } \def\title{ Introduction} {\kapitel} Although the history of CADD (computer aided drug design) reaches back just a few decades, the development in this area has been rapid. [1-7] One important tool of CADD is the method of virtual screening of molecules. [8-14] Virtual screening is the computational counterpart to high-throughput screening (HTS). In HTS a set of experimental techniques including robotic and automation technology is used for rapidly testing the activity of large numbers of compounds for their potential use as drugs (ligands). [8] HTS is good for testing hundreds or even thousands of compounds. Advanced technologies suggest that HTS may be good for 10 000 compounds per chemist and year. However, this means almost nothing compared to the number of chemical compounds contained in the virtual chemistry space, which comprises definitely more than $10^{1000}$ compounds. Clearly, not all of the virtual chemistry space is of interest in drug research. It is likely that compounds with potential drug activity cluster in parts of the space. But in these parts, there are so many compounds that HTS is an invalid tool. For the purpose of screening larger numbers of compounds virtual screening has to be used, which can be applied to a virtual compound library with 10$^6$ up to 10$^{12}$ compounds. Using suitable criteria and filters, repeated searches are made through the virtual library with the goal of decreasing its size stepwise. There is no unique strategy that leads the virtual screening process. However, there are a number of criteria that have to be considered in virtual screening. Because virtual screening depends heavily on the available computer resources and because of the industrial nature of CADD, virtual screening is applied mostly in a pragmatic way with the goal of speeding up drug discovery in the most economic way. 
The ultimate goal for a pharmaceutical company is always profit, which becomes especially clear in the case of CADD. [1-7] Nevertheless, several companies do engage in CADD and virtual screening not just as suitable tools but also as important parts of computational chemistry, which is the reason for the strong development of CADD and virtual screening in industry rather than university. A combination of scientific and industrial interests also stimulates progress in a field, and this is clearly obvious in CADD and virtual screening. As already mentioned, virtual screening is an entirely computer-based method for searching and retrieving target compounds that may represent novel lead compounds. The non-virtual approach used to be a combination of random screening and rational design. Deficiencies in the available information that describes structure and properties of the receptor target often make this a frustrating and not cost-effective way to develop drugs. [1-7,15,16] Chemical synthesis is also costly in itself and not even possible when the goal is to build really large libraries of compounds. Many chemical compounds cannot be stored over a long period of time without decay. The development of computers with better and better performance parameters has made it attractive to describe chemical compounds with suitable descriptors [17-20] and then to store the corresponding information in the form of bit-strings, i.e. in a binary code made of 0 and 1. By this, virtual screening becomes a method that complements current advances in high-throughput chemical synthesis and biological assay. Within the library, simple, comparative compound searches can be easily performed. This is possible with a similarity association based strategy rather than by development of a single, successively renewed lead. Raw virtual screening must be used with care to avoid a low hit ratio. 
However, controlled use of virtual screening on especially focused sets of compounds can make the entire process of searching and optimizing leads cost-effective and fast [8]. This makes a comparison of different compounds easy so that millions of compounds can be screened for certain similarity features. [15,21,22] Two occurrences in identical positions in two compared bit-strings make a hit. The success of virtual screening is measured by the so-called hit frequency, i.e. the number of molecules found with similar features. This number can be specified in a quantitative way using for example Tanimoto coefficients [1,10]. Of course, virtual screening must be used with care to avoid a low hit ratio. A number of guidelines help to fulfill this task. For example, meaningful similarities between compounds can only be found if the virtual library is diverse and spans a larger part of the total virtual chemistry space. Therefore, special care has to be taken to define suitable diversity measures. A complement to the Tanimoto coefficient can be used in this connection or, alternatively, new diversity measures such as the Hamming diversity measure and the Dixon-Koehler coefficient. [10] Techniques for virtual screening can be used in parallel with non-virtual screening methods to optimize specific results. In this situation, the advantages of the computational approach become clearly evident. Large databases, which take one or two months to be screened with conventional techniques, are easily managed by virtual screening within only a few hours. [12] An intelligent use of virtual screening on special targeted sets of compounds can make the entire process of searching and optimizing leads cost-effective and fast. [14] \bigskip \noindent {\bf 1.1 Objectives of this Thesis} \smallskip Virtual screening depends on the existence of suitable compound libraries. 
Since these are mostly compiled in pharmaceutic industry and are considered as one of the values of the company, they are not available for the public. Virtual libraries are literally not existent at universities and this is the reason why virtual screening can only be discussed at the university on a purely abstract basis rather than a hands-on basis. [1] It was the goal of this thesis to lift a bit the veil surrounding virtual screening and investigate some of the essentials of the virtual screening process. For this purpose a compound library had to be compiled within this thesis. It contains just 56 examples of bioactive compounds and has to be considered as being rather small, however it fulfills already the purpose of showing some basic features of virtual screening. Apart from the library it had to be decided which molecular features are used for the screening process. Always available are structural features, either in form of topological information, connectivity patterns or geometric data (internal coordinates, Cartesian coordinates, etc.). We have therefore concentrated in this work exclusively on structural features. In this connection, we have developed a procedure that converts structural information into bit-strings, which are easily compared for similarities. One comparative operation of one molecule with regard to a theoretically unlimited number of library molecules is imagined to take just a millisecond or so on a computer with a clock-cycle of a couple of nanoseconds. Clearly, these timings can even be reduced by using computers, for which the comparison steps are integrated in the computer hardware. Here, in this work more modest goals are accomplished. Using the library of 56 organic ligands, the following questions are investigated: {\noindent} 1) What is the fastest and computationally simplest way of converting and comparing organic ligands? {\noindent} 2) What is a useful similarity measure that makes use of raw hit frequencies only? 
How useful are Tanimoto coefficients in this connection? {\noindent} 3) What routes of improvement result from this investigation, both with regard to more advanced similarity measures, inclusion of other compound properties, extended targeting, etc.? {\noindent} 4) What practical aspects have to be considered when designing a virtual screening method? {\noindent} 5) Can one make predictions about the future of virtual screening based on the development work carried out in this thesis? \noindent For the purpose of answering these questions, the results of this thesis are presented in the following way. In Chapter 2, the process of virtual screening and the techniques relevant in this connection are described in somewhat more detail. Chapter 3 focuses on similarity measures. The computational method developed in this work is described in Chapter 4. Results are discussed in Chapter 5 and the conclusions of this work are summarized in Chapter 6. Since the major part of this work focuses on the establishment of a virtual library and the programming of a suitable virtual screening method that carries out structure conversions and similarity comparison, this thesis contains a large appendix with the documentation of the library, programs, and the results of the similarity comparison. \vfill\eject \def\chapter{2. Virtual Screening} \headline={\vbox{ \line{\strut \hss\ninerm\chapter\hss} \hrule} \hss} \def\num{2.} \def\title{Virtual Screening} {\kapitel} The development of virtual screening makes it possible to handle large structural databases of molecules. The modern way of selecting a small dedicated set of potential leads could start with a huge virtual library of $10^{12}$ molecules. The task is to find those compounds that fulfill certain criteria defined by one (or more) reference lead(s). First in the virtual screening process, REOS (rapid elimination of swill) [8-14] is employed to reduce the virtual library to about $10^9$ compounds. 
In a second step the reference compounds and 2D similarity are used to further reduce the library to about $10^7$ compounds. For these compounds only the most stable conformation is considered, which may not be the conformation with biological activity. Therefore, the library is extended by a factor of 10 by considering additional conformations. Considering 3D similarity and comparing reference leads and library compounds may lead to a reduction to ${10^5}-{10^6}$ compounds. By virtual docking, [14,18] scoring ($\Delta$G$-$calculations) [5], and ADME (absorption, distribution, metabolism, excretion) [1,4] one is able to concentrate on just 10$-$20 of these structures. ADME is actually not considered as a part of virtual screening. Hence, the number of candidates is about 100 at the end of the virtual screening process. Virtual screening is the logical answer to the question how a computational process can extend and replace HTS (high-throughput screening) [9] in the drug search process. In most cases, one prerequisite for virtual screening is a fair amount of knowledge about the desired lead properties (structure of the ligand, affinity to the receptor, etc.), which in turn depend on the actual task of curing a particular disease. This normally requires detailed knowledge about the receptor. As with HTS, only one receptor is generally considered as main reference. This is a typical simplification without which the research would be much more difficult. But virtual screening serves just as a tool to get targeted libraries. If rather unspecified properties such as molecular weight (MW) are available to restrict the library, one will obtain an open-ended subset of structures. Hence, the limits of use of virtual screening are determined by the properties which can be applied as suitable filters in the screening process. In view of the larger libraries that have to be handled the filtering and comparison must be extremely fast to handle the computational cost. 
Compared to this, the cost for development of suitable screening programs does not play an important role. Also costly is the compilation of a sufficiently large and sufficiently diverse virtual library of compounds that embeds the structure areas with potential lead compounds. One strategic path is to begin with a number of initial building blocks for the library, which are obtained from various areas of structural chemistry (crystallographic databanks [29], company databanks, cas-on-line databanks, etc.). Upon these data blocks one can build the library in the virtual space using techniques that simulate real conditions, so that a library of meaningful virtual compounds is achieved. Candidate molecules can be designed by an evolutionary protocol, guided by a fitness function based on the distance of the correlation vector from that of the reference structure. Within the framework of this library, searches for active compounds, similar to already known ligands, are done. \smallskip \noindent {\bf 2.1 Tools and Related Methods} \smallskip Virtual screening methods can be divided into two categories [9]. First, there are methods that use a lot of target information and then there are methods that are based on descriptive work with small molecules, i.e. they concentrate more on the ligands. [11] A set of known ligands of known affinity with regard to a macromolecular target can reduce the need for structural information about the target itself. The common strategy is to use QSAR (quantitative structure activity relationship) [23-25] to develop pharmacophores [26,27] on which shape searches are made. In reality, there is no distinct border which separates the two categories of virtual screening methods. Hence, the target-based methods comprise docking of ligands to a receptor (mostly a protein), but also the development of pharmacophores from active site information. The pharmacophore approach also applies to some methods where the focus is on similarity. 
One tool which is of special importance for the methods focusing on the ligand properties, is the compound library. As mentioned above, the library can be assembled from commercial sources or from results of chemical analysis so far it is available in journals and books. It can also be developed as a true virtual library, where all compounds are virtually designed. \smallskip \noindent {\bf 2.1.1 Methods for classification of ligands:} These include among others Cluster analysis, partitioning of compounds in an absolute cell space, and 3D and 4D QSAR. [9] Here, we just discuss shortly the QSAR methods. QSAR is a statistical tool to analyze a set of properties or descriptors [17-20] for a series of biologically active compounds. [11] The statistical information is used to make predictions about activity of other molecules with regard to the target, i.e. the receptor. QSAR can be used as a statistic filter for a series of proposed compounds in order to spend resources for synthesis and tests only on compounds, which are likely to be active. With new information from additional experiments, the statistic model can then be refined and used again. Properties that can be employed to describe small molecules are hash codes with substructural information and also global measurements like log P [30] and MW. Structural target information can also be merged into the statistical model. \smallskip \noindent {\bf 2.1.2 Statistical approaches:} In this connection we mention just recursive partitioning, which is a statistical method to internally relate information in large data sets where simple linear relationships are used. For this purpose, non-linear relationships are reformulated in a linear fashion. The subtechniques may imply PCA (principal component analysis),[31,34] or PLS (partial least squares analysis). [32,34] The origin of these methods has been in the area of theoretical biochemistry. Other methods such as binary QSAR and QSPR are described in the literature. 
[33] \smallskip \noindent {\bf 2.1.3 Methods that use target information:} Information on the receptor is used when developing active site-directed pharmacophores or in the docking of the ligand to the receptor where the latter process is not considered to be part of virtual screening. Active site-directed means that the primary interest is to identify an active site of the target, for instance by letting the computer fill up all void spaces between the atoms in the target structure with spheres, so that internal surface clefts and cavities can be identified. One program which can be employed in this connection is SURFNET [35]. The retrieved information is used to predict how a ligand should be designed to fit into a receptor bay or cleft. \bigskip \noindent {\bf 2.1.4 Methods that use molecular descriptors:} Here we mention just the 2D similarity tests, [15,16,36] the technique of using 3D pharmacophores, [26,27,37] and complex algorithms that are based on quantum mechanics or electrotopological maps [38]. \bigskip \noindent {\bf 2.1.5 Molecular fingerprints:} To this group of tools belong keyed 2D fingerprints [39,40] (all bit positions refer to individual chemical properties or features), hashed (randomly encoded) 2D fingerprints, and 3D fingerprints. [39,40,41] \bigskip \bigskip \noindent {\bf 2.2 Describing Chemical Structures by Bit-Strings} \smallskip The idea of standardized bit-strings as molecular descriptors implies that substructural features are indicated by binary ones (1:s) in a range of predefined positions in a string [42,43]. All bit-strings are subjected to standardization of length and bit positional encoding. This standardization leads to a set of structures which can be computationally compared by using bit operations (i.e.\ Boolean algebra). The outcome of these bit operations can, string pair by string pair, be statistically described and converted for instance into a Tanimoto coefficient [10,44] (see Section 3.1). 
In the following we will briefly discuss some techniques that play a role in this connection. \bigskip \noindent {\bf 2.2.1 Keying:} For the purpose of identifying the substructures of a molecule, one makes use of substructure keys, a procedure called {\it keying}. Each position in a bit string corresponds to a specific substructure. If a given substructure is present, the corresponding bit is set to 1. No distance information is stored in this process. \bigskip \noindent {\bf 2.2.2 Hashing:} Another way of encoding substructural information is to use {\it hashing}. Here, the bit positions that correspond to special structural features are not predefined, but result from a pseudorandom number generator, which takes integers from patterns produced by systematic transformations of the molecular graph. The sequence of positions connected to the substructure is difficult to trace back to the molecular graph, but it does not matter as long as one and the same input to the pseudorandom number generator always gives the same output. Other inputs are highly unlikely to result in a similar output (see Section 3.4: Extended Use of Similarity). \bigskip \noindent {\bf 2.2.3 Folding:} A bit string containing structural information can be subjected to {\it folding} in order to place more information into a file of limited size. This is a way of equalizing binary 1:s and 0:s in the bit string. A comparison within a library of folded representations will most likely return some totally irrelevant structures in response to a certain query. Each bit represents the presence of a special fragment, where the different alternatives are chemically unrelated to each other. All molecules which would be returned without folding are present in this group, and there is a gain in performance which is not negligible. For the result, an atom-by-atom comparison can be made to rule out improper elements. 
This latter step cannot be done as fast as the screening operation itself. \bigskip \bigskip \noindent {\bf 2.3 Global Parameters for the Screening Process} \smallskip Global parameters are often used to describe molecules in a statistical fashion. They can also be flagged in bit-strings, although this is not common. Examples of parameters that can be handled this way are given in the following list [8-14]: {\obeylines
{\noindent} 1) Number of amine groups
{\noindent} 2) Number of nitro groups
{\noindent} 3) Number of acid groups
{\noindent} 4) Number of amide groups
{\noindent} 5) Number of rotatable groups
{\noindent} 6) Number of reactive groups
{\noindent} 7) Molecular weight
{\noindent} 8) Dipole moment
{\noindent} 9) Solvent Accessible Surface Area
{\noindent} 10) Hydrophobic Surface Area
{\noindent} 11) Weakly Polar Surface Area
{\noindent} 12) Volume
{\noindent} 13) Solute-as-donor Hydrogen Bonds
{\noindent} 14) Solute-as-acceptor Hydrogen Bonds
{\noindent} 15) Onsager dipole solution index $(Dipole^2/V)$
{\noindent} 16) Cohesive index for solid
{\noindent} 17) PM3 ionization potential and electron affinity
{\noindent} 18) Globularity}

\noindent One example of how this information can be used to reduce a virtual library is given by Waszkowycz et al. [14]. Before docking, all compounds in the library were filtered through a logical filter and statistically tested. The conditions were the following: [14] {\parindent = 1cm \litem{1.} MW 200-600 \litem{2.} Number of heavy atoms $>$ 10 \litem{3.} Log P (-8) - (7) \litem{4.} Number of hydrogen bond acceptors $<$ 9 \litem{5.} Number of hydrogen bond donors $<$ 9 \litem{6.} Number of rotatable bonds $<$ 11 \litem{7.} Number of asymmetric carbon atoms $<$ 4 \parindent = 20pt \noindent With focus on the first list, a factor which is partly covered by including intramolecular pair distances is the volume. 
If one believes that the volume is important in itself one will have to choose structural parameters or features that indirectly bring in the volume in the screening algorithm. Also, it is possible to consider the volume as an individual representation parameter. This however is not done in the programs developed in this work. It is highly likely that any decent presentation in terms of structural features and distances will be redundant in some ways. On the other hand, many of the structural parameters are extremely easy to single out for a description in computational form. \bigskip \bigskip \noindent {\bf 2.4 The Enrichment Factor of Virtual Screening} \smallskip The best tool for judging the outcome of a chosen virtual screening approach is the {\it enrichment factor} [13]. Virtual Screening is used to suggest compounds in a data set, which are ideally of high affinity to a certain receptor. To validate the suggestions made, one has to check the outcome against a sample chosen at random. The enrichment factor describes how much higher the proportion of hits (i.e.\ active compounds) is in any given sample of entities compared with the randomly picked sample [11]. When performing virtual screening the ability to achieve enriched samples is of much greater value than the ability to predict single affinities with high accuracy. In a library containing nL compounds, a sample with nA compounds has been experimentally verified as bioactive. By virtual screening nN compounds are predicted as active and of these f(nN) belong to the group of known bioactive compounds. A randomly picked sample of nN compounds will on average contain nA nN/nL active molecules, i.e.\ a proportion nA/nL of actives. Therefore, the formula for the enrichment factor is [13]: $$ Ef={f(nN) / nN \over nA/nL} = {f(nN) \cdot nL \over nN \cdot nA} \eqno(2.1) $$ An enrichment factor of 1 is no better than random selection. \vfill \eject \bigskip \def\chapter{3. 
Molecular Similarity} \headline={\vbox{ \line{\strut \hss\ninerm\chapter\hss} \hrule} \hss} \def\num{3.} \def\title{Similarity} {\kapitel} In the first 25 years of the existence of computational structural retrieval systems, the design of these methods was based on techniques derived from graph theory, which implied two-dimensionality. A new concept, which included three-dimensional descriptors, was introduced in the late 1980s and then over time developed to comprise also conformational features [15,16]. In the beginning, one focused on exact searches and substructure searches. The next advancement was due to the introduction of a well-defined similarity concept [15]. \bigskip \noindent {\bf 3.1 Similarity Processing} \smallskip \noindent In the area of virtual screening, it is common to distinguish between three types of similarity processing approaches [15]. {\noindent} A) This method uses quantum mechanical descriptors to simulate chemical structures. Electron probability functions are used to model the molecular shape and nature. Agreement between the electron probability density distribution of a given molecule and that of a suitable reference counts as evidence for (partial) similarity. The number of structures that are correctly related to each other by this approach is normally rather low. {\noindent} B) This approach is based on maps of steric, electrostatic, and hydrophobic properties of an individual molecule. The maps are broken down into point distances in an absolute space. The use of grids is reminiscent of CoMFA (Comparative Molecular Field Analysis) [28] for 3D QSAR [23]. {\noindent} C) In the third type of approaches, the molecules are represented as fragmented bitmaps reflecting substructures or other easy-to-describe properties. Such an approach will be developed and used in this work.
\vfill \eject \noindent {\bf 3.2 Two-Dimensional Similarity Searches} \smallskip One considers these algorithms 2D-search techniques because they use 2D-descriptors, i.e.\ descriptors which do not consider conformational features, but just the connectivity of the atoms in the molecule and the types of the atoms forming the molecule. The 2D-representations are at the same level as for example a Lewis structure of a molecule. Within a 2D-similarity comparison based on 2D-descriptors it is easy to carry out substructure searches. The presence of a substructure is marked by the setting of a binary 1 in a bit-string. Each pair of atoms is considered and the atoms in each pair are related to each other by the shortest bond path. In this way, all conformational information is disregarded. The atom--atom distances can be represented as integer labels for the atoms along a given bond path. All information can be digitized and set into a bit string. If the same atom types are present in two different molecules and in both cases separated by distances that lie in a common interval, this will be interpreted as a case of substructural similarity. Searches for substructures are of course not necessarily limited to just one fragment, but can be for an entire set of substructures. \bigskip \noindent {\bf 3.3 Three-Dimensional Similarity Searches} \smallskip When a 3D virtual library is screened, atom pairs are considered in a similar manner as for 2D-screening. The main difference is that the third parameter (augmenting the integer labels for atom type 1 and atom type 2) is a real distance rather than another integer bond label that denotes edges of a 2D graph. The distances are categorized according to a predefined set of minimum--maximum intervals. This corresponds to a discrete representation which can be converted into a bit string. Isomorphic 3D searches [15, 41] are often referred to as geometric searches.
Because an exact geometry is evaluated, different conformations of the molecules contained in the virtual library also become an important matter. To look at similarity in this way is the same as introducing the lock and key model and the bioactivity concept in the similarity definition. Although each molecule has its energetically optimal conformation, the union of receptor and interacting ligand will imply another energetically favorable conformation of the ligand. The latter is likely to be close to the first conformation, but experience suggests that reliable predictions cannot be made in this connection. This is the reason why 3D-searches must be done with different conformations of each considered molecule in the initial library where of course more weight is placed on low energy conformations. \bigskip \noindent {\bf 3.4 Extended use of Similarity} \smallskip The choice of descriptors must be done in a way that meaningful search results on common queries are returned by the screening algorithm [15,21,22]. This becomes even more important when gradual similarity is considered. Given an interest for substructure relationships between molecules, there are two different approaches. Either the question is limited to a single substructure, and all returned structures must contain this substructure, or it is formulated as an entire structure next to which the computer is expected to suggest a list of analogues by substructural resemblance. In the former case, all fragments included in the query must be in the returned structures. In the latter case, there must be a weighting scheme so that matches can be listed by priority. If solved properly, the output also will list the best matches first in the result list, so that truncation of the results becomes logical. \bigskip \noindent {\bf 3.5 Limitations of the Similarity Concept} \smallskip Raw screening has seldom resulted in successful drugs [8,9]. 
Although screening possibilities have been increased by several orders of magnitude, the frequency of drugs proposed for clinical trials does not show the same tendency. In order to use virtual screening as the effective tool it can be, one has to know about its limitations. Here are some examples: {\noindent} 1) If the search cannot be based on other active drugs fulfilling the same purpose, the method cannot be used. {\noindent} 2) 2D similarity searches return results with low structure diversity compared to the activity diversity. {\noindent} 3) All methods that are built on the concept of similarity are expected to make use of analogies. Although there are less approximate subtechniques in use, the similarity methodology focuses on screening large databases for interesting compounds, an approach which does in general exclude all advanced types of structure-describing formalism. Within these frames it is for instance not realistic to expect that one should be able to predict complicated solvation phenomena with any accuracy. {\noindent} 4) When using 3D pharmacophores, the fingerprint screening method must take a number of conformations into account. The larger the compounds are, the larger is in general also the number of possible low-energy conformations. To be able to handle this, one has to limit the library size, to exclude some possible conformations already from the beginning (which may lead to inadequate results) and to use simple pharmacophores. {\noindent} 5) Usually binary fingerprints of 3D pharmacophores merge all occurrences from all conformations in one and the same file. This means that occurrences which in reality are never simultaneously present can be matched for simultaneous existence. High order pharmacophores solve this problem but also imply higher computational cost. {\noindent} 6) The number of known compounds is much larger than the number of active drugs.
Most known drugs, and therefore probably also most of the unknown drugs, are clustered in the molecular space, i.e.\ they are not uniformly distributed. Because one is dealing with a huge number of molecules in the area of virtual screening, it will always be difficult to find the right balance between congruence and divergence when composing a library. Also, a library that is ideal for similarity searches targeting one type of drug is not ideal for other drugs. There is no way to escape the presence of unwanted agglomeration. It may sound strange, but similarity searches will only be successful if the virtual library screened is sufficiently divergent and avoids a clustering of compounds in an area of virtual chemistry space that is not hosting any potential leads. Hence, if one considers molecular similarity it will be a must to consider also molecular diversity. \bigskip \bigskip \noindent {\bf 3.6 Molecular Diversity} \smallskip The general concept of molecular diversity can be defined as the complement of molecular similarity [45]. Molecular diversity is of special interest when clustering a database, because it allows the use of complementary algorithms, which do not share the weaknesses of the average similarity-based algorithms, for instance when it comes to a dependence on the order in which compounds are taken into account. Molecular diversity concepts can be applied in different ways: \noindent \item{1.} Visualization by histograms, plots and multi-dimensional scaling \noindent \item{2.} Representation by molecular descriptors \noindent \item{3.} Reduction of dimensionality by factor analysis and PCA \noindent \item{4.} Clustering and diversity analysis of binary string representations \noindent The following discussion applies to (4). When calculating bimolecular diversity, two binary strings are used, one as the search query (A) and one as the reference structure for comparison (B).
For the matching operations, the following information has to be given: \noindent {\obeylines a = Number of 1:s in A
b = Number of 1:s in B
c = Number of common 1:s for A and B
d = All 1:s in A or B which are not common (XOR)
n = The length of the bit-strings
} \noindent The Tanimoto coefficient (Tc) is the coefficient most commonly used to draw conclusions with regard to similarity from this information [10,44]. $$ Tc = {c \over (a+b-c)} \eqno(3.1) $$ The Tanimoto diversity measure follows from (3.1): $$ D_{Tan}= 1 - Tc = 1 - {c \over (a+b-c)} \eqno(3.2) $$ Small molecules with a limited number of features tend to show high diversity when compared with other molecules. \noindent Another measure of diversity was proposed by Hamming [10,44]: $$ D_{Ham} = {d\over n} \eqno(3.3) $$ Large molecules with many structural features tend to show high diversity when compared with other molecules. \noindent Based on (3.2) and (3.3), a modified measure of diversity called the Dixon-Koehler modification [10,44] is proposed: $$ D_{D-K} = D_{Ham}D_{Tan} = {d \over n} \enskip \left(1-{c \over a+b-c}\right) \eqno(3.4) $$ The size effects connected to the diversity measures (3.2) and (3.3) cancel each other in the Dixon-Koehler diversity measure. \vfill\eject \bigskip \def\chapter{4. A Virtual Screening Method Based on Structural Searches} \def\num{4.} \def\title{A Virtual Screening Method Based on Structural Searches} {\kapitel} In this work, a virtual library was set up and a set of screening programs written to do virtual screening in this library. All screening processes carried out in this work are based on molecular similarity. Similarity is tested by comparing bit-strings of the molecules. These bit-strings are generated from the atomic coordinates of the molecules in the library to make screening an efficient process. The output of the similarity screening is given in the form of a few simple similarity numbers. In the following, we describe the programs and their purposes.
\noindent {\bf Program "kol.f90":} For achieving a simple representation of all molecules before their conversion into bit-strings, a procedure was used in which all structures are formulated in terms of functional groups such as aromatic rings, amide, ester, acid, alcohol, or halide group. The original atom information is in this way compressed to functional group information although the positions of some hetero atoms are kept because they correspond to the center of a functional group. The results of the conversion into functional groups is stored in a new library keeping the old one. Information about the mass center for each molecule is also stored. The settings for the program kol.f90 are stored in the file "kolfil" (see Appendix). \noindent {\bf Program "repl.f90":} More structural features can be added by analysis of bioisosteric relationships. The input files are taken from the new library and the output is written as complementary information at the end of the previous input files. Because most bioisosteric features are highly special, much work must be put into the settings kept in the file "bioisosteri" (belonging to this program) to make it meaningful to run the program at all. This is the reason why the current investigation was done without including bioisosteric relationships. \noindent {\bf Program "bearbprogram4.f90":} This program will produce a third library from the second one generated in program "kol.f90". This third library comprises the same molecules as the previous ones, but now all molecules are represented as bit-strings. Only the lowest energy conformation of each molecule is taken into account. The structural feature information is redefined in form of group numbers chosen according to the intrafeature similarity (defined by the user). Some structural features will be represented by more than one group number. \noindent All programs are written in Fortran 90. Some additional programs are written in JavaScript. 
The JavaScript programs are listed in Appendix 10.3. \bigskip \noindent {\bf 4.1 The Approach used in this Work} \smallskip The actual method uses different levels of modeling to reformulate the ligand representations. In the first step, atoms and their coordinates are grouped for a molecule and condensed to structural features connected with (in most cases) new coordinates. It is not necessary to extract structural feature coordinates from atom coordinates, because the initial representation is not determined by any criterion other than being part of a sequential approach. Finding ways to reduce the impact of stochastic influences will lead to considerable improvements of the screening process. There are two types of stochastic influences. A specific bit position can correspond to more than one measurement. If two non-identical measurements create a hit through identical bits, this is due to one kind of stochastic influence. Another influence can arise when the distance ranges and groups are set. The distance ranges are not continuous. Also, the priorities are partially arbitrary. Here, the term stochastic influence is used in a general sense and does not distinguish between the two types. It is even possible to choose an initial representation in such a way that the focus is on those structural features that are considered important for a specific biological activity. One can think of potential or vibrational maps, although the latter are not important in themselves but only as a complement to other choices of description. This entire approach is designed to make the comparison as fast as possible. Preparation of a suitable description for each molecule may take time, but this will always be a one-time procedure. The molecule can then be compared with any other molecule, and this without doing the conversion once again. Hence, the conversion should be done in the best way possible.
The larger the number of parameters per compound is, the more stochastic influences accompany the comparison. Therefore it is important to reduce the number of trivial features (like $-{\rm CH}$ and $-{\rm CH}_2$), which do not present the molecule in a characteristic way. With the lock and key model in mind it seems logical to assume that features around the imagined contours of each compound are of special importance for the molecular description [29]. In order to emphasize the molecular characteristics which according to the lock-and-key principle should be of expected importance, one can give priority to measurements that are more likely surface-related. In this work, the central point between each structural feature pair is set into relation to the center of mass of the molecule, and the angle between the line connecting the two structural features and the line connecting their midpoint with the center of mass is calculated. The resulting distances and angles can be examined and set into relationship to the values connected with other structural feature pairs. A fast way to give priority to structural features connected to molecular shape is to select a limited number of pairs spanning the molecular dimensions and leading to the largest interfeature distances. In this way, each molecule is represented by the structural feature pairs that are far from each other or far from the center of mass. A more rigorous way to consider molecular shape features and molecular volume would be to trace out a Connolly surface [46] of the ligand, to represent this with a single sphere of radius $r$, and then to count only features which are connected to just this single radius. This is of course more costly (if done a million times), but once the calculations have been made, the usual screening can be performed in the common way.
\bigskip \noindent {\bf 4.1.1 Conversion of structural information:} The conversion of structural information has the explicit aim to convert a series of atomic parameters (type of atom and position coordinates) for one single molecule to a set of logical characters, which specify "functional group types" or other features, separated by a defined distance. Agreement between two molecules with regard to a special feature is called a "hit", i.e.\ both molecules have in their bit-strings a 1 at the corresponding string position. There are other definitions of hits, for example that more than one structural feature specified in the search for a special drug activity must agree; however, this will not be considered in this work. The number of logical characters in the set of individual features is predefined and constant during all comparisons. It is based on how big the average molecule in the library is. If single molecules notably exceed the average molecule in size, they can be virtually decomposed into overlapping pieces and treated piece by piece. Of course, all distance characterization involving different pieces is then discarded. The ideal probability for an average occurrence within the set must not be too high. This corresponds in the extreme case to a library where all molecules are represented by only one common structural feature, thus leading only to hits (or a 100\% occurrence). This can become reality when the intended size for a molecule is gradually exceeded so that the probability increases that parts of the molecule agree with the structural features of the reference molecule. Hence, the number of structural features must relate to the size of the average molecule in a reasonable way. This ratio can be expressed in the way that the ideal probability for an occurrence is 50\%. It is possible to work with lower probabilities, which leads to additional cost for memory space and resources.
\bigskip \noindent {\bf 4.1.2 Similarity:} Because similarity is not limited to a special definition, there are many ways of implementing similarity criteria. Coincidence lists for ligands and reference molecule, calculated from a set of distance intervals and group parameters, and extracted by program5mass.f90 show the possibilities of defining structural similarity. It is also possible to use a correlation between structural similarity and functional similarity according to the general observation that structural similarity leads to similar chemical behavior. The question is then whether there is a need for this kind of extension. Many extensions are possible; however, in the end it only matters whether the cost for additional similarity tests is justified by the gains achieved in this way. The global similarity of two ligands will always be related to and dependent on the small-scale similarity, i.e.\ the similarity of local structural features. For a given program, the small-scale similarity will be defined by the program writer and/or the user. In the present work, for example, functional groups and structural features are already categorized in the file "intervalspec" (see Appendix). By defining bioisosteric features and smaller distance intervals more and more choices can be made. Many questions arise in this connection. Should one consider, for example, a phenyl group with a chlorine as the same structural feature as a phenyl group with an ${\rm NO}_2$ substituent because both fall in the category of aromatic rings and both have about the same volume properties? Or should one consider the differences in the substituent electronegativities and choose two different structural parameters for the description of the two phenyl rings? Clearly, the answer depends on the receptor properties, for example whether docking of the ligand is determined by just volume or also electrostatic properties.
In order to select the best possible similarity criteria, one can think of different concepts to optimize the preferred algorithm and the parameter settings. With a list of preliminary coincidence summaries available, the ambition can be to search for extreme values. If two obviously dissimilar ligands nevertheless show a high number of coincidences one can analyze the nature of specific structural features that lead to unwanted similarity coincidences and try to eliminate them. One can also focus on similar molecules with a low number of coincidences. In any case it is definitely easy to detect these extremes among average coincidences. In this connection, the probability of finding dissimilar ligands that lead to many coincidences is larger than the probability of finding similar molecules that lead to just a few coincidences. This has partly to do with the fact that similarity in the wanted form takes place less often than dissimilarity, but it is also in line with the fact that identical molecules will always get 100\% of all possible matches, independent of structural feature classifications and chosen distance intervals. Identical parts of two molecules will also always guarantee the maximum sum of coincidence points. It is just a matter of giving these coincidences enough weight in the comparison. The average sum of coincidences for all ligand pairs and an unspecified set of distance intervals can be about 63\%. This is astonishing because randomly given coincidences would give a value of 50\%. So either some occurrences are generally more frequent whereas others are less frequent or there are not equally many 1:s and 0:s for the collection of all lists. Probably, these two phenomena act together. Chemical occurrences are not independent of each other for each ligand and it is difficult to optimize the parameter settings for all different molecule sizes present in the virtual library.
If there are in total for all bit-strings representing ligands more 0:s than 1:s a molecule with heavier weight will look more similar to a larger number of other molecules than a smaller molecule, which has a bitstring with many 1:s. Other aspects come in when biological activity is considered. Since biological activity is modeled in terms of pharmacophores, a different weighting of typical structural features is really needed when switching from one biological aspect to another. Thus, biological similarity is reflected by specific constellations of structural features, which has to be considered in similarity searches aimed at a specific biological activity. If one wants to overcome this problem one has to define an average biological similarity based on a large set of drugs of well-known biological activity. Independent of the choice of similarity descriptors, there will always be stochastic influences, which diminish the accuracy where this influence increases the smaller the ligand representation in terms of similarity descriptors is made. A simple representation of the molecule before describing it in a bitstring of 1:s and 0:s is likely to reduce the impact of stochastic influences. A well-balanced primary representation of the ligands in terms of structural features and their coordinates is a better basis for a more detailed binary representations and reduces stochastic influences. One procedure, which can reduce malicious tendencies depending on molecular size, can be the stochastic addition of 1:s until a prefixed sum limit is reached. This has the drawback that small molecules will get less coincidence points than large molecules when compared with similar molecules. On the other hand, the result will be less (in the limit, in-)dependent of the size of the reference molecule. \bigskip \noindent {\bf 4.1.3 Fuzzy Similarity:} It is interesting to see the similarity approach in relation to the general concept of fuzzy searches. 
[47] The results of high-resolution measurements and exact calculations are not always in line with the concept of absolute accuracy. In the frame of the present work, one example is the choice of distance interval parameters. If one uses overlapping intervals, the result will be less restrained than if non-overlapping intervals are used. But in order to equalize as many distance relations as possible, the solution with overlapping intervals is recommended. Fuzzy systems are often designed with emphasis on the results, whereas exact systems are designed to reflect a given model or concept. Artificial intelligence (AI) with neural networks [48] is one example of a result-focused concept. During the present work another idea was investigated. In a loop values can be processed and delivered by connection, atom by atom. Such a function is simple to achieve with a connection table already available. The atoms are then used as analogues to the interactive cells in a two-dimensional cell map. Two matrices are used. One describes all atom values before each loop, whereas the other one describes the output. The loop is set to run a couple of times; then the resulting values for the different atoms can be used to describe the molecular entity. In this case the fuzziness is about giving each atom some environmental properties. Now an atom is not just an atom, but a representation of that atom type in the actual environment. It is like smoothing a painting with the finger. The gain when incorporating fuzziness becomes clear. A solution like this lets us detect chiral carbons without lots of combinatorial, systematic routines. When looking for an optimal set of parameters to retrieve useful results from similarity searches, this is one way in which continuous adjustments according to output relevance are possible.
For each search a number of preferred list relations between different molecules in a resulting output collection can be defined as an adjustment tool for the exact parameters. Because of that the approach is probably the method of choice when the aim is to develop systems that support an arbitrary form of similarity relationship. \bigskip \noindent {\bf 4.2 Generation of the Virtual Library} \smallskip The principal goal of this work is to convert molecular geometries via feature-based representations into a binary code. In this respect, the program kol.f90 is the main tool for translating atom-based molecular representations into structural feature-based representations. The file "kolfil" contains the code on which the conversion is based. The object to which program kol.f90 is applied is a virtual library which was compiled in this work. Since the virtual screening process carried out in this work will consider just structural similarity, for each molecule the molecular geometry in the form of Cartesian coordinates is stored in the virtual library. The selection of the molecules was based on three criteria: a) biochemical importance and biological activity; b) availability of geometry; c) molecular size (small molecules with fewer than 8 atoms and large molecules with more than 80 atoms were excluded to have molecules with typical ligand volumes). Some geometries of the molecules contained in the virtual library were computed with the program SYBYL [49], whereas others are taken from a library of the program "BALL AND STICK" [50]. If one uses geometries from different modeling sources (one cannot expect that for a large virtual library all molecular geometries are optimized as is the case in quantum chemistry; as a matter of fact most geometries are modeled from simple line representations using standard sets of bond lengths and bond angles), one and the same bond type may appear with rather different bond lengths, which leads to stochastic noise in the screening process.
In the present work this was avoided by using different encoding sets for molecules of different origin. In the following all molecules constituting the virtual library are listed. First the English name and then in addition the Swedish name or code name used in the primary source are given. In addition, it is clarified which encoding set ("original" stands for structures obtained by geometry optimization with SYBYL, "PDB set" for geometries obtained from the Ball\&Stick library; "mc" denotes a manual correction as explained in the text; the "Nicotine set" was designed to be able to correctly interpret structures similar to nicotine; "Nicotine" and "Nicotine2" were obtained with SYBYL and "nicotine" was taken from the Ball\&Stick library) is used for each molecule. \smallskip \settabs\+ XXX&XX2-chloro-4-hydroxy-6-amino-&XX2-chloro-4-hydroxy-6... XX&&XXPDB set \cr \+ 1) & Benzatropine & "bensatropin" & original \cr \+ 2) & Budesonide & "budesonid" & original \cr \+ 3) & Bupivacaine & "bupivakain" & original \cr \+ 4) & 2-Chloro-4-hydroxy-6-amino- & "2-chloro-4-hydroxy-6..." & PDB set \cr \+ &1,3,5-triazine & & \cr \+ 5)& 2-Hydroxyglutarate & "2-hydroxyglutarate" & PDB set \cr \+ 6)& 2-Chloroethanol & "2-chloroethanol" &PDB set \cr \+ 7) & 2-Hydroxy-4,6-diamino- & "2-chloro-4-hydroxy-6..." 
& PDB set \cr \+ &1,3,5-triazine & & \cr \+ 8) & Cocaine & "kokain" & original \cr \+ 9) & 4-Methylheptan-3-ol & "4-Methylheptan-3-ol"& PDB set \cr \+ 10)& Lofepramine &"lofepramin"& original \cr \+ 11) & Amphetamine &"Amphetamine"& PDB set \cr \+ 12)& Acetophenone & "ar0016" & PDB set \cr \+ 13)& Meclizine & "meklozin" &original \cr \+ 14)& Methadone & "metadon" & original \cr \+ 15)& Morphine & "morfin" & Nicotine set \cr \+ 16)& Noscapine & "noskapin" & original \cr \+ 17)& Oxotremorine & "oxotremorin" & original \cr \+ 18)& Atrazine & "atrazine" & PDB set \cr \+ 19) & Benzyl alcohol & "Benalcohol" & PDB set \cr \+ 20) & Procaine & "prokain" & original \cr \+ 21) & Simanneal & "simanneal" & original \cr \+ 22) & Terfenadine & "terfenadin" & original \cr \+ 23) & Tiazotienol & "tiazotienol" & original \cr \+ 24) & Trimipramine & "trimipramin" & original \cr \+ 25) & Meropenem & "meropenem" & original \cr \+ 26) & Benzaldehyde & "Benzaldehyde" & PDB set \cr \+ 27) & 2,4-Dihydroxy-6-amino- & "2,4-dihydroxy-6-am..."& PDB set \cr \+ &1,3,5-triazine & & \cr \+ 28) & 2-Chloro-4,6-diamino- & "2-chloro-4-hydroxy-6..." 
& PDB set \cr \+ &1,3,5-triazine & & \cr \+ 29) & Spirodioxaundecane & "spirodioxaundecane" & PDB set \cr \+ 30) & Santene & "santene" & PDB set \cr \+ 31) & S-7-Methyl-3-nonanone & "S-6-Methyl" & PDB set \cr \+ 32) & R-Sulcatol & "R-Sulcatol" & PDB set \cr \+ 33) & Phenol & "Phenol" & PDB set \cr \+ 34) & R-Seudenol & "R-Seudenol" & PDB set \cr \+ 35) & Piperonal &" Piperonal" & PDB set \cr \+ 36) & Pentachlorophenol & "Pentachlorophenol" & PDB set \cr \+ 37) & Nicotine & "Nicotine" & original + mc \cr \+ 38) & Nicotine & "Nicotine2 "& original + mc \cr \+ 39) & Nicotine & "nicotine" & original + mc \cr \+ 40) & myo-Inositol & "myo-inositol" & PDB set \cr \+ 41) & Mescaline & "Mescaline" & PDB set \cr \+ 42) & m-Cresol & "m-Cresol" & PDB set \cr \+ 43) & Linoleate & "linoleate" & PDB set \cr \+ 44) & Isopropylammelide & "isopropylammelide" & PDB set \cr \+ 45)& Ibuprofen & "Ibuprofen" & PDB set \cr \+ 46) & Frontalin & "Frontalin" & PDB set \cr \+ 47) & exo-Brevicomin & "exo-Brevicomin" & PDB set \cr \+ 48) & Epinephrine & "Epinaphrine" & PDB set \cr \+ 49) & Dopamine & "Dopamine" & PDB set \cr \+ 50) & d-3-Hydroxyproline & "D-3-hydroxyproline" & PDB set \cr \+ 51) & d-Tartrate & "D-tartarate" & PDB set \cr \+ 52) & Coniine & " Coniine" & PDB set \cr \+ 53) & Coniine & "coniine" & PDB set \cr \+ 54) & Disulfiram & "Disulfiram" & PDB set \cr \+ 55) & $\beta$-D-Galactose & "beta-D-Gal" & PDB set \cr \+ 56) & Benzoic acid & "Benzoic-acid" & PDB set \cr \smallskip \noindent In the set of training molecules, structures such as nicotine or coniine were included on purpose several times although they differ only by their orientation in space or by minor conformational differences. These structures provided suitable examples for testing the procedure worked out in this thesis. The latter must recognize of course if there are identical structures or structures that differ by some minor conformational changes. 
\noindent Nicotine has a pyridine ring, which is recognized by the program first as a ring with double bonds and second as an aromatic ring. By removing the double bonds the more important feature 62 (aromatic six-membered ring) remains. In general, a priority list can be applied to give preference to important features such as aromatic rings. In this work, ligands have been selected in such a way that the need for such a list was no longer given. Probably, this could have been handled in a more elegant way, but because of the limited time available for this work, the manual correction was considered as an acceptable alternative. \noindent For the encoding of the 56 structures given above the following list of structural features was set up. \smallskip \settabs\+ Code numbeXXX & Structural feature elementXXX \cr \+ Code number & Structural feature element \hfill\cr \+1 \hfill & carbonyl\hfill\cr \+2 \hfill & not in use \hfill\cr \+3 \hfill & ether \hfill\cr \+4 \hfill & alcohol \hfill\cr \+5 \hfill & ester \hfill\cr \+6 \hfill & carboxylic acid\hfill\cr \+7 \hfill & primary amine \hfill\cr \+8 \hfill & secondary amine \hfill\cr \+9 & tertiary amine \cr \+10 & quaternary amine \cr \+11 & primary amide \cr \+12 & secondary amide \cr \+13 & tertiary amide \cr \+14 & ~~~not in use \cr \+15 & primary imine \cr \+16 & secondary imine \cr \+17 & amide of the type R(O=)C-N=C-R \cr \+18, 19 & ~~~not in use \cr \+20 & doubly bonded atom \cr \+21 & triply bonded atom \cr \+22-29 & ~~~not in use \cr \+30 & non hydrogenated carbon \cr \+31-39 & ~~~not in use \cr \+40 & divalent S \cr \+41 & SO group \cr \+42 & SO2 group \cr \+43, 44 & ~~~not in use \cr \+45 & F atom \cr \+46 & Cl atom \cr \+47 & Br atom \cr \+48 & I atom \cr \+49 & ~~~not in use \cr \+50 & CH3 group \cr \+51 & C neighbour for CH3 \cr \+52-60 & ~~~not in use \cr \+61 & aromatic 5-membered ring center \cr \+62 & aromatic 6-membered ring center \cr \+63 & ~~~not in use \cr \+64 & center of a 
4-membered ring \cr \+65 & center of a 5-membered ring \cr \+66 & center of a 6-membered ring \cr \+67 & center of a 7-membered ring \cr \smallskip \noindent It has also to be mentioned that a series of molecules had to be excluded from the training library because of encoding problems. The molecules which were excluded from the scope because of incorrect interpretation are: \smallskip {\obeylines \noindent 1) chlordecone (incorrectly interpreted as ring complex); \noindent 2) cinnamylalcohole (long double bond); \noindent 3) caffeine (two amides in a six-membered ring); \noindent 4) levamizol (one five-membered ring (feature 65: 5-ring center) is missing, along with one sulfur); \noindent 5) karbamazepin (the amide and amine N atom could not be distinguished); \noindent 6) annotinine (incorrectly interpreted as ring complex); \noindent 7) 2-prophylthiethane-R (the 4-membered ring exceeds the range of distance intervals because of the sulfur atom) } \bigskip \noindent {\bf 4.3 Description of the Program kol.f90} \smallskip The input data for the program "kol.f90" consists of 4 independent variables for each atom in a molecule, namely the atomic number and the x, y, and z coordinates. The input format is as follows: \noindent AN x y z \noindent In addition each atom gets an ID number which corresponds to its order in the input sequence: the first atom gets a 1, the second a 2, etc. This additional information is later needed to identify the atoms: {\obeylines \noindent ID AN x y z 1 2 3 . } \noindent All information is stored in two fields, the integer vector "atomsort" of length N, which contains the atomic number for each of the N atoms and the (3,N) matrix "posxyz" containing the x,y,z coordinates of each atom. There is another matrix called "platsixyz" corresponding to "posxyz", which has the following form: {\obeylines px py pz x y z 1 \ \ 1 \ \ 1 \ \ 2 \ \ 2 \ \ 2 \ \ 3 \ \ 3 \ \ 3 \ \ . 
} \noindent In this setup, px is connected to and sorted along with x, py with y, and pz with z. \noindent By sorting the elements of column vector px according to their position in x direction, the elements of column vector py according to their position in y direction, and those of vector pz according to their position in z direction independently, the information to which atom a certain px, py, or pz value belongs is lost, unless each px, py and pz is associated with the appropriate ID reference. Taking this into account, each set of position variables x,y,z is sorted according to all interatomic distances from large to small. All x-values are sorted placing the highest x-value of an atom first in the x-column, the second highest into position 2 and so on. All y-values are separately sorted, again with the highest y-value placed first in the y-column. The corresponding ordering is done for the z-values. The two-dimensional integer field platsixyz of dimension (3,N) takes care that each x, y, or z value is associated with the atom ID it belongs to. If, for example the x value of the 5th atom with ID=5 winds up after the sorting in position 10, then element platsixyz(1,10) = 5, if the y value of atom 5 ends up in position 20, then element platsixyz(2,20) = 5, and if the z value of atom 5 is the smallest found, then element platsixyz(3,1) = 5, etc. So the correct expression to retrieve the y value for atom 5 after the sorting is posxyz(2,platsixyz(2,5)). \noindent As it turned out, this approach may not be the most efficient for handling the actual problems of this work. However, the advantage is that sorting operations have replaced more expensive algebraic calculations which would be for example necessary if one determined the connectivity via the distance matrix, where for each pair of atoms the Euclidean distance (square root of a sum of squares) has to be calculated. 
With the procedure developed in this thesis it is possible to determine the connectivity between the atoms in the molecule by choosing an ID and checking close values in x, y, z of other atoms. By proceeding stepwise in a sorted matrix, there are a lot of values that can be skipped. If a value exceeds a certain threshold (e.g. 1.44 {\AA} for aromatic CC bonds) in either x, y or z direction, no further steps in the corresponding direction will be needed. \noindent Such a search algorithm could be of particular interest in molecular dynamics simulations. To sort a vector which is already pre-sorted takes less computer power than sorting a completely disordered vector. During successive loops within an optimization, the values x, y, z for other atoms close to a single, targeted atom are many times just refined. This facilitates the sorting approach. \noindent In program kol.f90, no distance matrix is calculated. However, program kol.f90 checks all distances between non-hydrogen atoms parallel with possible secondary connections. Two atoms that are covalently bonded to a common neighbour atom are viewed as having a secondary connection to each other. In the search for atom bonds and secondary connections, first all atoms are considered as base atoms. For each base atom, neighbors in the range of atom-atom bonds and secondary connections are found by searching in the corresponding pre-sorted x, y and z vectors. Each pair of secondary neighbors is considered as an angle point of a triangle which implies the possibility of a ring, where the base atom is the third angle point. Atom candidates for the latter are then compared among each other with respect to distance. A constellation of three atoms and three appropriate distances (which does not exclude the possibility of having the atoms incorporated in a common ring) gives a potential center defined by the average xa, ya, za of all involved positions. 
The number of atoms involved in the ring can be calculated from the distance between the two atoms related to the base atom by the secondary connectivity. \noindent Potential centers, which are closely located to atom neighbors of the respective base atom from which they are derived or to other potential centers that originate from the same base atom, are deleted. Definite ring centers are obtained through a confirmation process where in space closely related centers are flocked in groups. Flocking means that several units with respect to a certain aspect float together and either are treated as a new unit, or at least become less easy to distinguish as individual units. If the number of potential centers in a group is high enough, a definite center is taken from the average x, y, z values for all participating potential centers. \noindent The flocking process is performed in a similar fashion as if the atoms were related to each other in space, accordingly by a sort process and stepwise consideration of different dimensions. \noindent The direct connectivity is used when information about functional relations between atoms are stored and transferred to determine functional complexity, e.g. the characterization of functional groups. An ester is for instance determined by detection of the typical C=O distance and setting of the information to the -O- atom from C in combination with the own base atom setting for -O-. All features except rings are defined with respect to measurements which are given by direct connectivity, i.e. covalent bonds. But to achieve functional groups including more than three atoms and two covalent bonds, it is necessary to rely on some kind of transfer of information above the limits of a base atom and its closest neighbors. This applies to features like esters vs carbonyles and also primary vs secondary amides, and to more complex groups in general. 
However, the coverage of complex groups in general is difficult in the framework of an F90 program, and, therefore would be beyond the scope of this thesis. (The JavaScript-programs in the appendix are, in contrast, easily adaptable to new, complex groups of any reasonable size.) \noindent All typical structural features of a given molecule, such as rings, amines, amides, ethers and so on used in this work, are listed in the output file. The format of the output files is similar to that of the input file: \noindent F x y z \smallskip \noindent {\bf 4.3.1 Unavailable Structural Features: } Program kol.f90 describes molecules by a number of structural features, among which $-CH_3$ (feature no. 50) can be one. If $-CH_3$ is attached to $-C-$ (because hydrogens are ignored when the molecule is modeled) this second carbon rather than the methyl C is the center of the methyl structural feature. Longer carbon chains will not change this labeling. The label will stay on carbon two. However, moving of the label from carbon one to its neighbour will not be done if the second carbon carries already another label. Also if the other label is set later (depending on which order different atom IDs initially are listed), feature no. 50 will be returned to the methyl carbon. \noindent Some tests were done in EXCEL to decide whether the methyl labeling could be handled in a more accurate but still simple way. However, these tests showed that only with a connectivity matrix, the problem can be solved in an easy way. Therefore, an alternative methyl labeling is available in a program written in JavaScript. \noindent Another interesting chemical property is chirality. Chiral carbons can be identified by a process that fully characterizes a substituent. 
One starts with the first atom of the substituent directly connected to the C atom and then, neighbour by neighbour, assigns all atoms of the substituent until the substituent is fully characterized, the result of which would be a structural feature matrix. The matrices of the four substituents are then compared to determine any equivalences. Of course this procedure can be made simpler by determining a connectivity matrix. In this thesis, this was done in a JavaScript program and the configuration problem was solved in a similar fashion utilizing character strings. \bigskip \noindent {\bf 4.4 Program program4.f90} \smallskip This program loops through all listed structural features. Each structural feature is related to another by distance. This distance is one of the parameters to be encoded into the bit-string. Every molecule has a center of mass. The connection line between two structural features (e.g., a ring center and an amide group) is related to this center of mass by drawing a line from the midpoint of the interfeature connection line to the center of mass. \smallskip \centerline {\example} \smallskip \noindent {\bf Figure 4.1 } A molecular example demonstrating the encoding of structural features \smallskip \noindent The two lines enclose two pairs of angles where the value of the smaller angle is another parameter to be encoded in the bit-string together with the two distances. Hence, structural feature 1, structural feature 2, the two distances, and the angle lead to the position parameters in the position table, specified in the output file "molecule.fin" where for "molecule" the ligand name is used. \noindent For N functional groups, there will be $H = [N^2+N]/ 2$ combinations of functional groups and with M different distance intervals this leads to $M [N^2+N]/ 2$ position parameters. 
A single position (bp') for the combination of two groups p and q with $0<p\le q\le N$, where p and q are mirrors of the listing of the respective group from 1 to N, is calculated with the formula: $$ bp' = {{N^2+N}\over 2}-{{(N-p+1)^2+N-p+1}\over 2}+q-p+1 \eqno(4.1) $$ This formula can be mathematically simplified, but fulfills its function in the present form of the program. In order to differentiate for various angles and distances, the following value is added to bp': $$ bp = bp' + a(M^2)H + jMH + iH \eqno(4.2) $$ where $a$ is the interval index for the angle (not the number of intervals), $i$ the interval index for the interfeature distance 1, and $j$ the interval index for distance 2 from the center of mass to the midpoint of distance 1 (order is arbitrary). For each position, there are two possible settings, 1 and 0. If for a given ligand, a combination of two groups, intervals, and one angle, is actualized by two features belonging to the two groups and one distance within the interval, the setting for the actual position is changed from 0 to 1. Identical combinations of groups and intervals do not change this setting. That means that if an occurrence is marked by a 1 on a certain position, another occurrence on the same position will not change this mark from 1 to 0 and neither is something else changed in the bit-string representation. \smallskip \noindent {\bf 4.5 Program program5mass.f90} \smallskip The task solved in program5mass.f90 is to sum up all coincidences for each pair of ligands. The input for program5mass.f90 is the output of program4.f90, but two files are now considered at a time. Each file consists of a bit-string identifying $AM^2H$ positions, where A is the number of angle intervals. For each position in the two files, there will be 4 possible outcomes if boolean algebra is applied to the two files. {\obeylines \noindent 1 .AND. 1 $\rightarrow$ 1 \noindent 0 .AND. 0 $\rightarrow$ 0 \noindent 1 .AND. 0 $\rightarrow$ 0 \noindent 0 .AND. 
1 $\rightarrow$ 0} \noindent This setting is used to stress the biological activity, because only if a structural feature connected with bioactivity is found for both ligand and reference molecule, the biological activity of the ligand will be possible. If however similarity as such is stressed, then a second setting will be more appropriate: {\obeylines \noindent 1 .AND. 1 $\rightarrow$ 1 \noindent 0 .AND. 0 $\rightarrow$ 1 \noindent 1 .AND. 0 $\rightarrow$ 0 \noindent 0 .AND. 1 $\rightarrow$ 0 } \noindent The latter settings must be used if for instance only one distance and two structural features are counted. The used settings are fine when looking for a special part of a pattern and when all occurrences are describing a 50\% probability. \vfill \eject \noindent {\bf 4.6 Intervals and Distances} \smallskip When deciding on the distance intervals for the program kol.f90, starting point is a table of valence radii (given in {\AA}). [] In the table, any atom which is connected by a single bond is marked by (s), atoms connected by a double bond are marked by (d), and atoms connected by triple bonds are marked by (t). The length of a covalent bond is given by the sum of the corresponding radii values. The table looks as follows [1]:\smallskip {\obeylines \noindent H 0.37 \noindent C(s) 0.77; C(d) 0.67; C(t) 0.60 \noindent N(s) 0.74; N(d) 0.65 \noindent O(s) 0.66; O(d) 0.57 \noindent F 0.64 \noindent S(s) 1.04; S(d) 0.95 \noindent Cl 0.99 } \noindent Example bond lengths from molecules, energetically minimized with the program Sybyl, were taken as basis for expected variations. There are cases when this approach does not lead to reasonable values. In the ligand meropenem, there is a 4-membered ring with nitrogen as a neighbour to a double-bonded carbon in the next ring. The bond between the two atoms is shorter than the upper limit for double bonds, probably due to the unusually high ring strain. 
Resonance phenomena, except for aromatic ring systems, are also not included in the algorithm. The variations depending on the type of atom may be included in the overall structural variations. Intervals that cover secondary bond relation distances are of course less easy to set than primary bond relation intervals. In program kol.f90, all ring features are made up by considering secondary bond distances. To handle the problem with secondary bond distance variations, rings are first suggested from different data, which is brought together and confirmed by redundancy. A structure like ecgonine \bigskip \centerline {\ecgonine} \noindent {\bf Figure 4.2 } Structure of ecgonine. \smallskip \noindent with one 7-membered ring and a bridge can be described in the form of three separated rings. The S-configured carbon (carrying the OH substituent) will be the base atom for a 3-membered ring where all atoms are part of both the 6-membered and the 7-membered ring. This makes it impossible to determine both ring sizes by the single distance between the two secondary atoms. However, each carbon is tested as base atom. The 7-membered ring will be confirmed by 6 approximately good distances and the 6-membered ring by 3. A six-membered ring can be confirmed only by 3 independent distances because the three corresponding triangles possess equal side lengths on the average. \noindent The settings in the file "intervalspec" are partially based on statistical data, extracted with the help program "program2.f90", which sorts all distances into small intervals (0-1 {\AA}, 1-2 {\AA}, 2-3 {\AA}, etc.) and displays them in a statistical frequency program. The program is not of importance for this work and therefore not listed. Each interval is displayed along with the average number of feature pairs which are separated by distances corresponding to the interval. Rough calculations from this data give an idea of good interval settings to get an acceptable hit distribution. 
The best way to refine these settings is to run the programs that use them, to make judgments about the outcome, and then to compensate for unwanted effects and tendencies by replacing bad settings. One example can be the correction made after the first run where only raw distances and features, without mass point reference, are taken into account. The distances are given in the following intervals: {\obeylines \parindent=0pt 1) 0.00 - 2.39 {\AA} 2) 1.54 - 3.93 {\AA} 3) 2.39 - 4.89 {\AA} 4) 3.93 - 6.65 {\AA} 5) 4.89 - 7.93 {\AA} 6) 6.65 - 10.05 {\AA} 7) 7.93 - 12.63 {\AA} 8) 10.05 - 100.00 {\AA} } \noindent Based on these intervals, an average output file for the ligands considered in this work was without any 1:s for interval 8 (10 to 100 {\AA}) and of course heavily populated with 1:s for the lower intervals. By multiplying all interval values by 0.75, a better performance was obtained. To improve the entire algorithm, the center of mass point was introduced for each structure. Each pair of features is still separated by a distance, but the line defined by the positions of two structural features also has a direction. The midpoint M between the positions of two structural features is related to the center of mass and defines a new line with a direction (Figure 4.1). The angle between the two lines is calculated and the distance between the center of mass and the midpoint M is determined. In this way, there are two distances and one angle, which can be used to describe each structural feature pair. The angles are on the average close to 62 deg. as given by the examples studied. Therefore, the intervals must roughly be set to divide the empirical occurrences for the test set in three equal parts. With three intervals this can be done as follows: {\obeylines \parindent=0pt 1) 0 - 62 deg 2) 39 - 75 deg 3) 58 - 90 deg} \noindent With two distances and one angle there is a need for reducing the number of intervals that result from distance and angle combinations. 
Otherwise, the resulting output file will be too large. \bigskip \noindent {\bf 4.7 The functional unit} \smallskip The objective of virtual screening is to be able to send a query (a bit-string, which represents a molecular structure or a pharmacophore) to the virtual library and to be able to get a list of matching structures from the library as a result. One can use a set of programmed functional units as a virtual library. The programming consists simply of getting bit-strings, which represent different molecules, stored at the functional unit. In the simplest case, each functional unit stores just one bit-string representing exactly one molecule. All functional units are called at the same time by sending a query to the virtual library. Only units which have matching bit-strings (molecules) stored will answer. Each of these functional units will return a single molecule (i.e. the ID which leads to this molecule). The definition of a match is that the bit-string (bit-string S), which represents the stored molecule, and the query bit-string (bit-string Q) have a HitF (Hit Frequency) above a defined limit: S + Q > limit. The criteria for such functional units are the following: {\noindent} 1) Every functional unit stores a bit-string. This bit-string is set with a setting operation. A coded signal, which results from a match of the bit-string and certain preformulated rules, makes the functional unit answer with its identity number. Thus, the way to call the unit is to provide an input which has a HitF over a defined range (more than N bits). The signal then consists of two parts. Part one is a number of bit positions with 1:s. Part two is the minimum number of hits, which is required to give a match. {\noindent} 2) To set the bit-string on each functional unit, a signal in two parts is required. Part one is the ID of the functional unit. Part two is a bit-string or a sequence of positions for each 1 in the bit-string which the unit is set to represent. 
{\noindent} 3) To prevent responses from too many functional units on a query at the same time, the bitstring and the query, part two, can be assigned dummy positions, each with the probability of 50\% for being set. By raising the required HitF and at the same time setting more 1:s in the dummy sequence in the query, fewer matches can be provoked. \vfill \eject \bigskip \def\chapter{5. Results and Discussion} \headline={\vbox{ \line{\strut \hss\ninerm\chapter\hss} \hrule} \hss} \def\num{5.} \def\title{Results and Discussion} {\kapitel} In this Chapter, the results of this work are presented and discussed. The tables listing the results of the ligand comparison are given in the Appendices (Chapters 9 and 10; Tables 9.1 to 9.5). Tables 9.1, 9.2, 9.3 and 9.4 contain results of the similarity comparison and represent the essential outcome of this work. Table 9.1 contains special examples for similarity, dissimilarity, and clustering. Tables 9.2, 9.3, and 9.4 give a general comparison in terms of hit frequencies. The results in Table 9.2 are based on bit-strings with 10 occurrences in each (according to the priority listings beginning with high priorities and with a final cut for just 10 occurrences). Table 9.3 is based on bit-strings with 20 occurrences and more (as is Table 9.1), while Table 9.4 takes 30 and more occurrences into account. The hypothetical probability for the listed results if all occurrences were distributed entirely at random over the bitstrings, and then the bitstrings compared with exactly that hit frequency as result, is provided as "P". Probabilities lower than $1\times10^{-9}$ can be considered as unreliable. The programs written in this thesis utilizing F90 are listed in Chapter 9, where Section 9.7 contains program kol.f90, 9.8 program program5mass.f90, 9.9 program bearbprogram5.f90, and 9.10 program refl.f90. Program kol.f90 converts molecules into a collection of features and a center of mass. 
Program bearbprogram4.f90 measures distances between features and compares them with intervals where the result is used to form a bit-string. Program program5mass.f90 compares bit-strings to determine a hit ratio (and some other outputs). Section 9.11 gives the definitions file for program kol.f90. The program will read this file to have its distances and properties set. Section 9.11 is separated into three parts, with markers at the beginning of each part that correlate to the table given in Chapter 4.2 entitled "Generation of the Virtual Library." All defined limits, ranges, distances and settings are given in Section 9.12. Chapter 9.13 lists the Help file "intervalspec", which is used by the program bearbprogram4.f90 to make bit-strings of distances and angles calculated from feature locations previously provided by program kol.f90. Chapter 9.15 provides as an example the analysis of ipratropbromid. It shows how rings can be detected via ring centers and then verified as rings. For the molecule ipratropbromid intermediate results are given generated by program kol.f90. Appendix B (Chapter 10) contains in Sections 10.1 and 10.2 the results of the structural feature calculations as obtained with the JavaScript program. Table 10.2 is analogous to Table 9.2. \bigskip {\bf 5.1 Results} \smallskip Information about biological similarity is reached via geometrical similarity. The existence of different functional groups in a given geometrical relationship, i.e. a pharmacophore, can then be associated with biological activity. In this way, geometrical similarity leads directly to biological similarity. In general, a Tanimoto coefficient (Tc) above 0.85 for structures, which are compared in a 2D approach, does often indicate similarity in bioactivity. The Tc value however has a particular deficiency in its dependence on molecular size. 
By sorting and listing of all feature pairs in form of a predetermined priority, after which only a number of pairs at the top of the list are counted (PEEL: prerequisite of edited and enriched list), all molecules above a certain size are represented by the same amount of bits so that the size effect is eliminated. Here, it is anticipated that in connection with the PEEL priority algorithm one can make use of measurements, retrieved by the earlier described method of bulk implying relationship dependence (BIRD) which at least gives two distances and one angle. The algorithm also uses multiple grouping (MG) to foresee different aspects of every feature. This increases the number of feature pairs, because one feature can be described as a member of several groups, which can of course lead to the risk that one places unwanted emphasis on certain features. To combine MG and PEEL can be particularly dangerous if the settings are not well chosen. A good example of how relationship in biological activity can be mirrored by geometrical similarity is seen for the compound pair bupivacaine and cocaine with a hit frequency (HitF) of 10 and a Tc of 0.33. It must be noted that bit-strings with fewer bits are generally related to each other by low Tc values. Using 20 1:s for each compound (PEEL 20), a Tc of 0.33 is high. It would be most effective if the comparison is carried out on special, parallel chips (using only boolean algebra) rather than on parallel CPU units. In this case, 3D-BIRD-PEEL-HitF would be a much better tool for similarity comparisons than 3D-BIRD-PEEL-Tc. With effective PEEL (where the compounds are large enough), every HitF is connected to a specific Tc. The information weight is identical. Another example of successfully determined biochemical similarity is given for morphine and methadone with a HitF of 12. Also epinephrine and dopamine are listed as similar. 
The nicotine geometries 37, 38, and 39 as well as the coniine geometries 52 and 53 must be described as being identical by the program. The nicotine geometries lead in the comparison only to 19 hits (i.e. 1:s), which does not speak for an effective PEEL. The analysis of big molecules seems to be more accurate than the evaluation of small molecules. Lofepramine and trimipramine, both tricyclic, antidepressive drugs, have as a pair a HitF of 8 and a Tc of 0.25. The pairs benzatropine and bupivacaine, and benzatropine and cocaine, lead to HitF of 12 and 14, respectively. This is in line with the biological activity of the compounds. Procaine is not identified as a cocaine analog, probably due to differences in size. \bigskip {\bf 5.2 Discussion} \smallskip Most of the compounds that are recognized to be similar by the algorithm are also confirmed by chemical judgment to be similar. This holds for the triazines in the test set as well as for structural pairs such as frontalin and exo-brevicomin or myo-inositol and $\beta$-D-galactose. Other results are more difficult to understand. Nicotine does not look like isopropylammelide or ibuprofen. Muscarine, santene, and disulfiram also look different. It is probably reasonable to reserve the feature of carbon without hydrogens just for aliphatic carbons. This setting does even give a HitF of 2 for cocaine and procaine. Still, the similarity between these two compounds does appear most obvious when size dependent PEEL is not used. The pre-PEEL priority list can be set up in a way that feature relevance is included. For instance, rings can be weighted with a factor different from the factor for amides, a factor which is then multiplied into the product that gives the priority. It is interesting to note that metabolic aspects may be implemented by correct weights for functional groups like esters and amides. To make this possible, the mass point reference must probably be modified somehow. 
A better method is probably multiple booking with respect to expected metabolites.
Then, the trick with different feature weights can be used exclusively to rank functional groups according to their biochemical relevance.
A quaternary amine is more important than a primary amine.
An aromatic ring is more important than CH$_3$ and an acid more important than iodide.
All these standards can be modified in view of special circumstances.
There are no limitations for more complicated settings to manage PEEL so that, for instance, a functional group will be heavily weighted if another group is present in the molecule, or if it is attached to an aromatic ring.
Some of these goals can also be reached by the introduction of additional structural features.
It is obvious that descriptive methods that reduce the number of points in space and compensate for this by an increasing number of features generally lead to better results.
This may imply a heavier use of bioisosteric analogs, which have not been used in this work although a sample program is added to this work.
In order to gain the advantage of such ``cleanness'' in a general fashion, electronegative centers can be described by the endpoints, as carbon chains already are.
One example where this could be done is seen in the compound disulfiram.
The two N atoms represent two obvious endpoints of electronegative centers.
For small molecules, symmetry descriptors would be of value.
On the other hand, small molecules are not especially interesting objects for virtual screening.
They are not likely to function as leads for drugs that can lead to a patent.
Extensions with regard to common features like azides, nitro groups, groups that contain phosphorus, and all simple combinations of ketone-amine-ketone, amine-ketone-amine, diamides, diesters, etc.\ are also highly recommended.
\bigskip {\bf 5.3 Interpretation} \smallskip
The program kol.f90 represents a carboxylic acid by the COOH carbon, so that both oxygen atoms are treated as equivalent.
Because of this, acids and acid anions can be treated in the same way.
To support a consistent algorithm, esters and amides are also centered at the carbon position.
Problems occur when constellations such as carbonyl-amine-carbonyl have to be interpreted.
Correctly, this should be seen as an amide placed at the position of the N atom.
Caffeine is a molecule where these problems occur.
It is crucial for the correct modeling of rings that all distances that occur in the molecule are within the intervals which are set in the file ``kolfil''.
The settings used seem to hinder the correct interpretation of molecules that contain imidazolidine-like rings.
Distances that are too short could incorrectly lead to 4-membered ring features.
Aromatic rings that include an imino NH group are interpreted as rings with an -N= group.
Because H atoms are not considered, there is no way to distinguish between NH and N in an aromatic ring.
All distances involved are equal.
Complex ring systems like those in chlordecone and annonitine are not correctly described.
Here, the rings are so distorted that the distances fall outside the preferred settings.
This work has shown that conventional solutions based on a connectivity matrix are much more reliable than the solution used here.
A connectivity matrix, used for bonds, does not exclude an algorithm that is based on sorting, for instance during energy minimization calculations.
It allows H-bonds to be brought in as supporting information when other distances do not provide the complete picture.
Rings can be localized with 100\% certainty, even when they include a large number of atoms.
Then the statistics for secondary bond intervals in rings are not even necessary as supporting information.
With a connectivity matrix available, it is also easy to transfer relative information in all directions when different subgroups are considered.
\vfill\eject \bigskip \def\chapter{6.
Conclusions and Outlook} \def\num{6.} \def\title{Conclusions and Outlook} {\kapitel}

\noindent The following conclusions can be drawn from this work.

{\noindent} 1) Secondary bond distances and sorting procedures cannot work as a general alternative to connectivity tables.
Sorting procedures can, however, be used as a complement to connectivity tables.

{\noindent} 2) Comparisons based on bit-strings can be used to obtain useful information about the structural and biochemical similarity of molecules.

{\noindent} 3) 3D-BIRD-PEEL-HitF can be used as a standard to carry out these comparisons.
The use of a Tanimoto coefficient does not lead to advantages.

{\noindent} 4) For large molecules, the HitF can be used in combination with special memory chips to make parallel bit searches possible without parallel processor units.
As discussed in Section 4.7, an efficient virtual screening process will be based on a combination of hardware and software devices.
If every functional unit is physically located on a chip and serves as storage for one type of molecule, then a query can be sent to all functional units simultaneously, thus shortening turnaround times.
The details of such an approach have been described in this work (see Section 4.7).
\bigskip \bigskip \vfill\eject \bigskip
\def\chapter{7. Acknowledgments} \def\num{7.} \def\title{Acknowledgments} {\kapitel}

\noindent The following persons have been of help in finishing this work.

\noindent {\bf Dieter Cremer}, one of my supervisors, has helped me in selecting the topic, finding the appropriate literature, and starting this work.
He also helped me extensively with summarizing this work, correcting my drafts, and finalizing the thesis.

\noindent {\bf Elfi Kraka}, my second supervisor, has answered my questions concerning the basic setup, discussed results, and supported me with the programming and the documentation of results.
She has also been helpful in providing access to the necessary computer equipment.
\par\noindent {\bf J\"urgen Gr\"afenstein} has in detail and with great care answered my questions related to the computer system.
He has also provided answers to many other technical questions.

\noindent {\bf Mikael Filatov} has helped me with some programming-related problems.

\noindent {\bf Tell Tuttle} has answered several questions concerning SYBYL and \TeX.

\noindent I also want to thank everyone else in the Theoretical Chemistry group for providing a good working atmosphere.
\vfill\eject \bigskip
\def\chapter{8. References} \def\num{8.} \def\title{References} {\kapitel}
\item{[1]} Kraka, E.; Cremer, D. {\it Molecular Modelling and Computer-assisted Drug Design}, {\it Lecture Series}. Department of Theoretical Chemistry, G\"oteborg, {\bf 2002}.
\item{[2]} {\it Textbook of Drug Design and Discovery, 3.ed.}, Krogsgaard-Larsen, P.; Liljefors, T.; Madsen, U., Eds., Taylor and Francis, London, {\bf 2002}.
\item{[3]} {\it Guidebook on Molecular Modeling in Drug Design}, Cohen, N. C., Ed., Academic Press, New York, {\bf 1996}.
\item{[4]} {\it Computer-Assisted Lead Finding and Optimization}, van de Waterbeemd, H.; Testa, B.; Folkers, G., Eds., Wiley, New York, {\bf 1997}.
\item{[5]} {\it Molecular Modeling and Prediction of Bioactivity}, Gundertofte, K.; J{\o}rgensen, F.S., Eds., Kluwer Academic, New York, {\bf 2000}.
\item{[6]} Boyd, D.B. in {\it Encyclopedia of Computational Chemistry}, Eds. Allinger, N.L.; Clark, T.; Gasteiger, J.; Kollman, P.A.; Schaefer, H.F.; Schreiner, P.R., Wiley, New York, {\bf 1998}, p. 795, and references cited therein.
\item{[7]} {\it Molecular Modeling and Prediction of Bioactivity}, Gundertofte, K.; J{\o}rgensen, F. S., Eds., Kluwer Academic/Publishing, New York, {\bf 2000}.
\item{[8]} Walters, W.P.; Stahl, M.T.; Murcko, M.A. in {\it Encyclopedia of Computational Chemistry}, Eds. Allinger, N.L.; Clark, T.; Gasteiger, J.; Kollman, P.A.; Schaefer, H.F.; Schreiner, P.R., Wiley, New York, {\bf 1998}, p. 1225, and references cited therein.
\item{[9]} Bajorath, J.
{\it Virtual Screening in Drug Discovery: Methods, Expectations, and Reality}, Albany Molecular Research Inc, 2. University of Washington, USA, {\bf 2002}, p. 24.
\item{[10]} Bjoraker, J.; Blower, P.; Fligner, M.; Verducci, J. {\it A new association coefficient for molecular dissimilarity} in {\it The Second Joint Sheffield Conference on Chemoinformatics, Sheffield}, England, {\bf 2001}.
\item{[11]} Agrafiotis, D.K.; Xu, H. {\it Retrospect and prospect of Virtual Screening in Drug Discovery}, {\it Current Topics in Medicinal Chemistry}, {\bf 2002}, p. 1305.
\item{[12]} Joseph-McCarthy, D. {\it An overview of in silico design and screening}, {\it Toward efficient drug discovery}, Principal Scientist, Wyeth Research, USA, {\bf 2002}, p. 20.
\item{[13]} Li, J.; Perkins, T.D.J.; Sykes, R.A.; Waszkowycz, B. {\it Large-Scale Virtual Screening for Discovering Leads in the Postgenomic Era}, {\bf 1999}.
\item{[14]} Waszkowycz, B.; Perkins, T. D. J.; Sykes, R. A.; Li, J. {\it IBM Systems Journal} {\bf 2001}, {\it 40}, 360.
\item{[15]} Downs, G.M.; Willett, P. {\it Similarity Searching in Databases of Chemical Structures} in {\it Reviews in Computational Chemistry}, Eds. Boyd, D.B.; Lipkowitz, K.B. {\bf 1995}, {\it 7}, 1.
\item{[16]} Good, A.C.; Mason, J.S. {\it Three-Dimensional Structure Database Searches} in {\it Reviews in Computational Chemistry}, Eds. Boyd, D.B.; Lipkowitz, K.B. {\bf 1995}, {\it 7}, p. 67.
\item{[17]} Br\"ustle, M.; Beck, B.; Schindler, T.; King, W.; Mitchell, T.; Clark, T., {\it J. Med. Chem.}, {\bf 2002}, {\it 45}, 3345.
\item{[18]} Murcko, M.A.; Walters, A. W. P. {\it J. Med. Chem.}, {\bf 1998}, {\it 41}, 3314.
\item{[19]} Karelson, M.; Lobanov, V.S., {\it Acc. Chem. Res.} {\bf 1996}, {\it 96}, 1027.
\item{[20]} Arteca, G.A. in {\it Reviews in Computational Chemistry}, Eds. Boyd, D.B.; Lipkowitz, K.B. {\bf 1996}, {\it 9}, 191.
% similarity
\item{[21]} {\it Concepts and Applications of Molecular Similarity}, Johnson, M. A.; Maggiora, G.
M., Eds., Wiley, New York, {\bf 1990}.
\item{[22]} {\it Molecular Similarity in Drug Design}, Dean, P. M., Ed., Chapman and Hall, New York, {\bf 1995}.
\item{[23]} Oprea, T.I.; Waller, C.L. in {\it Reviews in Computational Chemistry}, Eds. Boyd, D.B.; Lipkowitz, K.B. {\bf 1997}, {\it 11}, 127.
\item{[24]} Kubinyi, H. in {\it Encyclopedia of Computational Chemistry}, Eds. Allinger, N.L.; Clark, T.; Gasteiger, J.; Kollman, P.A.; Schaefer, H.F.; Schreiner, P.R., Wiley, New York, {\bf 1998}, 2309, and references cited therein.
\item{[25]} Hansch, C., {\it Acc. Chem. Res.}, {\bf 1993}, {\it 26}, 147.
% Pharmacophores
\item{[26]} Milne, G. W. A. in {\it Encyclopedia of Computational Chemistry}, Eds. Allinger, N.L.; Clark, T.; Gasteiger, J.; Kollman, P.A.; Schaefer, H.F.; Schreiner, P.R., Wiley, New York, {\bf 1998}, 2046, and references cited therein.
\item{[27]} Liljefors, T.; Pettersson, I. in {\it Textbook of Drug Design and Discovery, 3.ed.}, Krogsgaard-Larsen, P.; Liljefors, T.; Madsen, U., Eds., Taylor and Francis, London, {\bf 2002}, p. 86.
%ComFA
\item{[28]} Kubinyi, H. in {\it Encyclopedia of Computational Chemistry}, Eds. Allinger, N.L.; Clark, T.; Gasteiger, J.; Kollman, P.A.; Schaefer, H.F.; Schreiner, P.R., Wiley, New York, {\bf 1998}, 448, and references cited therein.
\item{[29]} {\it Cambridge Crystallographic Data Center}, {\it Acta Cryst.} {\bf 1979}, {\it B35}, 2331.
%log p
\item{[30]} {\it Quant. Struct.-Act. Relat.}, {\bf 1996}, {\it 15}, 403, {\it and references cited therein}.
%PCA, PLS, binary QSAR
\item{[31]} Jolliffe, I.T., {\it Principal Component Analysis}, Springer Series in Statistics, {\bf 1986}.
\item{[32]} Hansch, C.; Leo, A. {\it Exploring QSAR: Fundamentals and Applications in Chemistry and Biology}, ACS Professional Reference Book, American Chemical Society, Washington, DC, {\bf 1995}.
\item{[33]} Labute, P., {\it Binary QSAR: A new method for the determination of quantitative structure activity relationships}, IUPAC. Symp.
Biocomput., {\bf 1986}, {\it 7}, 444.
%Chemometrics
\item{[34]} Otto, M. {\it Chemometrics}, Wiley-VCH, New York, {\bf 1999}.
%Surfnet
\item{[35]} Laskowski, R. A., {\it SURFNET: A program for visualizing molecular surfaces, cavities and intermolecular interactions}, {\it J. Mol. Graph.}, {\bf 1995}, {\it 13}, 323--330.
\item{[36]} Barnard, J.M.; Downs, G.M.; Willett, P. {\it Chemical similarity searching} in {\it J. Chem. Inf. Comput. Sci.}, {\bf 1998}, {\it 38}, 983.
%more Pharmacophore
\item{[37]} Leach, A. R. {\it Molecular Modelling}, Prentice Hall, New York, {\bf 1996}, chapter 10, p.~542.
%electrotopological maps
\item{[38]} Mezey, P. G. {\it Shape in Chemistry}, VCH Publishers, New York, {\bf 1993}.
%fingerprints
\item{[39]} Gray, N.A.B., {\it Computer-Assisted Structure Elucidation}, Wiley, New York, {\bf 1986}.
\item{[40]} Barnard, J.M.; Downs, G.M.; Willett, P. {\it Chemical similarity searching} in {\it J. Chem. Inf. Comput. Sci.}, {\bf 1998}, {\it 38}, 983.
\item{[41]} K\"obler, J.; Sch\"oning, U.; Tor\'an, J., {\it The Graph Isomorphism Problem: Its Structural Complexity}, Birkh\"auser Verlag, Boston, {\bf 1993}.
%Bit string based similarity
\item{[42]} {\it J. Chem. Inf. Comput. Sci.}, {\bf 1998}, {\it 38}, 379.
\item{[43]} {\it Molecular Similarity in Drug Design}, Dean, P. M., Ed., Chapman and Hall, London, {\bf 1995}.
%Tanimoto
\item{[44]} {\it see for example,} Burden, F. {\it J. Chem. Inf. Comput. Sci.}, {\bf 1989}, {\it 29}, 225.
%molecular diversity
\item{[45]} {\it Molecular Diversity}, Roy, K. Special Ed., Vol.~8, Issue 4, Kluwer, New York, {\bf 2004}.
%Connolly
\item{[46]} Brickmann, J.; Exner, T.; Keil, M.; Marh\"ofer, R.; Moeckel, G. in {\it Encyclopedia of Computational Chemistry}, Eds. Allinger, N.L.; Clark, T.; Gasteiger, J.; Kollman, P.A.; Schaefer, H.F.; Schreiner, P.R., Wiley, New York, {\bf 1998}, 1648, and references cited therein.
%fuzzy searches
\item{[47]} Tanaka, K.; Niimura, T.
{\it An Introduction to Fuzzy Logic for Practical Applications}, Prentice Hall, New York, {\bf 2003}.
%Neural networks
\item{[48]} Zupan, J.; Gasteiger, J. {\it Neural Networks for Chemists, an Introduction}, VCH, Weinheim, {\bf 1993}.
%SYBYL
\item{[49]} SYBYL Molecular Modelling Package, Version 6.2 and 6.22, Tripos Inc., St. Louis, USA, {\bf 2004}.
%Ball and Stick
\item{[50]} {\it Ball and Stick program, Version 3.7}, M\"uller, N., Linz, Austria, {\bf 1996}.
\vfill\eject \bigskip
\def\chapter{9. Appendix A} \headline={\vbox{ \line{\strut \hss\ninerm\chapter\hss} \hrule} \hss} \def\num{9.} \def\title{Appendix} {\kapitel} \bigskip
In Appendix A, the most important results obtained in this work are listed (Sections 9.1 to 9.6).
Also listed are the FTN90 programs that were written for this work (Sections 9.7 to 9.15).
In Section 9.1, which contains Table 9.1, important results of the similarity comparison are given for a set of molecules where the coincidence of structural features is set to a limit of max.\ 20 features.
The number of coincidences between each pair of bit-strings (called ``hits'') and the Tanimoto coefficient, Tc (still based on max.\ 20 1s), are listed.
Molecules with long names have been abbreviated to structure indices.
Sections 9.2 to 9.5 contain Tables 9.2--9.5, which give the results of similarity comparisons for increasing numbers of structural features considered in the comparison: Table 9.2: max.\ 10; Table 9.3: max.\ 20; Table 9.4: max.\ 30; Table 9.5: max.\ 1000.
The probabilities P for all coincidences are also listed in these tables.
Probabilities below $10^{-8}$ are not relevant except as a low marker reference because of the algorithm used.
Note that a higher Tc can sometimes correlate with a higher P.
The Tanimoto similarity measure ``boosts'' big molecules.
Also, a relatively low coincidence rating can depend on small molecules in the pair (with fewer features than the max.\ limit allows).
Only similarities with a Tc value $>$ 0.7 are listed in Tables 9.1 to 9.5. \vfill\eject \hoffset -0.6 truecm \noindent {\bf 9.1 Table 9.1} \noindent \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr {\sf Names} & {\sf L. Str.1} & {\sf L. Str.2} & {\sf Hits} & {\sf Tc} \cr &&&&\cr \noalign{\hrule} &&&&\cr \vbox{ \hbox{{\sf bensatropin}} \hbox{{\sf bupivakain}} } & {\bildett} & {\bildtre} & 10 & 0.33 \cr \vbox{ \hbox{{\sf bensatropin}} \hbox{{\sf kokain}} } & {\bildett} & {\bildatta} & 12 & 0.43 \cr \vbox{ \hbox{{\sf bupivakain}} \hbox{{\sf kokain}} } & {\bildtre} & {\bildatta} & 10 & 0.33 \cr \vbox{ \hbox{{\sf bupivakain}} \hbox{{\sf lofepramin}} } & {\bildtre} & {\bildtio} & 8 & 0.25 \cr \vbox{ \hbox{{\sf structure 4}} \hbox{{\sf structure 27}} } & {\bildfyra} & {\bildtjugosju} & 12 & 0.43 \cr \vbox{ \hbox{{\sf structure 4}} \hbox{{\sf structure 28}} } & {\bildfyra} & {\bildtjugoatta} & 12 & 0.43 \cr \vbox{ \hbox{{\sf structure 4}} \hbox{{\sf R-Suedenol}} } & {\bildfyra} & {\bildtrettiofyra} & 6 & 0.18 \cr \vbox{ \hbox{{\sf structure 4}} \hbox{{\sf Pentachloroph.}} } & {\bildfyra} & {\bildtrettiosex} & 6 & 0.18 \cr \vbox{ \hbox{{\sf structure 7}} \hbox{{\sf structure 27}} } & {\bildsju} & {\bildtjugosju} & 9 & 0.29 \cr \vbox{ \hbox{{\sf structure 7}} \hbox{{\sf structure 28}} } & {\bildsju} & {\bildtjugoatta} & 8 & 0.25 \cr &&&&\cr\noalign{\hrule}} \smallskip \vfill\eject \noindent \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr {\sf Names} & {\sf L. Str.1} & {\sf L. 
Str.2} & {\sf Hits} & {\sf Tc} \cr &&&&\cr \noalign{\hrule} \vbox{ \hbox{{\sf kokain}} \hbox{{\sf simanneal}} } & {\bildatta} & {\bildtjugoett} & 8 & 0.25 \cr \vbox{ \hbox{{\sf lofepramin}} \hbox{{\sf trimipramin}} } & {\bildtio} & {\bildtjugofyra} & 8 & 0.25 \cr \vbox{ \hbox{{\sf metadon}} \hbox{{\sf morfin}} } & {\bildfjorton} & {\bildfemton} & 12 & 0.43 \cr \vbox{ \hbox{{\sf metadon}} \hbox{{\sf ibuprofen}} } & {\bildfjorton} & {\bildfyrtiofem} & 9 & 0.29 \cr \vbox{ \hbox{{\sf morfin}} \hbox{{\sf simanneal}} } & {\bildfemton} & {\bildtjugoett} & 6 & 0.18 \cr \vbox{ \hbox{{\sf Myo-inositol}} \hbox{{\sf Dopamine}} } & {\bildfyrtio} & {\bildfyrtionio} & 6 & 0.18 \cr \vbox{ \hbox{{\sf atrazine}} \hbox{{\sf ibuprofen}} } & {\bildarton} & {\bildfyrtiofem} & 1 & 0.33 \cr \vbox{ \hbox{{\sf Myo-inositol}} \hbox{{\sf Beta-D-gal}} } & {\bildfyrtio} & {\bildfemtiofem} & 14 & 0.54 \cr \vbox{ \hbox{{\sf Frontalin}} \hbox{{\sf Exo-B.comin}} } & {\bildfyrtiosex} & {\bildfyrtiosju} & 9 & 0.29 \cr &&&&\cr\noalign{\hrule}} \smallskip \vfill\eject \hoffset 0 truecm \noindent \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr {\sf Names} & {\sf L. Str.1} & {\sf L. 
Str.2} & {\sf Hits} & {\sf Tc} \cr &&&&\cr \noalign{\hrule} &&&&\cr \vbox{ \hbox{{\sf Sp.dioxaundec.}} \hbox{{\sf Frontalin}} } & {\bildtjugonio} & {\bildfyrtiosex} & 8 & 0.26 \cr \vbox{ \hbox{{\sf Coniine}} \hbox{{\sf Coniine'}} } & {\bildfemtiotva} & {\bildfemtiotva} & 5 & 1.0 \cr \vbox{ \hbox{{\sf Epinaphrine}} \hbox{{\sf Dopamine}} } & {\bildfyrtioatta} & {\bildfyrtionio} & 9 & 0.29 \cr \vbox{ \hbox{{\sf structure 27}} \hbox{{\sf Epinaphrine}} } & {\bildtjugosju} & {\bildfyrtioatta} & 9 & 0.29 \cr \vbox{ \hbox{{\sf Dopamine}} \hbox{{\sf Beta-D-gal}} } & {\bildfyrtionio} & {\bildfemtiofem} & 6 & 0.18 \cr \vbox{ \hbox{{\sf Sp.dioxaundec.}} \hbox{{\sf Frontalin}} } & {\bildtjugonio} & {\bildfyrtiosex} & 8 & 0.26 \cr \vbox{ \hbox{{\sf Sp.dioxaundec.}} \hbox{{\sf Exo-brevicomin}} } & {\bildtjugonio} & {\bildfyrtiosju} & 8 & 0.26 \cr \vbox{ \hbox{{\sf Santene}} \hbox{{\sf R-sulcatol}} } & {\bildtrettio} & {\bildtrettiotva} & 6 & 0.18 \cr \vbox{ \hbox{{\sf Santene}} \hbox{{\sf Mescaline}} } & {\bildtrettio} & {\bildfyrtioett} & 6 & 0.18 \cr \vbox{ \hbox{{\sf Santene}} \hbox{{\sf I.prop.ammelide}} } & {\bildtrettio} & {\bildfyrtiofyra} & 6 & 0.18 \cr &&&&\cr\noalign{\hrule}} \smallskip \vfill\eject \noindent \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr {\sf Names} & {\sf L. Str.1} & {\sf L. 
Str.2} & {\sf Hits} & {\sf Tc} \cr &&&&\cr \noalign{\hrule} &&&&\cr \vbox{ \hbox{{\sf S-6-Methyl}} \hbox{{\sf Ibuprofen}} } & {\bildtrettioett} & {\bildfyrtiofem} & 5 & 0.24 \cr \vbox{ \hbox{{\sf structure 27}} \hbox{{\sf R-suedenol}} } & {\bildtjugosju} & {\bildtrettiofyra} & 14 & 0.54 \cr \vbox{ \hbox{{\sf 2-hydroxyglutarate}} \hbox{{\sf D-3-hydroxyproline}} } & {\bildfem} & {\bildfemtio} & 3 & 0.33 \cr \vbox{ \hbox{{\sf Nicotine}} \hbox{{\sf Metadon}} } & {\bildtrettiosju} & {\bildfjorton} & 6 & 0.22 \cr \vbox{ \hbox{{\sf Nicotine}} \hbox{{\sf Nicotine'}} } & {\bildtrettiosju} & {\bildtrettiosju} & 13 & 1.0 \cr \vbox{ \hbox{{\sf Nicotine}} \hbox{{\sf Nicotine''}} } & {\bildtrettiosju} & {\bildtrettiosju} & 13 & 1.0 \cr \vbox{ \hbox{{\sf Nicotine}} \hbox{{\sf Ibuprofen}} } & {\bildtrettiosju} & {\bildfyrtiofem} & 6 & 0.22 \cr &&&&\cr\noalign{\hrule}} \smallskip \vfill\eject \noindent {\bf 9.2: Table 9.2} { \centerline{ Comp. Limit: 10 features}} \noindent The average N of hits is: 0.298051953 \smallskip \noindent \leavevmode \vbox{\hsize=0.75\hsize \bigskip {\offinterlineskip \tabskip=0pt \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr Mol. 
1 & Mol.2 & hits & Tc & P \cr &&&&\cr \noalign{\hrule} &&&&\cr Nicotine &Nicotine2 & 10 & 1.000 & 1.66E-09\cr Nicotine &nicotine & 10 & 1.000 & 1.66E-09\cr Nicotine2 &nicotine & 10 & 1.000 & 1.66E-09\cr Coniine &coniine & 5 & 1.000 & 1.76E-09\cr Frontalin &exo-Brevicomin & 7 & 0.538 & 1.34E-09\cr bensatropin &metadon & 6 & 0.429 & 1.01E-09\cr 2-hydroxyglutar &D-3-hydroxyprol & 3 & 0.333 & 8.92E-08\cr lofepramin &trimipramin & 5 & 0.333 & 1.68E-09\cr 2,4-dihydroxy-6 &Epinaphrine & 5 & 0.333 & 1.68E-09\cr myo-inositol &Dopamine & 5 & 0.333 & 1.68E-09\cr myo-inositol &beta-D-Gal & 5 & 0.333 & 1.68E-09\cr bensatropin &trimipramin & 5 & 0.333 & 1.68E-09\cr 2-chloro-4-hydr &Pentachlorophen & 4 & 0.250 & 5.16E-08\cr 2-hydroxy-4,6-d &isopropylammeli & 4 & 0.250 & 5.16E-08\cr lofepramin &metadon & 4 & 0.250 & 5.16E-08\cr bupivakain &kokain & 4 & 0.250 & 5.16E-08\cr metadon &simanneal & 4 & 0.250 & 5.16E-08\cr metadon &trimipramin & 4 & 0.250 & 5.16E-08\cr morfin &simanneal & 4 & 0.250 & 5.16E-08\cr 2,4-dihydroxy-6 &2-chloro-4,6-di & 4 & 0.250 & 5.16E-08\cr 2,4-dihydroxy-6 &m-Cresol & 4 & 0.250 & 5.16E-08\cr 2-chloro-4,6-di &m-Cresol & 4 & 0.250 & 5.16E-08\cr 2-chloro-4,6-di &Epinaphrine & 4 & 0.250 & 5.16E-08\cr spirodioxaundec &exo-Brevicomin & 4 & 0.250 & 5.16E-08\cr santene &R-Sulcatol & 4 & 0.250 & 5.16E-08\cr Pentachlorophen &myo-inositol & 4 & 0.250 & 5.16E-08\cr bensatropin &lofepramin & 4 & 0.250 & 5.16E-08\cr bensatropin &morfin & 4 & 0.250 & 5.16E-08\cr myo-inositol &Epinaphrine & 4 & 0.250 & 5.16E-08\cr &&&&\cr\noalign{\hrule}}}} \vfill\eject \noindent Comparision {\it cont.} \smallskip \noindent \leavevmode \vbox{\hsize=0.75\hsize \bigskip {\offinterlineskip \tabskip=0pt \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr Mol. 
1 & Mol.2 & hits & Tc & P \cr &&&&\cr \noalign{\hrule} &&&&\cr
bupivakain &metadon & 4 & 0.250 & 5.16E-08\cr
Mescaline &Disulfiram & 4 & 0.250 & 5.16E-08\cr
m-Cresol &Epinaphrine & 4 & 0.250 & 5.16E-08\cr
2-chloro-4-hydr &2,4-dihydroxy-6 & 4 & 0.250 & 5.16E-08\cr
Epinaphrine &Dopamine & 4 & 0.250 & 5.16E-08\cr
Epinaphrine &beta-D-Gal & 4 & 0.250 & 5.16E-08\cr
bensatropin &simanneal & 4 & 0.250 & 5.16E-08\cr
S-6-Methyl &Ibuprofen & 3 & 0.231 & 2.52E-06\cr
4-Methylheptan- &S-6-Methyl & 2 & 0.200 & 1.41E-04\cr
2-chloro-4-hydr &2-chloroethanol & 2 & 0.200 & 2.84E-05\cr
2-chloroethanol &Pentachlorophen & 2 & 0.200 & 2.84E-05\cr
Amphetamine &ar0016 & 3 & 0.200 & 7.34E-06\cr
budesonid &atrazine & 3 & 0.176 & 1.49E-05\cr
lofepramin &noskapin & 3 & 0.176 & 1.49E-05\cr
Pentachlorophen &Dopamine & 3 & 0.176 & 1.49E-05\cr
bupivakain &lofepramin & 3 & 0.176 & 1.49E-05\cr
budesonid &noskapin & 3 & 0.176 & 1.49E-05\cr
noskapin &trimipramin & 3 & 0.176 & 1.49E-05\cr
metadon &morfin & 3 & 0.176 & 1.49E-05\cr
2-chloro-4-hydr &Epinaphrine & 3 & 0.176 & 1.49E-05\cr
kokain &lofepramin & 3 & 0.176 & 1.49E-05\cr
&&&&\cr\noalign{\hrule}}}} \smallskip
\noindent Comp.\ Limit: limited number of similarities (each corresponding to a ``1'' in the bit-string).
The similarities are listed in the bit-string according to priority; Comp.\ Limit 10 means that only the 10 highest-priority similarity features are considered.
Average N is the average HitF for all pairs of molecules in the comparison.
\vfill\eject \noindent {\bf 9.3: Table 9.3} { \centerline{ Comp. Limit: 20 features}} \noindent The average N of hits is: 0.836363614 \smallskip \noindent \leavevmode \vbox{\hsize=0.75\hsize \bigskip {\offinterlineskip \tabskip=0pt \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr Mol.
1 & Mol.2 & hits & Tc & P \cr &&&&\cr \noalign{\hrule} &&&&\cr Nicotine2 &nicotine & 20 & 1.000 & 1.10E-09\cr Coniine &coniine & 5 & 1.000 & 1.76E-09\cr Nicotine &Nicotine2 & 18 & 0.818 & 1.21E-09\cr Nicotine &nicotine & 18 & 0.818 & 1.21E-09\cr 2-chloro-4-hydr &2,4-dihydroxy-6 & 12 & 0.429 & 1.02E-09\cr 2-chloro-4-hydr &2-chloro-4,6-di & 12 & 0.429 & 1.02E-09\cr myo-inositol &beta-D-Gal & 12 & 0.429 & 1.02E-09\cr Frontalin &exo-Brevicomin & 12 & 0.429 & 1.02E-09\cr bensatropin &kokain & 12 & 0.429 & 1.02E-09\cr 2-hydroxyglutar &D-3-hydroxyprol & 3 & 0.333 & 8.92E-08\cr metadon &Nicotine & 10 & 0.333 & 1.56E-09\cr metadon &Nicotine2 & 10 & 0.333 & 1.56E-09\cr metadon &nicotine & 10 & 0.333 & 1.56E-09\cr bensatropin &bupivakain & 10 & 0.333 & 1.56E-09\cr bupivakain &kokain & 10 & 0.333 & 1.56E-09\cr 2-hydroxy-4,6-d &2,4-dihydroxy-6 & 9 & 0.290 & 1.76E-09\cr metadon &morfin & 9 & 0.290 & 1.76E-09\cr 2,4-dihydroxy-6 &Epinaphrine & 9 & 0.290 & 1.76E-09\cr Epinaphrine &Dopamine & 9 & 0.290 & 1.76E-09\cr spirodioxaundec &Frontalin & 8 & 0.258 & 1.82E-09\cr spirodioxaundec &exo-Brevicomin & 8 & 0.258 & 1.82E-09\cr 2-hydroxy-4,6-d &2-chloro-4,6-di & 8 & 0.250 & 1.51E-09\cr kokain &simanneal & 8 & 0.250 & 1.51E-09\cr lofepramin &trimipramin & 8 & 0.250 & 1.51E-09\cr metadon &Ibuprofen & 8 & 0.250 & 1.51E-09\cr morfin &simanneal & 8 & 0.250 & 1.51E-09\cr bupivakain &metadon & 8 & 0.250 & 1.51E-09\cr bupivakain &lofepramin & 8 & 0.250 & 1.51E-09\cr lofepramin &terfenadin & 7 & 0.212 & 1.95E-09\cr &&&&\cr\noalign{\hrule}}}} \vfill\eject \noindent Comparision {\it cont.} \smallskip \noindent \leavevmode \vbox{\hsize=0.75\hsize \bigskip {\offinterlineskip \tabskip=0pt \halign { \strut \vrule\quad\hfil#\hfil\quad\vrule&& \quad\hfil #\hfil\quad\vrule \cr \noalign{\hrule} &&&&\cr Mol. 
1 & Mol.2 & hits & Tc & P \cr &&&&\cr \noalign{\hrule} &&&&\cr metadon &trimipramin & 7 & 0.212 & 1.95E-09\cr lofepramin &oxotremorin & 7 & 0.212 & 1.95E-09\cr bensatropin &simanneal & 7 & 0.212 & 1.95E-09\cr Nicotine &Ibuprofen & 7 & 0.212 & 1.95E-09\cr Nicotine2 &Ibuprofen & 7 & 0.212 & 1.95E-09\cr nicotine &Ibuprofen & 7 & 0.212 & 1.95E-09\cr 2-chloro-4-hydr &Epinaphrine & 7 & 0.212 & 1.95E-09\cr 4-Methylheptan- &S-6-Methyl & 2 & 0.200 & 1.41E-04\cr Amphetamine &ar0016 & 3 & 0.200 & 7.34E-06\cr oxotremorin &Coniine & 4 & 0.190 & 5.73E-08\cr oxotremorin &coniine & 4 & 0.190 & 5.73E-08\cr Benalcohol &2,4-dihydroxy-6 & 4 & 0.190 & 5.73E-08\cr Benzaldehyde &Piperonal & 4 &