2

So, I'm trying to extract information from this xml --

<bdb:getTargetByCompoundResponse xmlns:bdb="http://ws.bindingdb.org/xsd">
   <bdb:smile>C[C@H]1[C@H](C)CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@](C)(C(=O)O)[C@@H]5CC[C@]43C)[C@H]12</bdb:smile>
   <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1 AuxInfo=1/1/N:4,1,8,19,12,33,25,5,30,21,6,20,31,9,10,14,3,2,13,15,29,22,34,17,26,7,18,11,32,24,16,23,27,28/E:(33,34)/it:im/rA:34CC.oC.eCCCC.oCCCC.oCCCCOC.eC.eCCCC.oOC.oCCOOC.eCCC.eCC.e/rB:s1;s2;s3;s3;s5;s6;s7;s7;s9;s10;s11;s11;d-13;s14;d15;s15;s17;s18;s18;s20;s21;s22;s22;s24;s24;d26;s26;s18s24;s29;s30;s11s17s31;s32;s2s7s13;/rC:2.8737,-5.8026,0;2.1037,-7.1363,0;.5637,-7.1363,0;-.2063,-5.8026,0;-.2063,-8.47,0;.5637,-9.8037,0;2.1037,-9.8037,0;1.3337,-11.1374,0;2.8737,-11.1374,0;4.4137,-11.1374,0;5.1837,-9.8037,0;5.9537,-11.1374,0;4.4137,-8.47,0;5.1837,-7.1363,0;6.7237,-7.1363,0;7.4937,-5.8026,0;7.4937,-8.47,0;9.0337,-8.47,0;8.2637,-7.1363,0;9.8037,-7.1363,0;11.3437,-7.1363,0;12.1137,-8.47,0;13.6537,-8.47,0;11.3437,-9.8037,0;11.0763,-11.3203,0;12.7908,-10.3304,0;13.9705,-9.3405,0;13.0582,-11.847,0;9.8037,-9.8037,0;9.0337,-11.1374,0;7.4937,-11.1374,0;6.7237,-9.8037,0;6.8847,-11.3352,0;2.8737,-8.47,0;</bdb:inchi>
   <bdb:hit>7</bdb:hit>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Polyunsaturated fatty acid 5-lipoxygenase</bdb:target>
      <bdb:species>Homo sapiens (Human)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>3000</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Prolyl endopeptidase</bdb:target>
      <bdb:species>Homo sapiens (Human)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>36320</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Prostaglandin E synthase</bdb:target>
      <bdb:species>Homo sapiens (Human)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>3000</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Prostaglandin G/H synthase 1</bdb:target>
      <bdb:species>Ovis aries (Sheep)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>&gt;40000</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Prostaglandin G/H synthase 2</bdb:target>
      <bdb:species>Homo sapiens (Human)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>&gt;40000</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Tyrosine-protein phosphatase non-receptor type 1</bdb:target>
      <bdb:species>Homo sapiens (Human)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>8040</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
   <bdb:affinities>
      <bdb:monomerid>50241261</bdb:monomerid>
      <bdb:inhibitor>BDBM50241261</bdb:inhibitor>
      <bdb:target>Tyrosine-protein phosphatase non-receptor type 2</bdb:target>
      <bdb:species>Homo sapiens (Human)</bdb:species>
      <bdb:affinity_type>IC50</bdb:affinity_type>
      <bdb:affinity>9450</bdb:affinity>
      <bdb:smiles>C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@@](C)([C@@H]5CC[C@@]34C)C(O)=O)[C@@H]2[C@H]1C</bdb:smiles>
      <bdb:inchi>InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)19(23(26)18(17)2)16-20(31)24-27(4)12-10-22(32)30(7,25(33)34)21(27)9-13-29(24,28)6/h16-18,21-24,32H,8-15H2,1-7H3,(H,33,34)/t17-,18+,21-,22-,23+,24-,26-,27+,28-,29-,30-/m1/s1</bdb:inchi>
      <bdb:tanimoto>1.00000</bdb:tanimoto>
   </bdb:affinities>
</bdb:getTargetByCompoundResponse>

But I'm getting the following error-

xpath does not return any nodes. Be sure row level nodes are in xpath. If document uses namespaces denoted with xmlns, be sure to define namespaces and use them in xpath.

I tried this code

smile = 'C[C@H]1[C@H](C)CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C@@]5(C)CC[C@@H](O)[C@](C)(C(=O)O)[C@@H]5CC[C@]43C)[C@H]12'

api_binding = requests.get(f'https://bindingdb.org/axis2/services/BDBService/getTargetByCompound?smiles={smile}&cutoff=1')
           
df = pd.read_xml(api_binding.text, xpath = ".//bdb", namespaces = {"bdb":"https://ws.bindingdb.org/xsd"})
result = df.loc[df["species"] == "Homo sapiens (Human)", "target"]
Stu Sztukowski
  • 10,597
  • 1
  • 12
  • 21
  • The only problem with what you have is that the xpath you're querying will try to match nodes whose tag is exactly `bdb`. You just need to add `:*` to the end of the xpath you have (i.e. `.//bdb:*`), to tell it to look for any element whose tag carries the `bdb` prefix. – L0tad Feb 16 '23 at 23:08
  • In fact, if you use `xpath="./bdb:affinities"`, you'll get exactly the result you want, without having to do any cleanup. – L0tad Feb 16 '23 at 23:21

3 Answers

0

If you remove xpath, you'll read it in almost as expected. We know from the output that there should be 7 rows (i.e. <bdb:hit> shows 7). The first three tags seem to contain only some metadata about what it returned:

<bdb:smile>
<bdb:inchi>
<bdb:hit>

These make up rows 0-2 of the dataframe. Your actual data starts on row 3, or wherever we see the <bdb:affinities> tag. Based on this pattern, we can keep only the data where target is not missing. You can choose any other column where you know there will always be data, but in this case, we're going to stick with target.

df = pd.read_xml(api_binding.text, namespaces = {"bdb":"https://ws.bindingdb.org/xsd"}).dropna(subset='target')

result = df.loc[df["species"] == "Homo sapiens (Human)", "target"]

Output:

3           Polyunsaturated fatty acid 5-lipoxygenase
4                                Prolyl endopeptidase
5                            Prostaglandin E synthase
7                        Prostaglandin G/H synthase 2
8    Tyrosine-protein phosphatase non-receptor type 1
9    Tyrosine-protein phosphatase non-receptor type 2
Stu Sztukowski
  • 10,597
  • 1
  • 12
  • 21
  • Even better would be to use `xpath=".//bdb:affinities"`, then you'll get one row per entry, and won't have to mess around with dropping rows to get to what you actually want – L0tad Feb 16 '23 at 23:15
  • When I tried that, I received `xpath does not return any nodes` – Stu Sztukowski Feb 17 '23 at 00:58
0

You can build the DataFrame from all of the elements, and then filter the result. For example:

df = pd.read_xml(str_xml, xpath = ".//bdb:*", namespaces = {"bdb":"http://ws.bindingdb.org/xsd"})
result = df.loc[df["species"] == "Homo sapiens (Human)", "target"]
result

result would be:

3     Polyunsaturated fatty acid 5-lipoxygenase
7                                          None
13                         Prolyl endopeptidase
17                                         None
simpleApp
  • 2,885
  • 2
  • 10
  • 19
  • This does not produce the correct output. The result should contain all of the `target`s with the exception of the sheep one in row 3. – L0tad Feb 16 '23 at 23:25
0

There are a couple things going on here that I think the other answers missed, so I will try to give a complete overview and explanation.

In your sample of the code you've tried, you are querying xpath=".//bdb", which will look for any element beneath the root node with a tag of "bdb". There are no such elements (whose tag is just bdb), so you get ValueError: xpath does not return any nodes. simpleApp's answer correctly identifies this issue, and suggests adding an asterisk (i.e. xpath=".//bdb:*") to tell xpath to look for any elements whose tag starts with bdb:. However, this returns all of the various nested elements in the XML, since in your case everything is prefixed with bdb:, so we lose the structural information contained in the hierarchy of the XML content, leading to a mess of a DataFrame.

Stu Sztukowski's answer suggests dropping the xpath kwarg completely, which will make pandas more or less do what you want, except then you get unwanted rows resulting from the <bdb:smile/>, <bdb:inchi/>, and <bdb:hit/> elements at the top that you have to clean up yourself after you create the DataFrame.

Also, you need to use http in your namespaces dictionary, not https, since the namespace mapping for bdb (shown in first line/root of your XML content) is http://ws.bindingdb.org/xsd, with no s.

A clean solution that will get you exactly the DataFrame that you want is

df = pd.read_xml(api_binding.text, xpath = "./bdb:affinities",
                 namespaces = {"bdb": "http://ws.bindingdb.org/xsd"})
result = df.loc[df["species"] == "Homo sapiens (Human)", "target"]

You can use either xpath = "./bdb:affinities" or xpath = ".//bdb:affinities" (single or double slash) since in this case all of the bdb:affinities elements are right below the root anyway, so the two syntaxes do the same thing.

This solution yields a df that looks like:

   monomerid     inhibitor                                            target               species affinity_type affinity                                             smiles                                              inchi  tanimoto
0   50241261  BDBM50241261         Polyunsaturated fatty acid 5-lipoxygenase  Homo sapiens (Human)          IC50     3000  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0
1   50241261  BDBM50241261                              Prolyl endopeptidase  Homo sapiens (Human)          IC50    36320  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0
2   50241261  BDBM50241261                          Prostaglandin E synthase  Homo sapiens (Human)          IC50     3000  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0
3   50241261  BDBM50241261                      Prostaglandin G/H synthase 1    Ovis aries (Sheep)          IC50   >40000  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0
4   50241261  BDBM50241261                      Prostaglandin G/H synthase 2  Homo sapiens (Human)          IC50   >40000  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0
5   50241261  BDBM50241261  Tyrosine-protein phosphatase non-receptor type 1  Homo sapiens (Human)          IC50     8040  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0
6   50241261  BDBM50241261  Tyrosine-protein phosphatase non-receptor type 2  Homo sapiens (Human)          IC50     9450  C[C@@H]1CC[C@]2(C)CC[C@]3(C)C(=CC(=O)[C@@H]4[C...  InChI=1S/C30H46O4/c1-17-8-11-26(3)14-15-28(5)1...       1.0

And a result that looks like:

0           Polyunsaturated fatty acid 5-lipoxygenase
1                                Prolyl endopeptidase
2                            Prostaglandin E synthase
4                        Prostaglandin G/H synthase 2
5    Tyrosine-protein phosphatase non-receptor type 1
6    Tyrosine-protein phosphatase non-receptor type 2
Name: target, dtype: object
L0tad
  • 574
  • 3
  • 15