Open In Colab

Lab 4: Descriptors for Molecules and Materials#

This notebook is intended to introduce the use of molecular and materials descriptors to obtain features for machine learning.

We will cover the use of RDKit to generate molecular descriptors, and Matminer and Dscrib to generate materials descriptors.

Installation#

  • To install RDKit follow the instructions here

  • To install Matminer follow the instructions here

  • To install Dscribe follow the instructions here

Optional

For visualization of the crystals in Jupyter Notebook, install crystal-toolkit

RDKit#

Create molecules#

# Smiles string of a molecule

benzene_smi = "c1ccccc1"
# Rdkit molecule

from rdkit import Chem

benzene = Chem.MolFromSmiles(benzene_smi)

benzene
../_images/6f84e9e62687bc68e29470de95175f024d2de1fdf4018249b582d791e7591284.png
EC_smi = "C1COC(=O)O1"

EC = Chem.MolFromSmiles(EC_smi)

EC
../_images/0e96452bd52a1eb7dfe257fbfb6e1c808ba0776e7bcd31ce1734a4f220d19e67.png

Morgan/Circular/ECFP fingerprints#

from rdkit.Chem import AllChem

# Create generator
morgan_generator = AllChem.GetMorganGenerator(radius=2)

# Generate the Morgan fingerprint
morgan_fp = morgan_generator.GetFingerprint(EC)

# Convert the fingerprint to a binary string
fp_binary = morgan_fp.ToBitString()

print("Fingerprint length:", len(fp_binary))

print("First five bits:", fp_binary[:5])
Fingerprint length: 2048
First five bits: 00000

Crystal composition & structure#

from pymatgen.core import Structure

filename = "./data/NaCl.POSCAR"

NaCl = Structure.from_file(filename)

print(NaCl)
Full Formula (Na4 Cl4)
Reduced Formula: NaCl
abc   :   5.588126   5.588126   5.588126
angles:  90.000000  90.000000  90.000000
pbc   :       True       True       True
Sites (8)
  #  SP      a    b    c
---  ----  ---  ---  ---
  0  Na    0    0    0
  1  Na    0    0.5  0.5
  2  Na    0.5  0    0.5
  3  Na    0.5  0.5  0
  4  Cl    0    0    0.5
  5  Cl    0    0.5  0
  6  Cl    0.5  0    0
  7  Cl    0.5  0.5  0.5
NaCl_comp = NaCl.composition

NaCl_comp
Composition('Na4 Cl4')

Visualize it#

import crystal_toolkit

NaCl
No module named 'phonopy'
/opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Matminer#

Compositional descriptor#

from matminer.featurizers.composition import ElementProperty

ep = ElementProperty.from_preset(preset_name="magpie")
features = ep.featurize(NaCl_comp)

# Length of the feature vector
len(features)
132
# First 5 features
features[:5]
[11.0, 17.0, 6.0, 14.0, 3.0]
# What are these features?

labels = ep.feature_labels()

labels[:5]
['MagpieData minimum Number',
 'MagpieData maximum Number',
 'MagpieData range Number',
 'MagpieData mean Number',
 'MagpieData avg_dev Number']

Structure descriptor#

from matminer.featurizers.structure import RadialDistributionFunction

cutoff = 20
rdf = RadialDistributionFunction(cutoff=cutoff)

features = rdf.featurize(NaCl)

len(features)
200
features.shape
(200,)
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots()

X = np.arange(0, cutoff, cutoff/len(features))
Y = features

ax.plot(X, Y)

ax.set_xlabel(r"Distance $(\mathrm{\AA})$")
ax.set_ylabel("Radial distribution function")
Text(0, 0.5, 'Radial distribution function')
../_images/f0765ee6f4a0a0557f34c110f9df20b273d23870895443436c097cecbc175873.png

Crystal composition & structure#

  • ASE (an alternative to pymatgen)

from ase.io import read

NaCl = read("./data/NaCl.POSCAR")

NaCl
Atoms(symbols='Na4Cl4', pbc=True, cell=[[5.588126435439935, 0.0, 3e-16], [9e-16, 5.588126435439935, 3e-16], [0.0, 0.0, 5.588126435439935]])
from ase.visualize import view

view(NaCl, viewer='x3d')
ASE atomic visualization

SOAP Descriptor#

from dscribe.descriptors import SOAP

soap = SOAP(r_cut=5, n_max=8, l_max=6, species=["Na", "Cl"], periodic=True)

features = soap.create(NaCl)

# The shape is (n_atoms, n_features)
features.shape
(8, 952)
# The first 10 features for the first atom
features[0][:10]
array([ 0.01150823,  0.05439198,  0.08569168,  0.30573843,  0.00483586,
        0.906287  , -1.43857222,  2.26457369,  0.25707581,  0.40500933])