Source code for robocrys.condense.component
"""This module implements functions for handling structure components."""
from __future__ import annotations
from copy import deepcopy
from typing import TYPE_CHECKING, Any
import networkx as nx
import numpy as np
from monty.fractions import gcd
from pymatgen.analysis.structure_matcher import StructureMatcher
from pymatgen.core.composition import Composition
from pymatgen.core.periodic_table import get_el_sp
from pymatgen.core.structure import PeriodicSite, Structure
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer
from pymatgen.util.string import formula_double_format
from robocrys.condense.fingerprint import get_structure_fingerprint
from robocrys.util import common_formulas
if TYPE_CHECKING:
Component = dict[str, Any]
[docs]def get_structure_inequiv_components(
components: list[Component],
use_structure_graph: bool = False,
fingerprint_tol: float = 0.01,
) -> list[Component]:
"""Gets and counts the structurally inequivalent components.
Supports matching through StructureMatcher or by a combined structure graph/
site fingerprint approach. For the latter method, the component data has to
have been generated with ``inc_graph=True``.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`.
If ``use_structure_graph=True``, the components should be generated
with ``inc_graph=True``.
use_structure_graph: Whether to use the bonding graph and site
fingerprints to determine if components are structurally equivalent.
If ``False``,
:obj:`pymatgen.analysis.structure_matcher.StructureMatcher` will be
used to compare the components.
fingerprint_tol: The fingerprint tolerance to determine whether two
components have a matching fingerprint. This option is ignored if
``use_structure_graph=False``.
Returns:
A list of the structurally inequivalent components. Any duplicate
components will only be returned once. The component objects are in the
same format is given by
:obj:`pymatgen.analysis.dimensionality.get_structure_components` but
have an additional field:
- ``"count"`` (:obj:`int`): The number of times this component appears
in the structure.
"""
components = deepcopy(components)
for component in components:
component["count"] = 1
if use_structure_graph:
# check fingerprints match and components are isomorphic.
fingerprints = [
get_structure_fingerprint(c["structure_graph"].structure)
for c in components
]
seen_components = [components[0]]
seen_fingers = [fingerprints[0]]
for component, fingerprint in zip(components[1:], fingerprints[1:]):
graph_match = [
components_are_isomorphic(component, c) for c in seen_components
]
finger_match = [
np.linalg.norm(fingerprint - c) < fingerprint_tol for c in seen_fingers
]
structure_match = [i and f for i, f in zip(graph_match, finger_match)]
if any(structure_match):
# there should only ever be a single match so we take index of
# the first match and increment the component count
loc = np.where(structure_match)[0][0]
seen_components[loc]["count"] += 1
else:
seen_components.append(component)
seen_fingers.append(fingerprint)
else:
sm = StructureMatcher()
seen_components = [components[0]]
for component in components[1:]:
structure_match = [
sm.fit(
component["structure_graph"].structure,
c["structure_graph"].structure,
)
for c in seen_components
]
if any(structure_match):
# there should only ever be a single match so we take index of
# the first match and increment the component count
loc = np.where(structure_match)[0][0]
seen_components[loc]["count"] += 1
else:
seen_components.append(component)
return seen_components
[docs]def components_are_isomorphic(
component_a: Component, component_b: Component, use_weights: bool = False
):
"""Determines whether the graphs of two components are isomorphic.
Only takes into account graph connectivity and not local geometry (e.g. bond
angles and distances).
Args:
component_a: The first component.
component_b: The second component.
use_weights: Whether to use the graph edge weights in comparing graphs.
Returns:
Whether the components are isomorphic.
"""
def node_match(n1, n2):
return n1["specie"] == n2["specie"]
def edge_match(e1, e2):
if use_weights:
return e1["weight"] == e2["weight"]
return True
graph_a = component_a["structure_graph"].graph
graph_b = component_b["structure_graph"].graph
species_a = {
n: {"specie": str(component_a["structure_graph"].structure[n].specie)}
for n in graph_a
}
species_b = {
n: {"specie": str(component_b["structure_graph"].structure[n].specie)}
for n in graph_b
}
nx.set_node_attributes(graph_a, species_a)
nx.set_node_attributes(graph_b, species_b)
return nx.is_isomorphic(
graph_a, graph_b, node_match=node_match, edge_match=edge_match
)
[docs]def get_sym_inequiv_components(
components: list[Component], spg_analyzer: SpacegroupAnalyzer
) -> list[Component]:
"""Gets and counts the symmetrically inequivalent components.
Component data has to have been generated with ``inc_site_ids=True``.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`,
with ``inc_site_ids=True``.
spg_analyzer: A `pymatgen.symmetry.analyzer.SpacegroupAnalyzer` analyzer
object for the structure containing the components.
Returns:
A list of the symmetrically inequivalent components. Any duplicate
components will only be returned once. The component objects are in the
same format is given by
:obj:`pymatgen.analysis.dimensionality.get_structure_components` but
the additional property:
- ``"count"`` (:obj:`int`): The number of times this component appears
in the structure.
"""
components = deepcopy(components)
sym_inequiv_components = {}
equivalent_atoms = spg_analyzer.get_symmetry_dataset().equivalent_atoms
for component in components:
sym_indices = frozenset(equivalent_atoms[x] for x in component["site_ids"])
# if two components are composed of atoms that are symmetrically
# equivalent they are the same.
if sym_indices in sym_inequiv_components:
sym_inequiv_components[sym_indices]["count"] += 1
continue
component["count"] = 1
sym_inequiv_components[sym_indices] = component
return list(sym_inequiv_components.values())
[docs]def get_formula_inequiv_components(
components: list[Component],
use_iupac_formula: bool = True,
use_common_formulas: bool = True,
) -> list[Component]:
"""Gets and counts the inequivalent components based on their formuula.
Note that the counting of compounds is different to in
``get_sym_inequiv_equivalent``. I.e. the count is not the number of
components with the same formula. For example, the count of the formula
"GaAs" in a system with two Ga2As2 components would be 4.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`,
with ``inc_site_ids=True``.
use_iupac_formula (bool, optional): Whether to order formulas by the
iupac "electronegativity" series, defined in Table VI of
"Nomenclature of Inorganic Chemistry (IUPAC Recommendations 2005)".
This ordering effectively follows the groups and rows of the
periodic table, except the Lanthanides, Actanides and hydrogen. If
set to ``False``, the elements will be ordered according to the
electronegativity values.
use_common_formulas: Whether to use the database of common formulas.
The common formula will be used preferentially to the iupac or
reduced formula.
Returns:
A list of the compositionally inequivalent components. Any duplicate
components will only be returned once. The component objects are in the
same format is given by
:obj:`pymatgen.analysis.dimensionality.get_structure_components` but
have two additional fields:
- ``"count"`` (:obj:`int`): The number of formula units of this
component. Note, this is not the number of components with the same
formula. For example, the count of the formula "GaAs" in a system
with two Ga2As2 components would be 4.
- ``"formula"`` (``list[int]``): The reduced formula of the component.
"""
components = deepcopy(components)
inequiv_components = {}
for component in components:
formula, factor = get_component_formula_and_factor(
component,
use_iupac_formula=use_iupac_formula,
use_common_formulas=use_common_formulas,
)
# if two components have the same composition we treat them as the same
if formula in inequiv_components:
inequiv_components[formula]["count"] += factor
continue
component["count"] = factor
component["formula"] = formula
inequiv_components[formula] = component
return list(inequiv_components.values())
[docs]def filter_molecular_components(
components: list[Component],
) -> tuple[list[Component], list[Component]]:
"""Separate list of components into molecular and non-molecular components.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`.
Returns:
The filtered components as a tuple of ``(molecular_components,
other_components)``.
"""
molecular_components = [c for c in components if c["dimensionality"] == 0]
other_components = [c for c in components if c["dimensionality"] != 0]
return molecular_components, other_components
[docs]def get_reconstructed_structure(
components: list[Component], simplify_molecules: bool = True
) -> Structure:
"""Reconstructs a structure from a list of components.
Has the option to simplify molecular components into a single site
positioned at the centre of mass of the molecular. If using this option,
the components must have been generated with ``inc_molecule_graph=True``.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`,
with ``inc_molecule_graph=True``.
simplify_molecules: Whether to simplify the molecular components into
a single site positioned at the centre of mass of the molecule.
Returns:
The reconstructed structure.
"""
mol_sites = []
other_sites = []
if simplify_molecules:
mol_components, components = filter_molecular_components(components)
if mol_components:
lattice = mol_components[0]["structure_graph"].structure.lattice
mol_sites = [
PeriodicSite(
c["structure_graph"].structure[0].specie,
c["molecule_graph"].molecule.center_of_mass,
lattice,
coords_are_cartesian=True,
)
for c in mol_components
]
if components:
other_sites = [
site for c in components for site in c["structure_graph"].structure
]
return Structure.from_sites(other_sites + mol_sites)
[docs]def get_component_formula_and_factor(
component: Component,
use_iupac_formula: bool = True,
use_common_formulas: bool = True,
) -> tuple[str, int]:
"""Gets the reduced formula and factor of a single component.
Args:
component: A structure component, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`.
use_iupac_formula (bool, optional): Whether to order formulas by the
iupac "electronegativity" series, defined in Table VI of
"Nomenclature of Inorganic Chemistry (IUPAC Recommendations 2005)".
This ordering effectively follows the groups and rows of the
periodic table, except the Lanthanides, Actanides and hydrogen. If
set to ``False``, the elements will be ordered according to the
electronegativity values.
use_common_formulas: Whether to use the database of common formulas.
The common formula will be used preferentially to the iupac or
reduced formula.
Returns:
The formula and factor of the component.
"""
formula, factor = component[
"structure_graph"
].structure.composition.get_reduced_formula_and_factor(
iupac_ordering=use_iupac_formula
)
reduced_formula = component["structure_graph"].structure.composition.reduced_formula
if use_common_formulas and reduced_formula in common_formulas:
formula = common_formulas[reduced_formula]
return formula, factor
[docs]def get_component_formula(
component: Component,
use_iupac_formula: bool = True,
use_common_formulas: bool = True,
) -> str:
"""Gets the reduced formula of a single component.
Args:
component: A structure component, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`.
use_iupac_formula (bool, optional): Whether to order formulas by the
iupac "electronegativity" series, defined in Table VI of
"Nomenclature of Inorganic Chemistry (IUPAC Recommendations 2005)".
This ordering effectively follows the groups and rows of the
periodic table, except the Lanthanides, Actanides and hydrogen. If
set to ``False``, the elements will be ordered according to the
electronegativity values.
use_common_formulas: Whether to use the database of common formulas.
The common formula will be used preferentially to the iupac or
reduced formula.
Returns:
The formula and factor of the component.
"""
return get_component_formula_and_factor(
component,
use_iupac_formula=use_iupac_formula,
use_common_formulas=use_common_formulas,
)[0]
[docs]def get_formula_from_components(
components: list[Component],
molecules_first: bool = False,
use_iupac_formula: bool = True,
use_common_formulas: bool = True,
) -> str:
"""Reconstructs a chemical formula from structure components.
The chemical formulas for the individual components will be grouped
together. If two components share the same composition, they will be
treated as equivalent.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`.
molecules_first: Whether to put any molecules (zero-dimensional
components) at the beginning of the formula.
use_iupac_formula (bool, optional): Whether to order formulas by the
iupac "electronegativity" series, defined in Table VI of
"Nomenclature of Inorganic Chemistry (IUPAC Recommendations 2005)".
This ordering effectively follows the groups and rows of the
periodic table, except the Lanthanides, Actanides and hydrogen. If
set to ``False``, the elements will be ordered according to the
electronegativity values.
use_common_formulas: Whether to use the database of common formulas.
The common formula will be used preferentially to the iupac or
reduced formula.
Returns:
The chemical formula.
"""
def order(comp_formula):
composition = Composition(comp_formula)
if use_iupac_formula:
return sum(
[get_el_sp(s).iupac_ordering for s in composition.elements]
) / len(composition.elements)
return composition.average_electroneg
components = get_formula_inequiv_components(
components,
use_iupac_formula=use_iupac_formula,
use_common_formulas=use_common_formulas,
)
if molecules_first:
mol_comps, other_comps = filter_molecular_components(components)
else:
mol_comps = []
other_comps = components
formulas = sorted([c["formula"] for c in mol_comps], key=order) + sorted(
[c["formula"] for c in other_comps], key=order
)
# if components include special formulas, then the count can be 0.5
# therefore if any non integer amounts we can just use a factor of 2
all_int = all(v["count"] % 1 == 0 for v in components)
prefactor = 1 if all_int else 2
form_count_dict = {c["formula"]: int(c["count"] * prefactor) for c in components}
# the following is based on ``pymatgen.core.composition.reduce_formula``
num_comps = len(formulas)
factor = abs(gcd(*(form_count_dict.values())))
reduced_form = []
for i in range(0, num_comps):
formula = formulas[i]
norm_amt = form_count_dict[formula] * 1.0 / factor
formatted_formula = formula if norm_amt == 1 else f"({formula})"
reduced_form.append(formatted_formula)
reduced_form.append(str(formula_double_format(norm_amt)))
reduced_form = "".join(reduced_form)
return reduced_form
[docs]def components_are_vdw_heterostructure(components: list[Component]) -> bool:
"""Whether a list of components form a van der Waals heterostructure.
A heterostructure is defined here as a structure with more than one
formula inequivalent 2D component.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`.
Returns:
Whether the list of components from a heterostructure.
"""
components = get_formula_inequiv_components(components)
return bool(len([c for c in components if c["dimensionality"] == 2]))
[docs]def get_vdw_heterostructure_information(
components: list[Component],
use_iupac_formula: bool = True,
use_common_formulas: bool = True,
inc_ordered_components: bool = False,
inc_intercalants: bool = False,
) -> dict[str, Any]:
"""Gets information about ordering of components in a vdw heterostructure.
Args:
components: A list of structure components, generated using
:obj:`pymatgen.analysis.dimensionality.get_structure_components`
with ``inc_orientation=True``.
use_iupac_formula (bool, optional): Whether to order formulas by the
iupac "electronegativity" series, defined in Table VI of
"Nomenclature of Inorganic Chemistry (IUPAC Recommendations 2005)".
This ordering effectively follows the groups and rows of the
periodic table, except the Lanthanides, Actanides and hydrogen. If
set to ``False``, the elements will be ordered according to the
electronegativity values.
use_common_formulas: Whether to use the database of common formulas.
The common formula will be used preferentially to the iupac or
reduced formula.
inc_ordered_components: Whether to return a list of the ordered
components. If False, just the component formulas will be returned.
inc_intercalants: Whether to return a list of the intercalants. If
False, just the intercalant formulas will be returned.
Returns:
Information on the heterostructure, as an :obj:`dict` with they keys:
- ``"repeating_unit"`` (``list[str]``): A :obj:`List` of formulas of the
smallest repeating series of components. For example. if the
structure consists of A and B components ordered as "A B A B A B",
the repeating unit is "A B".
- ``"num_repetitions"`` (:obj:`int`): The number of repetitions of the
repeating unit that forms the overall structure. For example. if
the structure consists of A and B components ordered as
"A B A B A B", the number of repetitions is 3.
- ``"intercalant_formulas"`` (:obj:`list[str]`): The formulas of the
intercalated compounds.
- ``"ordered_components"`` (``list[component]``): If
``inc_ordered_components``, a :obj:`List` of components, ordered as
they appear in the heteostructure stacking direction.
- ``"intercalants"`` (``list[component]``: If ``inc_intercalants``, a
:obj:`List` of intercalated components.
"""
if not components_are_vdw_heterostructure(components):
raise ValueError("Components do not form a heterostructure.")
try:
millers = {c["orientation"] for c in components if c["dimensionality"] == 2}
except KeyError as exc:
if "orientation" in str(exc):
raise KeyError(
"Components not generated with inc_orientation=True"
) from exc
raise exc
if len(millers) != 1:
raise ValueError("2D components don't all have the same orientation.")
cart_miller = (
components[0]["structure_graph"]
.structure.lattice.get_cartesian_coords(millers.pop())
.tolist()
)
# plane is used to find the distances of all components along a certain axis
# should use normal vector of plane to get exact distances but we just care
# about relative distances.
def distances_to_plane(points):
return [np.dot(cart_miller, pp) for pp in points]
min_distances = [
min(distances_to_plane(c["structure_graph"].structure.cart_coords))
for c in components
]
# sort the components by distance to plane
ordering = np.argsort(min_distances)
ordered_components = [components[x] for x in ordering]
# only consider the layered components formulae
ordered_layers = [c for c in ordered_components if c["dimensionality"] == 2]
ordered_layers_formula = [
get_component_formula(
c,
use_iupac_formula=use_iupac_formula,
use_common_formulas=use_common_formulas,
)
for c in ordered_layers
]
num_layer_formulas = len(set(ordered_layers_formula))
repeating_formula = ordered_layers_formula
num_repetitions = 1
# depending on the number of inequivalent formulae, there is a maximum
# number of repetitions that can occur. To avoid unnecessary work we start
# from this number of repetitions and move to 1 repetition (e.g. no
# repetition)
max_repetitions = int(np.floor(len(ordered_layers) / num_layer_formulas))
for n in range(max_repetitions, 0, -1):
if (
all(
len(set(ordered_layers_formula[i::num_layer_formulas])) == 1
for i in range(n)
)
and len(ordered_layers) % n == 0
):
repeating_formula = ordered_layers_formula[: int(len(ordered_layers) / n)]
num_repetitions = n
break
intercalants = [c for c in components if c["dimensionality"] < 2]
intercalant_formulas = [get_component_formula(c) for c in intercalants]
data = {
"repeating_unit": repeating_formula,
"num_repetitions": num_repetitions,
"intercalant_formulas": intercalant_formulas,
}
if inc_intercalants:
data["intercalants"] = intercalants
if inc_ordered_components:
data["ordered_components"] = ordered_components
return data