Source code for robocrys.describe.describer

"""This module provides a class for generating descriptions of condensed structure
data.

Todo:
    * Indicate when molecules have been simplified in the mineral description.
    * Handle distortion in connected polyhedra description.
"""
from __future__ import annotations

from typing import Any

import inflect
from pymatgen.util.string import htmlify, latexify, latexify_spacegroup, unicodeify

from robocrys.describe.adapter import DescriptionAdapter
from robocrys.util import (
    dimensionality_to_shape,
    geometry_to_polyhedra,
    get_el,
    get_formatted_el,
    htmlify_spacegroup,
    polyhedra_plurals,
    unicodeify_spacegroup,
)

en = inflect.engine()


[docs]class StructureDescriber: def __init__( self, describe_mineral: bool = True, describe_component_makeup: bool = True, describe_components: bool = True, describe_symmetry_labels: bool = True, describe_oxidation_states: bool = True, describe_bond_lengths: bool = True, bond_length_decimal_places: int = 2, distorted_tol: float = 0.6, only_describe_cation_polyhedra_connectivity: bool = True, only_describe_bonds_once: bool = True, fmt: str = "raw", return_parts: bool = False, ): """A class to convert condensed structure data into text descriptions. Args: describe_mineral: Whether to describe the crystal mineral data. describe_component_makeup: Whether to describe the component makeup of the structure. describe_components: Whether to describe the component's sites. describe_symmetry_labels: Whether to include the symmetry labels when describing atomic sites. describe_oxidation_states: Whether to include oxidation states in the description. describe_bond_lengths: Whether to describe bond lengths. bond_length_decimal_places: Number of decimal places to round bond lengths. distorted_tol: The value under which the site geometry will be classified as distorted. only_describe_cation_polyhedra_connectivity: Whether to only describe cation polyhedra instead of both cation and anion polyhedra. only_describe_bonds_once: Whether only describe bond lengths once. For example, don't describe the bond lengths from Pb to I and also from I to Pb. fmt: How to format element strings, formulas and spacegroups. Options are: - "raw" (default): Don't apply special formatting (e.g. "SnO2"). - "unicode": Format super/subscripts using unicode characters (e.g. SnO₂). - "latex": Use LaTeX markup for formatting (e.g. "SnO$_2$"). - "html": Use html markup for formatting (e.g. "SnO<sub>2</sub>"). return_parts: Whether to return the individual parts of the description as a :obj:`dict`, or the whole description as a :obj:`str`. """ self.distorted_tol = distorted_tol self.describe_mineral = describe_mineral self.describe_component_makeup = describe_component_makeup self.describe_components = describe_components self.describe_symmetry_labels = describe_symmetry_labels self.describe_oxidation_state = describe_oxidation_states self.describe_bond_lengths = describe_bond_lengths self.bond_length_decimal_places = bond_length_decimal_places self.cation_polyhedra_only = only_describe_cation_polyhedra_connectivity self.only_describe_bonds_once = only_describe_bonds_once self.fmt = fmt self.return_parts = return_parts self.angle_decimal_places = 0 if fmt == "latex": self.angstrom = r"$\AA$" self.degree = r"$^{\circ}$" else: self.angstrom = "Å" self.degree = "°" self._da: DescriptionAdapter = None self._seen_bonds: set = None
[docs] def describe( self, condensed_structure: dict[str, Any] ) -> str | dict[str, str]: """Convert a condensed structure into a text description. Args: condensed_structure: The condensed structure data, formatted as produced by :meth:`StructureCondenser.condense_structure`. Returns: A description of the structure. If :attr:`StructureDescriber.return_parts` is ``False``, the description will be returned as a :obj:`str`. If it is equal to ``True``, the description will be returned as a :obj:`dict` with the keys 'mineral', 'component_makeup' and 'components', each containing the relevant part of the description. """ self._da = DescriptionAdapter(condensed_structure) self._seen_bonds = set() description = {} if self.describe_mineral: description["mineral"] = self.get_mineral_description() if self.describe_component_makeup: description["component_makeup"] = self.get_component_makeup_summary() if self.describe_components: description["components"] = self.get_all_component_descriptions() if not self.return_parts: return " ".join( description[part] for part in ["mineral", "component_makeup", "components"] if part in description and description[part] != "" ) return description
[docs] def get_mineral_description(self) -> str: """Gets the mineral name and space group description. If the structure is a perfect match for a known prototype (e.g. the distance parameter is -1, the mineral name is the prototype name. If a structure is not a perfect match but similar to a known mineral, "-like" will be added to the mineral name. If the structure is a good match to a mineral but contains a different number of element types than the mineral prototype, "-derived" will be added to the mineral name. Returns: The description of the mineral name. """ spg_symbol = self._da.spg_symbol formula = self._da.formula if self.fmt == "latex": spg_symbol = latexify_spacegroup(self._da.spg_symbol) formula = latexify(formula) elif self.fmt == "unicode": spg_symbol = unicodeify_spacegroup(self._da.spg_symbol) formula = unicodeify(formula) elif self.fmt == "html": spg_symbol = htmlify_spacegroup(self._da.spg_symbol) formula = htmlify(formula) mineral_name = get_mineral_name(self._da.mineral) if mineral_name: desc = f"{formula} is {mineral_name} structured and" else: desc = f"{formula}" desc += " crystallizes in the {} {} space group.".format( self._da.crystal_system, spg_symbol ) return desc
[docs] def get_component_makeup_summary(self) -> str: """Gets a summary of the makeup of components in a structure. Returns: A description of the number of components and their dimensionalities and orientations. """ component_groups = self._da.get_component_groups() if ( len(component_groups) == 1 and component_groups[0].count == 1 and component_groups[0].dimensionality == 3 ): desc = "" else: if self._da.dimensionality == 3: desc = "The structure consists of " else: desc = "The structure is {}-dimensional and consists of ".format( en.number_to_words(self._da.dimensionality) ) component_makeup_summaries = [] nframeworks = len( [ c for g in component_groups for c in g.components if c.dimensionality == 3 ] ) for component_group in component_groups: if nframeworks == 1 and component_group.dimensionality == 3: s_count = "a" else: s_count = en.number_to_words(component_group.count) dimensionality = component_group.dimensionality if component_group.molecule_name: shape = "atom" if component_group.nsites == 1 else "molecule" shape = en.plural(shape, s_count) formula = component_group.molecule_name else: shape = en.plural(dimensionality_to_shape[dimensionality], s_count) formula = component_group.formula if self.fmt == "latex": formula = latexify(formula) elif self.fmt == "unicode": formula = unicodeify(formula) elif self.fmt == "html": formula = htmlify(formula) comp_desc = f"{s_count} {formula} {shape}" if component_group.dimensionality in [1, 2]: orientations = list( {str(c.orientation) for c in component_group.components} ) s_direction = en.plural("direction", len(orientations)) comp_desc += " oriented in the {} {}".format( en.join(orientations), s_direction ) component_makeup_summaries.append(comp_desc) if nframeworks == 1 and len(component_makeup_summaries) > 1: # when there is a single framework, make the description read # "... and 8 Sn atoms inside a SnO2 framework" instead of # "..., 8 Sn atoms and one SnO2 framework" # This works because the component summaries are sorted by # dimensionality desc += en.join(component_makeup_summaries[:-1]) desc += f" inside {component_makeup_summaries[-1]}." else: desc += en.join(component_makeup_summaries) + "." return desc
[docs] def get_all_component_descriptions(self) -> str: """Gets the descriptions of all components in the structure. Returns: A description of all components in the structure. """ if len(self._da.components) == 1: return self.get_component_description( self._da.get_component_groups()[0].components[0].index, single_component=True, ) component_groups = self._da.get_component_groups() component_descriptions = [] for group in component_groups: for component in group.components: if group.molecule_name: # don't describe known molecules continue formula = group.formula group_count = group.count component_count = component.count shape = dimensionality_to_shape[group.dimensionality] if self.fmt == "latex": formula = latexify(formula) elif self.fmt == "unicode": formula = unicodeify(formula) elif self.fmt == "html": formula = htmlify(formula) if group_count == component_count: s_filler = "the" if group_count == 1 else "each" else: s_filler = f"{en.number_to_words(component_count)} of the" shape = en.plural(shape) desc = f"In {s_filler} {formula} {shape}, " desc += self.get_component_description(component.index) component_descriptions.append(desc) return " ".join(component_descriptions)
[docs] def get_component_description( self, component_index: int, single_component: bool = False ) -> str: """Gets the descriptions of all sites in a component. Args: component_index: The index of the component single_component: Whether the structure contains only a single component. Returns: The description for all sites in the components. """ desc = [] first_group = True for site_group in self._da.get_component_site_groups(component_index): if len(site_group.sites) == 1: desc.append(self.get_site_description(site_group.sites[0])) else: element = get_formatted_el( site_group.element, "", use_oxi_state=self.describe_oxidation_state, use_sym_label=False, fmt=self.fmt, ) s_there = "there" if first_group and not single_component else "There" s_count = en.number_to_words(len(site_group.sites)) desc.append(f"{s_there} are {s_count} inequivalent {element} sites.") for i, site in enumerate(site_group.sites): s_ordinal = en.number_to_words(en.ordinal(i + 1)) desc.append(f"In the {s_ordinal} {element} site,") desc.append(self.get_site_description(site)) first_group = False return " ".join(desc)
[docs] def get_site_description(self, site_index: int) -> str: """Gets a description of the geometry and bonding of a site. If the site likeness (order parameter) is less than ``distorted_tol``, "distorted" will be added to the geometry description. Args: site_index: An inequivalent site index. Returns: A description of the geometry and bonding of a site. """ site = self._da.sites[site_index] if site["poly_formula"] and ( self.cation_polyhedra_only or "+" in site["element"] ): desc = self._get_poly_site_description(site_index) tilt_desc = self.get_octahedral_tilt_description(site_index) if tilt_desc: desc += ". " + tilt_desc else: element = get_formatted_el( site["element"], self._da.sym_labels[site_index], use_oxi_state=self.describe_oxidation_state, use_sym_label=self.describe_symmetry_labels, fmt=self.fmt, ) if site["geometry"]["likeness"] < self.distorted_tol: s_geometry = "distorted " else: s_geometry = "" s_geometry += site["geometry"]["type"] desc = f"{element} is bonded in {en.a(s_geometry)} geometry to " desc += self._get_nearest_neighbor_description(site_index) bond_length_desc = self._get_nearest_neighbor_bond_length_descriptions( site_index ) if bond_length_desc: desc += ". " + bond_length_desc else: desc += "." return desc
def _get_poly_site_description(self, site_index: int): """Gets a description of a connected polyhedral site. If the site likeness (order parameter) is less than ``distorted_tol``, "distorted" will be added to the geometry description. Args: site_index: An inequivalent site index. Returns: A description the a polyhedral site, including connectivity. """ site = self._da.sites[site_index] nnn_details = self._da.get_next_nearest_neighbor_details( site_index, group=not self.describe_symmetry_labels ) from_element = get_formatted_el( site["element"], self._da.sym_labels[site_index], use_oxi_state=self.describe_oxidation_state, use_sym_label=self.describe_symmetry_labels, fmt=self.fmt, ) from_poly_formula = site["poly_formula"] if self.fmt == "latex": from_poly_formula = latexify(from_poly_formula) elif self.fmt == "unicode": from_poly_formula = unicodeify(from_poly_formula) elif self.fmt == "html": from_poly_formula = htmlify(from_poly_formula) s_from_poly_formula = get_el(site["element"]) + from_poly_formula if site["geometry"]["likeness"] < self.distorted_tol: s_distorted = "distorted " else: s_distorted = "" s_polyhedra = geometry_to_polyhedra[site["geometry"]["type"]] s_polyhedra = polyhedra_plurals[s_polyhedra] nn_desc = self._get_nearest_neighbor_description(site_index) desc = f"{from_element} is bonded to {nn_desc} to form " # handle the case we were are connected to the same type of polyhedra if ( nnn_details[0].element == site["element"] and len( {(nnn_site.element, nnn_site.poly_formula) for nnn_site in nnn_details} ) ) == 1: connectivities = list({nnn_site.connectivity for nnn_site in nnn_details}) s_mixture = "a mixture of " if len(connectivities) != 1 else "" s_connectivities = en.join(connectivities) desc += "{}{}{}-sharing {} {}".format( s_mixture, s_distorted, s_connectivities, s_from_poly_formula, s_polyhedra, ) return desc # otherwise loop through nnn connectivities and describe individually desc += "{}{} {} that share ".format( s_distorted, s_from_poly_formula, s_polyhedra ) nnn_descriptions = [] for nnn_site in nnn_details: to_element = get_formatted_el( nnn_site.element, nnn_site.sym_label, use_oxi_state=False, use_sym_label=self.describe_symmetry_labels, ) to_poly_formula = nnn_site.poly_formula if self.fmt == "latex": to_poly_formula = latexify(to_poly_formula) elif self.fmt == "unicode": to_poly_formula = unicodeify(to_poly_formula) elif self.fmt == "html": to_poly_formula = htmlify(to_poly_formula) to_poly_formula = to_element + to_poly_formula to_shape = geometry_to_polyhedra[nnn_site.geometry] if len(nnn_site.sites) == 1 and nnn_site.count != 1: s_equivalent = " equivalent " else: s_equivalent = " " if nnn_site.count == 1: s_an = f" {en.an(nnn_site.connectivity)}" else: s_an = "" to_shape = polyhedra_plurals[to_shape] nnn_descriptions.append( "{}{} with {}{}{} {}".format( s_an, en.plural(nnn_site.connectivity, nnn_site.count), en.number_to_words(nnn_site.count), s_equivalent, to_poly_formula, to_shape, ) ) return desc + en.join(nnn_descriptions) def _get_nearest_neighbor_description(self, site_index: int) -> str: """Gets a description of a sites nearest neighbors. Note: This function is intended to be run directly after :meth:`get_site_description`, as the output will not form a complete sentence on its own. Args: site_index: An inequivalent site index. Returns: A description of the nearest neighbors. """ nn_details = self._da.get_nearest_neighbor_details( site_index, group=not self.describe_symmetry_labels ) last_count = 0 nn_descriptions = [] for nn_site in nn_details: element = get_formatted_el( nn_site.element, nn_site.sym_label, use_oxi_state=self.describe_oxidation_state, use_sym_label=self.describe_symmetry_labels, fmt=self.fmt, ) if len(nn_site.sites) == 1 and nn_site.count != 1: s_equivalent = " equivalent " else: s_equivalent = " " nn_descriptions.append( "{}{}{}".format( en.number_to_words(nn_site.count), s_equivalent, element ) ) last_count = nn_site.count s_atoms = "atom" if last_count == 1 else "atoms" return f"{en.join(nn_descriptions)} {s_atoms}" def _get_nearest_neighbor_bond_length_descriptions(self, site_index: int) -> str: """Gets the descriptions of the bond lengths for nearest neighbor sites. Args: site_index: An inequivalent site index. Returns: A description of the nearest neighbor bond lengths. """ if not self.describe_bond_lengths: return "" nn_details = self._da.get_nearest_neighbor_details( site_index, group=not self.describe_symmetry_labels ) bond_descriptions = [] for nn_site in nn_details: bond_descriptions.append( self.get_bond_length_description(site_index, nn_site.sites) ) # filter empty bond length description strings return " ".join(filter(lambda x: x, bond_descriptions))
[docs] def get_bond_length_description(self, from_site: int, to_sites: list[int]) -> str: """Gets a description of the bond lengths between two sets of sites. Args: from_site: An inequivalent site index. to_sites: A :obj:`list` of site indices. The site indices should all be for the same element. Returns: A description of the bond lengths or an empty string if :attr:`StructureDescriber.only_describe_bonds_once` is ``True`` and all all bond lengths have already been described. """ if self.only_describe_bonds_once: to_sites = self._filter_seen_bonds(from_site, to_sites) if not to_sites: return "" from_element = get_formatted_el( self._da.elements[from_site], self._da.sym_labels[from_site], use_oxi_state=False, use_sym_label=self.describe_symmetry_labels, ) to_element = get_formatted_el( self._da.elements[to_sites[0]], self._da.get_sym_label(to_sites), use_oxi_state=False, use_sym_label=self.describe_symmetry_labels, ) dists = self._da.get_distance_details(from_site, to_sites) # if only one bond length if len(dists) == 1: return "The {}-{} bond length is {}.".format( from_element, to_element, self._distance_to_string(dists[0]) ) discrete_bond_lengths = self._rounded_bond_lengths(dists) # if multiple bond lengths but they are all the same if len(set(discrete_bond_lengths)) == 1: s_intro = "Both" if len(discrete_bond_lengths) == 2 else "All" return "{} {}-{} bond lengths are {}.".format( s_intro, from_element, to_element, self._distance_to_string(dists[0]) ) # if two sets of bond lengths if len(set(discrete_bond_lengths)) == 2: small = min(discrete_bond_lengths) s_small_count = en.number_to_words(discrete_bond_lengths.count(small)) big = max(discrete_bond_lengths) s_big_count = en.number_to_words(discrete_bond_lengths.count(big)) s_length = en.plural("length", s_big_count) return ( "There {} {} shorter ({}) and {} longer ({}) {}-{} bond {}." ).format( en.plural_verb("is", s_small_count), s_small_count, self._distance_to_string(small), s_big_count, self._distance_to_string(big), from_element, to_element, s_length, ) # otherwise just detail the spread of bond lengths return ("There are a spread of {}-{} bond distances ranging from {}.").format( from_element, to_element, self._distance_range_to_string( min(discrete_bond_lengths), max(discrete_bond_lengths) ), )
[docs] def get_octahedral_tilt_description( self, site_index: int, ) -> str: """Gets a description of octahedral tilting angles between two sites. Currently only implemented for corner-sharing octahedra. Will throw an error if the two sites are not next nearest neighbors. Args: site_index: An inequivalent site index. Returns: A description of the octahedral tilting angles. """ nnn_details = self._da.get_next_nearest_neighbor_details( site_index, group=not self.describe_symmetry_labels ) to_sites = [ site for nnn_site in nnn_details for site in nnn_site.sites if nnn_site.geometry == "octahedral" and nnn_site.connectivity == "corner" ] angles = self._da.get_angle_details(site_index, to_sites, "corner") discrete_angles = list(set(self._rounded_angles(angles))) tilts = [abs(180 - angle) for angle in discrete_angles] if not tilts: return "" # if only one bond length if len(tilts) == 1: if tilts[0] == 0: return "The corner-sharing octahedra are not tilted" return "The corner-sharing octahedral tilt angles are {}".format( self._angle_to_string(tilts[0]) ) # otherwise just detail the spread of bond lengths return "The corner-sharing octahedral tilt angles range from {}".format( self._angle_range_to_string(min(tilts), max(tilts)) )
def _filter_seen_bonds(self, from_site: int, to_sites: list[int]) -> list[int]: """Filter the list of to_sites to only include unseen bonds. Args: from_site: An inequivalent site index. to_sites: A :obj:`list` of site indices. The site indices should all be for the same element. Returns: The list of unseen bonds. """ # get a list of tuples of (from_site, to_site) for all to_sites bonds = [(f, t) for f, t in zip([from_site] * len(to_sites), to_sites)] # use frozen set as it is invariant to the order of the sites not_seen = [x for x in bonds if frozenset(x) not in self._seen_bonds] # only describe the bonds between unseen site pairs not_seen_to_sites = [] for from_site, to_site in not_seen: not_seen_to_sites.append(to_site) self._seen_bonds.add(frozenset((from_site, to_site))) return not_seen_to_sites def _rounded_bond_lengths(self, data: list[float]) -> tuple[float]: """Function to round bond lengths to a number of decimal places.""" return tuple( float("{:.{}f}".format(x, self.bond_length_decimal_places)) for x in data ) def _distance_to_string(self, distance: float) -> str: """Utility function to round a distance and add an Angstrom symbol.""" return "{:.{}f} {}".format( distance, self.bond_length_decimal_places, self.angstrom ) def _distance_range_to_string(self, dist_a: float, dist_b: float) -> str: """Utility function to format a range of distances.""" return "{:.{}f}-{:.{}f} {}".format( dist_a, self.bond_length_decimal_places, dist_b, self.bond_length_decimal_places, self.angstrom, ) def _rounded_angles(self, data: list[float]) -> tuple[float]: """Function to round angles to a number of decimal places.""" return tuple( float("{:.{}f}".format(x, self.angle_decimal_places)) for x in data ) def _angle_to_string(self, angle: float) -> str: """Utility function to round a distance and add an Angstrom symbol.""" return "{:.{}f}{}".format(angle, self.angle_decimal_places, self.degree) def _angle_range_to_string(self, angle_a: float, angle_b: float) -> str: """Utility function to format a range of distances.""" return "{:.{}f}-{:.{}f}{}".format( angle_a, self.angle_decimal_places, angle_b, self.angle_decimal_places, self.degree, )
[docs]def get_mineral_name(mineral_dict: dict[str, Any]) -> str | None: """Get the mineral name from a mineral dictionary. Args: mineral_dict: The mineral dictionary from the condensed description. Returns: If ``mineral_dict["type"]`` is set, the mineral name will be returned as a string, else ``None`` will be returned. """ if mineral_dict["type"]: if not mineral_dict["n_species_type_match"]: suffix = "-derived" elif mineral_dict["distance"] >= 0: suffix = "-like" else: suffix = "" return "{}{}".format(mineral_dict["type"], suffix) return None