diff --git a/molecode/markush/mermaid_to_rdkit.py b/molecode/markush/mermaid_to_rdkit.py index b8a558d..394bfa2 100644 --- a/molecode/markush/mermaid_to_rdkit.py +++ b/molecode/markush/mermaid_to_rdkit.py @@ -28,6 +28,7 @@ class MermaidMolParser: '---': Chem.BondType.SINGLE, '===': Chem.BondType.DOUBLE, '-.-': Chem.BondType.TRIPLE, + '<-->': Chem.BondType.AROMATIC, '-->': Chem.BondType.DATIVE, # 配位键 } @@ -88,7 +89,7 @@ def _parse_line(self, line: str): # 尝试匹配普通键连接: atom1 bond_type atom2 # 原子ID可能包含手性后缀 (_R 或 _S) - bond_pattern = r'([\w_]+)\s*(---|\===|-\.-|-->)\s*([\w_]+)' + bond_pattern = r'([\w_]+)\s*(<-->|---|\===|-\.-|-->)\s*([\w_]+)' bond_match = re.search(bond_pattern, line) if bond_match: @@ -241,17 +242,10 @@ def _build_mol(self) -> Optional[Chem.Mol]: atom_obj = mol.GetAtomWithIdx(idx) atom_obj.SetProp("_abbreviation", self.abbreviations[atom_id]) - # 设置手性(如果有) - if atom_id in self.chirality: - chirality_type = self.chirality[atom_id] - atom_obj = mol.GetAtomWithIdx(idx) - - if chirality_type == 'R': - atom_obj.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CW) - elif chirality_type == 'S': - atom_obj.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CCW) - - # 添加键 + # 添加键。双键 E/Z 和四面体 R/S 都需要在完整拓扑存在后设置, + # 因此这里先记录,待 SanitizeMol 后统一恢复。 + stereo_bonds = [] + aromatic_atom_idxs = set() for bond_info in self.bonds: if len(bond_info) == 3: # 普通键: (atom1_id, atom2_id, bond_type_str) @@ -270,40 +264,16 @@ def _build_mol(self) -> Optional[Chem.Mol]: mol.AddBond(idx1, idx2, bond_type) - # 设置立体化学(稍后统一处理,需要先添加所有键) + if bond_type_str == '<-->': + aromatic_atom_idxs.update((idx1, idx2)) + bond = mol.GetBondBetweenAtoms(idx1, idx2) + bond.SetIsAromatic(True) + if stereo_type: - # 记录需要设置立体化学的键 - if not hasattr(mol, '_stereo_bonds'): - mol._stereo_bonds = [] - mol._stereo_bonds.append((idx1, idx2, stereo_type)) - - # 在转换为不可编辑的Mol之前,设置立体化学 - if hasattr(mol, '_stereo_bonds'): - for idx1, idx2, stereo_type in mol._stereo_bonds: - bond = mol.GetBondBetweenAtoms(idx1, idx2) - - # 获取双键两端原子的邻接原子(用于定义立体化学) - atom1 = mol.GetAtomWithIdx(idx1) - atom2 = mol.GetAtomWithIdx(idx2) - - # 找到idx1的邻居(除了idx2) - neighbors1 = [n.GetIdx() for n in atom1.GetNeighbors() if n.GetIdx() != idx2] - # 找到idx2的邻居(除了idx1) - neighbors2 = [n.GetIdx() for n in atom2.GetNeighbors() if n.GetIdx() != idx1] - - # 如果两端都有邻居,设置立体化学 - if neighbors1 and neighbors2: - # 使用第一个邻居作为参考原子 - bond.SetStereoAtoms(neighbors1[0], neighbors2[0]) - - if stereo_type == 'E': - bond.SetStereo(Chem.BondStereo.STEREOE) - elif stereo_type == 'Z': - bond.SetStereo(Chem.BondStereo.STEREOZ) - elif stereo_type == 'CIS': - bond.SetStereo(Chem.BondStereo.STEREOCIS) - elif stereo_type == 'TRANS': - bond.SetStereo(Chem.BondStereo.STEREOTRANS) + stereo_bonds.append((idx1, idx2, stereo_type)) + + for idx in aromatic_atom_idxs: + mol.GetAtomWithIdx(idx).SetIsAromatic(True) # 转换为不可编辑的Mol对象 mol = mol.GetMol() @@ -322,9 +292,79 @@ def _build_mol(self) -> Optional[Chem.Mol]: # 完全失败,返回未清理的版本 pass + self._assign_chirality_from_ids(mol, atom_id_to_idx) + self._assign_double_bond_stereo(mol, stereo_bonds) + return mol + def _assign_chirality_from_ids(self, mol: Chem.Mol, atom_id_to_idx: Dict[str, int]): + """根据 atom id 的 _R/_S 后缀恢复绝对 CIP 手性。""" + if not self.chirality: + return + + for atom_id, desired_cip in self.chirality.items(): + idx = atom_id_to_idx.get(atom_id) + if idx is None: + continue + + atom = mol.GetAtomWithIdx(idx) + matched = False + + for chiral_tag in ( + Chem.ChiralType.CHI_TETRAHEDRAL_CW, + Chem.ChiralType.CHI_TETRAHEDRAL_CCW, + ): + atom.SetChiralTag(chiral_tag) + try: + Chem.AssignStereochemistry(mol, cleanIt=True, force=True) + except Exception: + continue + + if atom.HasProp('_CIPCode') and atom.GetProp('_CIPCode') == desired_cip: + matched = True + break + + if not matched: + atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED) + + try: + Chem.AssignStereochemistry(mol, cleanIt=False, force=True) + except Exception: + pass + + def _assign_double_bond_stereo(self, mol: Chem.Mol, stereo_bonds: List[Tuple[int, int, str]]): + """恢复 ===|E| / ===|Z| 双键构型。""" + for idx1, idx2, stereo_type in stereo_bonds: + bond = mol.GetBondBetweenAtoms(idx1, idx2) + if bond is None: + continue + + atom1 = mol.GetAtomWithIdx(idx1) + atom2 = mol.GetAtomWithIdx(idx2) + neighbors1 = [n.GetIdx() for n in atom1.GetNeighbors() if n.GetIdx() != idx2] + neighbors2 = [n.GetIdx() for n in atom2.GetNeighbors() if n.GetIdx() != idx1] + + if not neighbors1 or not neighbors2: + continue + + bond.SetStereoAtoms(neighbors1[0], neighbors2[0]) + + if stereo_type == 'E': + bond.SetStereo(Chem.BondStereo.STEREOE) + elif stereo_type == 'Z': + bond.SetStereo(Chem.BondStereo.STEREOZ) + elif stereo_type == 'CIS': + bond.SetStereo(Chem.BondStereo.STEREOCIS) + elif stereo_type == 'TRANS': + bond.SetStereo(Chem.BondStereo.STEREOTRANS) + + try: + Chem.AssignStereochemistry(mol, cleanIt=False, force=True) + except Exception: + pass + + def has_invalid_atoms(mol: Chem.Mol) -> bool: """ 检查分子是否包含无效原子(Dummy Atom) diff --git a/molecode/markush/rdkit_to_mermaid.py b/molecode/markush/rdkit_to_mermaid.py index f1b595d..53bcd1f 100644 --- a/molecode/markush/rdkit_to_mermaid.py +++ b/molecode/markush/rdkit_to_mermaid.py @@ -163,18 +163,16 @@ def _generate_atom_id(self, atom: Chem.Atom) -> str: # 基础ID base_id = f"{clean_name}_{symbol}_{count}" - # 检测手性并添加后缀 - chiral_tag = atom.GetChiralTag() - - if chiral_tag == Chem.ChiralType.CHI_TETRAHEDRAL_CW: - # 顺时针 (R构型) - return f"{base_id}_R" - elif chiral_tag == Chem.ChiralType.CHI_TETRAHEDRAL_CCW: - # 逆时针 (S构型) - return f"{base_id}_S" - else: - # 无手性或未指定 - return base_id + # 使用 RDKit 计算出的绝对 CIP 构型,而不是直接把 + # CHI_TETRAHEDRAL_CW/CCW 当作 R/S。CW/CCW 依赖原子顺序, + # 只有 _CIPCode 才是可序列化的绝对 R/S 标签。 + if atom.HasProp('_CIPCode'): + cip_code = atom.GetProp('_CIPCode') + if cip_code in ('R', 'S'): + return f"{base_id}_{cip_code}" + + # 无手性或未指定 + return base_id def _generate_atom_label(self, atom: Chem.Atom) -> str: """ diff --git a/molecode/molecule/mermaid_to_rdkit.py b/molecode/molecule/mermaid_to_rdkit.py index df5e835..f573ea9 100644 --- a/molecode/molecule/mermaid_to_rdkit.py +++ b/molecode/molecule/mermaid_to_rdkit.py @@ -28,6 +28,7 @@ class MermaidMolParser: '---': Chem.BondType.SINGLE, '===': Chem.BondType.DOUBLE, '-.-': Chem.BondType.TRIPLE, + '<-->': Chem.BondType.AROMATIC, '-->': Chem.BondType.DATIVE, # 配位键 } @@ -86,7 +87,7 @@ def _parse_line(self, line: str): # 尝试匹配普通键连接: atom1 bond_type atom2 # 原子ID可能包含手性后缀 (_R 或 _S) - bond_pattern = r'([\w_]+)\s*(---|\===|-\.-|-->)\s*([\w_]+)' + bond_pattern = r'([\w_]+)\s*(<-->|---|\===|-\.-|-->)\s*([\w_]+)' bond_match = re.search(bond_pattern, line) if bond_match: @@ -215,17 +216,10 @@ def _build_mol(self) -> Optional[Chem.Mol]: idx = mol.AddAtom(atom) atom_id_to_idx[atom_id] = idx - # 设置手性(如果有) - if atom_id in self.chirality: - chirality_type = self.chirality[atom_id] - atom_obj = mol.GetAtomWithIdx(idx) - - if chirality_type == 'R': - atom_obj.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CW) - elif chirality_type == 'S': - atom_obj.SetChiralTag(Chem.ChiralType.CHI_TETRAHEDRAL_CCW) - - # 添加键 + # 添加键。双键 E/Z 和四面体 R/S 都需要在完整拓扑存在后设置, + # 因此这里先记录,待 SanitizeMol 后统一恢复。 + stereo_bonds = [] + aromatic_atom_idxs = set() for bond_info in self.bonds: if len(bond_info) == 3: # 普通键: (atom1_id, atom2_id, bond_type_str) @@ -244,40 +238,16 @@ def _build_mol(self) -> Optional[Chem.Mol]: mol.AddBond(idx1, idx2, bond_type) - # 设置立体化学(稍后统一处理,需要先添加所有键) + if bond_type_str == '<-->': + aromatic_atom_idxs.update((idx1, idx2)) + bond = mol.GetBondBetweenAtoms(idx1, idx2) + bond.SetIsAromatic(True) + if stereo_type: - # 记录需要设置立体化学的键 - if not hasattr(mol, '_stereo_bonds'): - mol._stereo_bonds = [] - mol._stereo_bonds.append((idx1, idx2, stereo_type)) - - # 在转换为不可编辑的Mol之前,设置立体化学 - if hasattr(mol, '_stereo_bonds'): - for idx1, idx2, stereo_type in mol._stereo_bonds: - bond = mol.GetBondBetweenAtoms(idx1, idx2) - - # 获取双键两端原子的邻接原子(用于定义立体化学) - atom1 = mol.GetAtomWithIdx(idx1) - atom2 = mol.GetAtomWithIdx(idx2) - - # 找到idx1的邻居(除了idx2) - neighbors1 = [n.GetIdx() for n in atom1.GetNeighbors() if n.GetIdx() != idx2] - # 找到idx2的邻居(除了idx1) - neighbors2 = [n.GetIdx() for n in atom2.GetNeighbors() if n.GetIdx() != idx1] - - # 如果两端都有邻居,设置立体化学 - if neighbors1 and neighbors2: - # 使用第一个邻居作为参考原子 - bond.SetStereoAtoms(neighbors1[0], neighbors2[0]) - - if stereo_type == 'E': - bond.SetStereo(Chem.BondStereo.STEREOE) - elif stereo_type == 'Z': - bond.SetStereo(Chem.BondStereo.STEREOZ) - elif stereo_type == 'CIS': - bond.SetStereo(Chem.BondStereo.STEREOCIS) - elif stereo_type == 'TRANS': - bond.SetStereo(Chem.BondStereo.STEREOTRANS) + stereo_bonds.append((idx1, idx2, stereo_type)) + + for idx in aromatic_atom_idxs: + mol.GetAtomWithIdx(idx).SetIsAromatic(True) # 转换为不可编辑的Mol对象 mol = mol.GetMol() @@ -296,9 +266,82 @@ def _build_mol(self) -> Optional[Chem.Mol]: # 完全失败,返回未清理的版本 pass + self._assign_chirality_from_ids(mol, atom_id_to_idx) + self._assign_double_bond_stereo(mol, stereo_bonds) + return mol + def _assign_chirality_from_ids(self, mol: Chem.Mol, atom_id_to_idx: Dict[str, int]): + """根据 atom id 的 _R/_S 后缀恢复绝对 CIP 手性。""" + if not self.chirality: + return + + for atom_id, desired_cip in self.chirality.items(): + idx = atom_id_to_idx.get(atom_id) + if idx is None: + continue + + atom = mol.GetAtomWithIdx(idx) + matched = False + + for chiral_tag in ( + Chem.ChiralType.CHI_TETRAHEDRAL_CW, + Chem.ChiralType.CHI_TETRAHEDRAL_CCW, + ): + atom.SetChiralTag(chiral_tag) + try: + Chem.AssignStereochemistry(mol, cleanIt=True, force=True) + except Exception: + continue + + if atom.HasProp('_CIPCode') and atom.GetProp('_CIPCode') == desired_cip: + matched = True + break + + if not matched: + atom.SetChiralTag(Chem.ChiralType.CHI_UNSPECIFIED) + + try: + Chem.AssignStereochemistry(mol, cleanIt=False, force=True) + except Exception: + pass + + def _assign_double_bond_stereo(self, mol: Chem.Mol, stereo_bonds: List[Tuple[int, int, str]]): + """恢复 ===|E| / ===|Z| 双键构型。""" + for idx1, idx2, stereo_type in stereo_bonds: + bond = mol.GetBondBetweenAtoms(idx1, idx2) + if bond is None: + continue + + # 获取双键两端原子的邻接原子(用于定义立体化学) + atom1 = mol.GetAtomWithIdx(idx1) + atom2 = mol.GetAtomWithIdx(idx2) + + neighbors1 = [n.GetIdx() for n in atom1.GetNeighbors() if n.GetIdx() != idx2] + neighbors2 = [n.GetIdx() for n in atom2.GetNeighbors() if n.GetIdx() != idx1] + + if not neighbors1 or not neighbors2: + continue + + bond.SetStereoAtoms(neighbors1[0], neighbors2[0]) + + if stereo_type == 'E': + bond.SetStereo(Chem.BondStereo.STEREOE) + elif stereo_type == 'Z': + bond.SetStereo(Chem.BondStereo.STEREOZ) + elif stereo_type == 'CIS': + bond.SetStereo(Chem.BondStereo.STEREOCIS) + elif stereo_type == 'TRANS': + bond.SetStereo(Chem.BondStereo.STEREOTRANS) + + try: + # 不使用 cleanIt=True,避免清掉刚刚从 EGL 明确恢复的 E/Z 标记。 + Chem.AssignStereochemistry(mol, cleanIt=False, force=True) + except Exception: + pass + + def has_invalid_atoms(mol: Chem.Mol) -> bool: """ 检查分子是否包含无效原子(Dummy Atom) diff --git a/molecode/molecule/rdkit_to_mermaid.py b/molecode/molecule/rdkit_to_mermaid.py index 0560844..675ee89 100644 --- a/molecode/molecule/rdkit_to_mermaid.py +++ b/molecode/molecule/rdkit_to_mermaid.py @@ -158,18 +158,16 @@ def _generate_atom_id(self, atom: Chem.Atom) -> str: # 基础ID base_id = f"{clean_name}_{symbol}_{count}" - # 检测手性并添加后缀 - chiral_tag = atom.GetChiralTag() - - if chiral_tag == Chem.ChiralType.CHI_TETRAHEDRAL_CW: - # 顺时针 (R构型) - return f"{base_id}_R" - elif chiral_tag == Chem.ChiralType.CHI_TETRAHEDRAL_CCW: - # 逆时针 (S构型) - return f"{base_id}_S" - else: - # 无手性或未指定 - return base_id + # 使用 RDKit 计算出的绝对 CIP 构型,而不是直接把 + # CHI_TETRAHEDRAL_CW/CCW 当作 R/S。CW/CCW 依赖原子顺序, + # 只有 _CIPCode 才是可序列化的绝对 R/S 标签。 + if atom.HasProp('_CIPCode'): + cip_code = atom.GetProp('_CIPCode') + if cip_code in ('R', 'S'): + return f"{base_id}_{cip_code}" + + # 无手性或未指定 + return base_id def _generate_atom_label(self, atom: Chem.Atom) -> str: """ diff --git a/molecode/prompts/molecule_system_prompt.py b/molecode/prompts/molecule_system_prompt.py index d97813f..d8dcdc1 100644 --- a/molecode/prompts/molecule_system_prompt.py +++ b/molecode/prompts/molecule_system_prompt.py @@ -1,4 +1,4 @@ -BASE_INSTRUCTION = ''' +BASE_INSTRUCTION = r''' # Mermaid Molecular Graph Syntax Specification > Complete syntax definition and parser requirements for Mermaid molecular graphs based on the MolToGraph project diff --git a/tests/test_molecule_stereochemistry.py b/tests/test_molecule_stereochemistry.py new file mode 100644 index 0000000..2221156 --- /dev/null +++ b/tests/test_molecule_stereochemistry.py @@ -0,0 +1,53 @@ +from rdkit import Chem + +from molecode.molecule import mermaid_to_mol, mol_to_mermaid + + +def _canonical_isomeric_smiles(smiles: str) -> str: + mol = Chem.MolFromSmiles(smiles) + assert mol is not None + return Chem.MolToSmiles(mol, isomericSmiles=True) + + +def _round_trip_smiles(smiles: str, *, kekulize: bool = True) -> tuple[str, str]: + mol = Chem.MolFromSmiles(smiles) + assert mol is not None + graph = mol_to_mermaid(mol, name="Test", kekulize=kekulize) + recovered = mermaid_to_mol(graph) + assert recovered is not None + return graph, Chem.MolToSmiles(recovered, isomericSmiles=True) + + +def test_aromatic_bond_round_trip_when_not_kekulized(): + graph, recovered = _round_trip_smiles("c1ccccc1", kekulize=False) + + assert "<-->" in graph + assert recovered == _canonical_isomeric_smiles("c1ccccc1") + + +def test_double_bond_e_stereo_round_trip(): + graph, recovered = _round_trip_smiles("F/C=C/F") + + assert "===|E|" in graph + assert recovered == _canonical_isomeric_smiles("F/C=C/F") + + +def test_double_bond_z_stereo_round_trip(): + graph, recovered = _round_trip_smiles("F/C=C\\F") + + assert "===|Z|" in graph + assert recovered == _canonical_isomeric_smiles("F/C=C\\F") + + +def test_tetrahedral_r_stereo_round_trip_uses_absolute_cip_label(): + graph, recovered = _round_trip_smiles("C[C@H](O)Cl") + + assert "_R[CH]" in graph + assert recovered == _canonical_isomeric_smiles("C[C@H](O)Cl") + + +def test_tetrahedral_s_stereo_round_trip_uses_absolute_cip_label(): + graph, recovered = _round_trip_smiles("C[C@@H](O)Cl") + + assert "_S[CH]" in graph + assert recovered == _canonical_isomeric_smiles("C[C@@H](O)Cl")