Pseudo Code Generator | JavaScript

MindMapExtractor Class for Text Structure Extraction

The MindMapExtractor class is designed for extracting structured mind maps from text sections using machine learning, featuring error handling, threading for efficiency, and functions for data merging and transformation.


Empty image or helper icon

Prompt

Copyright 2024 The InfiniFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");

you may not use this file except in compliance with the License.

You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an "AS IS" BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.

import collections
import logging
import re
import traceback
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from functools import reduce
from typing import Any

import markdown_to_json

from graphrag.mind_map_prompt import MIND_MAP_EXTRACTION_PROMPT
from graphrag.utils import ErrorHandlerFn, perform_variable_replacements
from rag.llm.chat_model import Base as CompletionLLM
from rag.utils import num_tokens_from_string

@dataclass
class MindMapResult:
    """Unipartite Mind Graph result class definition."""

    # The mind-map tree, or {"error": <message>} when extraction failed.
    output: dict

class MindMapExtractor:
    """Build a mind-map tree from text sections with the help of a chat LLM.

    Sections are grouped into batches that fit a token budget, every batch is
    sent to the LLM on a worker thread, each markdown reply is parsed into
    nested dictionaries, and the per-batch dictionaries are merged into one
    ``{"id": ..., "children": [...]}`` tree.
    """

    _llm: CompletionLLM
    _input_text_key: str
    _mind_map_prompt: str
    _on_error: ErrorHandlerFn

    def __init__(
            self,
            llm_invoker: CompletionLLM,
            prompt: str | None = None,
            input_text_key: str | None = None,
            on_error: ErrorHandlerFn | None = None,
    ):
        """Store the LLM and the optional overrides.

        Args:
            llm_invoker: Chat model used to generate the mind map.
            prompt: Prompt template; defaults to MIND_MAP_EXTRACTION_PROMPT.
            input_text_key: Name of the template variable that receives the
                batch text; defaults to "input_text".
            on_error: Callback invoked with (exception, traceback string,
                None) when extraction fails; defaults to a no-op.
        """
        # TODO: streamline construction
        self._llm = llm_invoker
        self._input_text_key = input_text_key or "input_text"
        self._mind_map_prompt = prompt or MIND_MAP_EXTRACTION_PROMPT
        self._on_error = on_error or (lambda _e, _s, _d: None)

    def _key(self, k):
        """Return ``k`` with every run of asterisks removed."""
        return re.sub(r"\*+", "", k)

    def _be_children(self, obj: dict | list | str, keyset: set):
        """Convert a parsed subtree into a list of ``{"id", "children"}`` nodes.

        Args:
            obj: A leaf string, a list of leaf strings, or a mapping of
                key -> subtree.
            keyset: Ids already used in the tree; updated in place. Mapping
                keys whose normalized id is empty or already present are
                skipped.

        Returns:
            The list of child nodes built from ``obj``.
        """
        if isinstance(obj, str):
            obj = [obj]
        if isinstance(obj, list):
            # Register the same normalized ids that are emitted (not the raw
            # strings) so later duplicate checks compare like with like, and
            # drop ids that are empty once the asterisks are stripped, just as
            # the mapping branch below does.
            leaf_ids = [leaf_id for leaf_id in (self._key(item) for item in obj) if leaf_id]
            keyset.update(leaf_ids)
            return [{"id": leaf_id, "children": []} for leaf_id in leaf_ids]
        children = []
        for raw_key, subtree in obj.items():
            node_id = self._key(raw_key)
            if not node_id or node_id in keyset:
                continue
            keyset.add(node_id)
            children.append({
                "id": node_id,
                "children": self._be_children(subtree, keyset)
            })
        return children

    def __call__(
            self, sections: list[str], prompt_variables: dict[str, Any] | None = None
    ) -> MindMapResult:
        """Extract a mind map from ``sections``.

        Args:
            sections: Text sections, concatenated in order into batches.
            prompt_variables: Extra variables substituted into the prompt.

        Returns:
            A MindMapResult whose ``output`` is the tree, ``{"root": {}}``
            when there was nothing to process, or ``{"error": <message>}``
            when any step raised (the error is logged and passed to the
            ``on_error`` callback instead of propagating).
        """
        if prompt_variables is None:
            prompt_variables = {}

        try:
            # Per-batch budget: the larger of 80% of the model's max length
            # and the max length minus 512.
            token_count = max(self._llm.max_length * 0.8, self._llm.max_length - 512)
            texts = []
            cnt = 0
            # The context manager shuts the pool down once all results are
            # collected, so worker threads are not left behind.
            with ThreadPoolExecutor(max_workers=12) as exe:
                futures = []
                for section in sections:
                    section_cnt = num_tokens_from_string(section)
                    # Flush the current batch before this section would push
                    # it to the budget; a single oversized section still gets
                    # a batch of its own because of the ``texts`` check.
                    if cnt + section_cnt >= token_count and texts:
                        futures.append(exe.submit(self._process_document, "".join(texts), prompt_variables))
                        texts = []
                        cnt = 0
                    texts.append(section)
                    cnt += section_cnt
                if texts:
                    futures.append(exe.submit(self._process_document, "".join(texts), prompt_variables))

                # Collect in submission order; a failed task re-raises here.
                res = [future.result() for future in futures]

            if not res:
                return MindMapResult(output={"root": {}})

            merge_json = reduce(self._merge, res)
            if len(merge_json) > 1:
                # Several top-level keys: hang the dictionary-valued ones
                # under a synthetic "root" node, pre-registering their ids so
                # they are not repeated deeper in the tree.
                keyset = {
                    self._key(k) for k, v in merge_json.items()
                    if isinstance(v, dict) and self._key(k)
                }
                merge_json = {
                    "id": "root",
                    "children": [
                        {"id": self._key(k), "children": self._be_children(v, keyset)}
                        for k, v in merge_json.items()
                        if isinstance(v, dict) and self._key(k)
                    ],
                }
            else:
                # Single top-level key: it becomes the root of the tree.
                first_key, first_value = list(merge_json.items())[0]
                k = self._key(first_key)
                merge_json = {"id": k, "children": self._be_children(first_value, {k})}

        except Exception as e:
            logging.exception("error mind graph")
            self._on_error(
                e,
                traceback.format_exc(), None
            )
            merge_json = {"error": str(e)}

        return MindMapResult(output=merge_json)

    def _merge(self, d1, d2):
        """Merge ``d1`` into ``d2`` in place and return ``d2``.

        For keys present in both: dictionaries are merged recursively, lists
        from ``d1`` are appended to the list in ``d2``, and any other pair is
        resolved in favour of ``d1``. Keys only in ``d1`` are copied over.
        """
        for k, incoming in d1.items():
            if k not in d2:
                d2[k] = incoming
                continue
            existing = d2[k]
            if isinstance(incoming, dict) and isinstance(existing, dict):
                self._merge(incoming, existing)
            elif isinstance(incoming, list) and isinstance(existing, list):
                existing.extend(incoming)
            else:
                d2[k] = incoming

        return d2

    def _list_to_kv(self, data):
        """Replace list values in ``data`` with dictionaries, in place.

        Within a list, an element that is itself a list is paired with the
        element just before it: the preceding element becomes a key and the
        first item of the nested list becomes its value. All other list
        elements are dropped. Dictionary values are processed recursively.

        Returns:
            The same ``data`` object.
        """
        for key, value in data.items():
            if isinstance(value, dict):
                self._list_to_kv(value)
            elif isinstance(value, list):
                converted = {}
                for idx, item in enumerate(value):
                    # A nested list in first position has no preceding
                    # element; without the idx guard, value[idx - 1] would
                    # wrap around to the last element of the list.
                    if idx > 0 and isinstance(item, list):
                        converted[value[idx - 1]] = item[0]
                data[key] = converted
        return data

    def _todict(self, layer: collections.OrderedDict):
        """Convert a parsed layer to a plain dict and normalise its lists.

        An OrderedDict is copied into a ``dict``; each value is then passed
        through ``_todict`` and the result is handed to ``_list_to_kv``.
        """
        to_ret = layer
        if isinstance(layer, collections.OrderedDict):
            to_ret = dict(layer)

        try:
            for key, value in to_ret.items():
                to_ret[key] = self._todict(value)
        except AttributeError:
            # NOTE(review): this also catches the AttributeError raised by a
            # nested call on a non-dict value (its _list_to_kv call fails on
            # ``.items()``), which ends this loop at the first such value.
            # The _list_to_kv call below still walks every nested dict.
            pass

        return self._list_to_kv(to_ret)

    def _process_document(
            self, text: str, prompt_variables: dict[str, str]
    ) -> dict:
        """Ask the LLM for a mind map of ``text`` and parse the reply.

        Args:
            text: Concatenated section text for one batch.
            prompt_variables: Extra variables substituted into the prompt.

        Returns:
            The reply parsed by ``markdown_to_json.dictify`` and normalised
            by ``_todict``.
        """
        variables = {
            **prompt_variables,
            self._input_text_key: text,
        }
        prompt = perform_variable_replacements(self._mind_map_prompt, variables=variables)
        gen_conf = {"temperature": 0.5}
        # Second argument is an empty list (presumably the chat history).
        response = self._llm.chat(prompt, [], gen_conf)
        # Strip code-fence markers together with the rest of their line.
        response = re.sub(r"```[^\n]*", "", response)
        logging.debug("%s", response)
        # Parse once and reuse the result for both logging and the return.
        mind_map = self._todict(markdown_to_json.dictify(response))
        logging.debug("---------------------------------------------------\n%s", mind_map)
        return mind_map

Answer

Pseudo Code for MindMapExtractor Class

Overview

The MindMapExtractor class is designed to extract a mind map structure from a list of text sections using a machine learning model. It includes error handling, threading for concurrent processes, and functions for merging and transforming the extracted data.

Class Definition

Class MindMapResult:
    Attributes:
        output: Dictionary
Class MindMapExtractor:
    Attributes:
        _llm: CompletionLLM 
        _input_text_key: String
        _mind_map_prompt: String
        _on_error: ErrorHandlerFn

    Method __init__(llm_invoker, prompt=None, input_text_key=None, on_error=None):
        Initialize attributes with provided parameters
        If input_text_key is None, set default to "input_text"
        If prompt is None, set default to MIND_MAP_EXTRACTION_PROMPT
        If on_error is None, set default to a lambda that does nothing

    Method _key(k):
        Remove asterisks from the key and return it

    Method _be_children(obj, keyset):
        If obj is a string:
            Convert obj to a list with a single element
        If obj is a list:
            For each item in obj:
                Add item to keyset
            Return list of dictionaries with 'id' and empty 'children'
        Initialize an empty array arr
        For each key-value pair (k, v) in obj:
            k = _key(k)
            If k is empty or k is in keyset:
                Continue
            Add k to keyset
            Append dictionary with 'id' as k and 'children' from recursive call to _be_children(v, keyset)
        Return arr

    Method __call__(sections, prompt_variables=None):
        If prompt_variables is None:
            Set prompt_variables to an empty dictionary
        Initialize a ThreadPoolExecutor with 12 workers
        Initialize threads, token_count, texts, res, and cnt
        For each section in sections:
            Calculate token count of section
            If the accumulated count plus this section's count reaches or exceeds token_count and texts is not empty:
                Submit a task to process_document with concatenated texts and prompt_variables
                Reset texts and cnt
            Append section to texts
            Increment cnt by section token count
        If texts is not empty:
            Submit a final task for remaining texts
        For each thread in threads:
            Append result to res
        If res is empty:
            Return MindMapResult with empty root
        Merge results from res into merge_json
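        If merge_json has more than one top-level key:
            Wrap the dictionary-valued keys as children of a node with id "root"
        Else:
            Use the single key as the root id and build its children with _be_children
        On any exception: log it, call _on_error, and set the output to {"error": message}
        Return MindMapResult wrapping the resulting structure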

    Method _merge(d1, d2):
        For each key k in d1:
            If k exists in d2:
                If both values are dictionaries:
                    Call _merge recursively
                Else if both values are lists:
                    Extend list in d2 with values from d1
                Else set d2[k] to d1[k]
            Else:
                Set d2[k] to d1[k]
        Return d2

    Method _list_to_kv(data):
        For each key-value pair in data:
            If value is a dictionary:
                Call _list_to_kv recursively on value
            Else if value is a list:
                Initialize new_value as an empty dictionary
                For index in range of value length:
                    If value[index] is a list:
                        Assign value at index - 1 to new_value as key
                        Assign first element of value[index] as the value
                Set data[key] to new_value
        Return data

    Method _todict(layer):
        Convert layer to a standard dictionary if it's an OrderedDict
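        Attempt to iterate over the items of the layer:
            Replace each value with the result of calling _todict on it
        Ignore any AttributeError raised during the iteration (for example, when the layer is not a dictionary)
        Return the result of _list_to_kv applied to the layer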

    Method _process_document(text, prompt_variables):
        Prepare variables for the prompt
        Replace variables in _mind_map_prompt with the prepared variables
        Send request to the language model with the prompt text
        Clean up response to remove code block markers
        Convert response to a dictionary and transform it
        Return transformed output

Key Processing Steps

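  1. Initialization: Store the LLM, the prompt template, the input-text key, and the error handler, falling back to defaults where none is supplied.
  2. Input Processing: Group consecutive sections into batches that fit the token budget and submit each batch to a thread pool for concurrent processing.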
  3. Data Transformation: Merge and structure the extracted data into a mind map format.
  4. Error Handling: Log errors and execute the designated error handler.
  5. Final Output: Return a structured output encapsulated in the MindMapResult.

Notes

  • This pseudo code captures the essence of the original Python code while ensuring clarity and understanding.
  • Specific methods within the class manage tasks such as merging, structuring, and processing the documents, providing a clean separation of functionalities.

Create your Thread using our flexible tools, share it with friends and colleagues.

Your current query will become the main foundation for the thread, which you can expand with other tools presented on our platform. We will help you choose tools so that your thread is structured and logically built.

Description

The MindMapExtractor class is designed for extracting structured mind maps from text sections using machine learning, featuring error handling, threading for efficiency, and functions for data merging and transformation.