from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig
from unstructured_ingest.v2.processes.connectors.local import (
LocalIndexerConfig,
LocalDownloaderConfig,
LocalConnectionConfig,
LocalUploaderConfig
)
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
import os, json
from hugchat import hugchat
from hugchat.login import Login
def generate_json_from_local(
    input_path: str,
    output_dir: str,
    partition_by_api: bool = False,
    api_key: str = None,
    partition_endpoint: str = None,
    split_pdf_page: bool = True,
    split_pdf_allow_failed: bool = True,
    split_pdf_concurrency_level: int = 15
):
    """Build and run an ingest pipeline that reads locally and writes locally.

    The pipeline is assembled from a local indexer/downloader/connection as
    the source, a partitioner, and a local uploader as the destination, and
    is then run immediately. Nothing is returned.

    Args:
        input_path: Passed to ``LocalIndexerConfig`` as the input location.
        output_dir: Passed to ``LocalUploaderConfig`` as the output location.
        partition_by_api: Forwarded to ``PartitionerConfig``.
        api_key: Forwarded to ``PartitionerConfig``.
        partition_endpoint: Forwarded to ``PartitionerConfig``.
        split_pdf_page: Forwarded under ``additional_partition_args``.
        split_pdf_allow_failed: Forwarded under ``additional_partition_args``.
        split_pdf_concurrency_level: Forwarded under
            ``additional_partition_args``.
    """
    # Source side: where the files come from.
    processor = ProcessorConfig()
    indexer = LocalIndexerConfig(input_path=input_path)
    downloader = LocalDownloaderConfig()
    source_connection = LocalConnectionConfig()

    # Partitioning: the PDF-splitting knobs travel as extra partition args.
    extra_partition_args = {
        "split_pdf_page": split_pdf_page,
        "split_pdf_allow_failed": split_pdf_allow_failed,
        "split_pdf_concurrency_level": split_pdf_concurrency_level,
    }
    partitioner = PartitionerConfig(
        partition_by_api=partition_by_api,
        api_key=api_key,
        partition_endpoint=partition_endpoint,
        additional_partition_args=extra_partition_args,
    )

    # Destination side: where the results are written.
    uploader = LocalUploaderConfig(output_dir=output_dir)

    pipeline = Pipeline.from_configs(
        context=processor,
        indexer_config=indexer,
        downloader_config=downloader,
        source_connection_config=source_connection,
        partitioner_config=partitioner,
        uploader_config=uploader,
    )
    pipeline.run()
def extract_matching_texts_from_local(input_json_file_path: str, text_to_match: str) -> str:
    """Concatenate the ``"text"`` of every JSON element containing a substring.

    The file is expected to hold a JSON array of objects. Each object whose
    ``"text"`` value is a string containing ``text_to_match`` contributes that
    text to the result, in file order.

    Args:
        input_json_file_path: Path to the JSON file to read.
        text_to_match: Substring to look for (case-sensitive) in each
            element's ``"text"`` value.

    Returns:
        The matching texts joined together, each one preceded by a single
        space (so a non-empty result starts with a space). Returns ``""``
        when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(input_json_file_path, "r", encoding="utf-8") as file:
        file_elements = json.load(file)

    # Elements without a string "text" value are skipped rather than raising.
    texts = (element.get("text") for element in file_elements)
    return "".join(
        " " + text
        for text in texts
        if isinstance(text, str) and text_to_match in text
    )
def log_in_to_hugging_face(email: str, passwd: str, cookie_dir_path: str) -> hugchat.ChatBot:
    """Log in with the given credentials and return a chatbot for that session.

    Args:
        email: Account email passed to ``Login``.
        passwd: Account password passed to ``Login``.
        cookie_dir_path: Directory path handed to ``Login.login``.

    Returns:
        A ``hugchat.ChatBot`` constructed from the cookies obtained at login.
    """
    login_session = Login(email=email, passwd=passwd)
    session_cookies = login_session.login(cookie_dir_path=cookie_dir_path)
    # ChatBot receives the cookies as a plain dict.
    cookie_dict = session_cookies.get_dict()
    return hugchat.ChatBot(cookies=cookie_dict)
if __name__ == "__main__":
    # Script flow: partition the local input into JSON, log in to
    # HuggingChat, pull the voting-related text out of the generated JSON,
    # then ask two questions using that text as context.
    # All settings are read from environment variables.

    # Read once: used both as the pipeline output location and to locate
    # the generated JSON file below.
    output_dir = os.getenv("LOCAL_FILE_OUTPUT_DIR")

    generate_json_from_local(
        input_path=os.getenv("LOCAL_FILE_INPUT_DIR"),
        output_dir=output_dir,
        partition_by_api=True,
        api_key=os.getenv("UNSTRUCTURED_API_KEY"),
        partition_endpoint=os.getenv("UNSTRUCTURED_API_URL")
    )

    chatbot = log_in_to_hugging_face(
        email=os.getenv("HUGGING_FACE_EMAIL"),
        passwd=os.getenv("HUGGING_FACE_PASSWORD"),
        cookie_dir_path=os.getenv("HUGGING_FACE_COOKIE_DIR_PATH")
    )

    # Interpolate a local variable instead of nesting double quotes inside a
    # double-quoted f-string, which is a SyntaxError before Python 3.12.
    voting_texts = extract_matching_texts_from_local(
        input_json_file_path=f"{output_dir}/constitution.json",
        text_to_match="vot"
    )

    print("\n-----\n")
    print("Querying HuggingChat...")
    print("\n-----\n")

    # First question: supply the extracted text as context.
    req = f"Given the following information, what is the minimum voting age in the United States? {voting_texts}"
    print(req)
    print("\n-----\n")
    print(chatbot.chat(text=req))

    print("\n-----\n")
    print("Querying HuggingChat again...")
    print("\n-----\n")

    # Follow-up question sent through the same chatbot object.
    follow_up = "And when were women given the right to vote in the United States?"
    print(follow_up)
    print("\n-----\n")
    print(chatbot.chat(text=follow_up))