UNSTRUCTURED_API_KEY - Your Unstructured API key value.
UNSTRUCTURED_API_URL - Your Unstructured API URL.
These environment variables:
HUGGING_FACE_EMAIL, representing your Hugging Face account’s email address.
HUGGING_FACE_PASSWORD, representing your Hugging Face account’s email address.
HUGGING_FACE_COOKIE_DIR_PATH, representing a login cookie cache path, typically ./cookies/ or ./login_cache/.
This example’s code is as follows:
Ingest Python library
Python
from unstructured_ingest.pipeline.pipeline import Pipelinefrom unstructured_ingest.interfaces import ProcessorConfigfrom unstructured_ingest.processes.connectors.local import ( LocalIndexerConfig, LocalDownloaderConfig, LocalConnectionConfig, LocalUploaderConfig)from unstructured_ingest.processes.partitioner import PartitionerConfigimport os, jsonfrom hugchat import hugchatfrom hugchat.login import Logindef generate_json_from_local( input_path: str, output_dir: str, partition_by_api: bool = False, api_key: str = None, partition_endpoint: str = None, split_pdf_page: bool = True, split_pdf_allow_failed: bool = True, split_pdf_concurrency_level: int = 15 ): Pipeline.from_configs( context=ProcessorConfig(), indexer_config=LocalIndexerConfig(input_path=input_path), downloader_config=LocalDownloaderConfig(), source_connection_config=LocalConnectionConfig(), partitioner_config=PartitionerConfig( partition_by_api=partition_by_api, api_key=api_key, partition_endpoint=partition_endpoint, additional_partition_args={ "split_pdf_page": split_pdf_page, "split_pdf_allow_failed": split_pdf_allow_failed, "split_pdf_concurrency_level": split_pdf_concurrency_level } ), uploader_config=LocalUploaderConfig(output_dir=output_dir) ).run()def extract_matching_texts_from_local(input_json_file_path: str, text_to_match: str) -> str: voting_texts = "" with open(input_json_file_path, 'r') as file: file_elements = json.load(file) for element in file_elements: if text_to_match in element["text"]: voting_texts += " " + element["text"] return voting_textsdef log_in_to_hugging_face(email: str, passwd: str, cookie_dir_path: str) -> hugchat.ChatBot: sign = Login( email=email, passwd=passwd ) cookies = sign.login(cookie_dir_path=cookie_dir_path) return hugchat.ChatBot(cookies=cookies.get_dict())if __name__ == "__main__": generate_json_from_local( input_path=os.getenv("LOCAL_FILE_INPUT_DIR"), output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"), partition_by_api=True, api_key=os.getenv("UNSTRUCTURED_API_KEY"), partition_endpoint=os.getenv("UNSTRUCTURED_API_URL") ) chatbot = log_in_to_hugging_face( email=os.getenv("HUGGING_FACE_EMAIL"), passwd=os.getenv("HUGGING_FACE_PASSWORD"), cookie_dir_path=os.getenv("HUGGING_FACE_COOKIE_DIR_PATH") ) voting_texts = extract_matching_texts_from_local( input_json_file_path=f"{os.getenv("LOCAL_FILE_OUTPUT_DIR")}/constitution.json", text_to_match="vot" ) print("\n-----\n") print("Querying HuggingChat...") print("\n-----\n") req = f"Given the following information, what is the minimum voting age in the United States? {voting_texts}" print(req) print("\n-----\n") print(chatbot.chat(text=req)) print("\n-----\n") print("Querying HuggingChat again...") print("\n-----\n") follow_up = "And when were women given the right to vote in the United States?" print(follow_up) print("\n-----\n") print(chatbot.chat(text=follow_up))
JavaScript/TypeScript SDK
Unstructured recommends that you use the Unstructured Ingest Python library instead.The Ingest Python library provides faster processing of larger individual files, and faster and easier processing of multiple files at a time in batches.
TypeScript
import { UnstructuredClient } from "unstructured-client";import { Strategy } from "unstructured-client/sdk/models/shared/index.js";import { PartitionResponse } from "unstructured-client/sdk/models/operations";import * as fs from "fs";import * as path from 'path';import { fileURLToPath } from 'url';import { Login, ChatBot} from "huggingface-chat";const key = process.env.UNSTRUCTURED_API_KEY || '';const url = process.env.UNSTRUCTURED_API_URL || '';const username = process.env.HUGGING_FACE_EMAIL || '';const userpassword = process.env.HUGGING_FACE_PASSWORD || '';const cookieDirPath = process.env.HUGGING_FACE_COOKIE_DIR_PATH || '';const inputFilepath = path.dirname(fileURLToPath(import.meta.url)) + "/local-ingest-pdf-source/constitution.pdf"async function getChatBot(email: string, password: string, cachePath: string): Promise<ChatBot> { const signin = new Login(email, password) const result = await signin.login(cachePath) const chat = new ChatBot(result) return chat}async function queryAndResponse(chatbot: ChatBot, query: string) { let data = await chatbot.chat(query) if (! data) { console.log("No data available."); return; } let reader = data.stream?.getReader(); if (! reader) { console.log("No stream available."); return; } while (true) { const { done, value } = await reader?.read(); if (done) break; process.stdout.write(value); } }const client = new UnstructuredClient({ security: { apiKeyAuth: key }, serverURL: new URL(url).origin});const data = fs.readFileSync(inputFilepath);client.general.partition({ partitionParameters: { files: { content: data, fileName: inputFilepath }, strategy: Strategy.HiRes, splitPdfPage: true, splitPdfAllowFailed: true, splitPdfConcurrencyLevel: 15 }}).then(async (res: PartitionResponse) => { if (res.statusCode == 200) { let voting_texts = "" if (res) for (const element of res) { if (element.text.includes("vot")) { voting_texts += " " + element.text; } } let chat = await getChatBot(username, userpassword, cachePath) await chat.initialize() const firstQuery = 'Given the following information, what is the minimum voting age in the United States?' const secondQuery = 'And when were women given the right to vote in the United States?' console.log(`${firstQuery} ${voting_texts}\n\n-----\n`) await queryAndResponse(chat, firstQuery) console.log(`\n\n-----\n`) console.log(`${secondQuery}\n\n-----\n`) await queryAndResponse(chat, secondQuery) } }).catch((e) => { if (e.statusCode) { console.log(e.statusCode); console.log(e.body); } else { console.log(e); }});