Rename page_num_list, top_list, position_list #3940

Merged · 3 commits · Dec 10, 2024
16 changes: 13 additions & 3 deletions Dockerfile
@@ -34,9 +34,15 @@ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps
cp /deps/cl100k_base.tiktoken /ragflow/9b5ad71b2ce5302211f9c61530b329a4922fc6a4

ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar"
ENV DEBIAN_FRONTEND=noninteractive

# Setup apt
# cv2 requires libGL.so.1
# Python package and implicit dependencies:
# opencv-python: libglib2.0-0 libglx-mesa0 libgl1
# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb
# python-pptx: default-jdk tika-server-standard-3.0.0.jar
# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85
# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev
RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
if [ "$NEED_MIRROR" == "1" ]; then \
sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list; \
@@ -47,8 +53,12 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
apt update && \
apt --no-install-recommends install -y ca-certificates && \
apt update && \
DEBIAN_FRONTEND=noninteractive apt install -y curl libpython3-dev nginx libglib2.0-0 libglx-mesa0 pkg-config libicu-dev libgdiplus default-jdk python3-pip pipx \
libatk-bridge2.0-0 libgtk-4-1 libnss3 xdg-utils unzip libgbm-dev wget git nginx libgl1 vim less
apt install -y libglib2.0-0 libglx-mesa0 libgl1 && \
apt install -y pkg-config libicu-dev libgdiplus && \
apt install -y default-jdk && \
apt install -y libatk-bridge2.0-0 && \
apt install -y libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev && \
apt install -y python3-pip pipx nginx unzip curl wget git vim less

RUN if [ "$NEED_MIRROR" == "1" ]; then \
pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple && \
2 changes: 1 addition & 1 deletion api/apps/chunk_app.py
@@ -71,7 +71,7 @@ def list_chunk():
"question_kwd": sres.field[id].get("question_kwd", []),
"image_id": sres.field[id].get("img_id", ""),
"available_int": int(sres.field[id].get("available_int", 1)),
"positions": json.loads(sres.field[id].get("position_list", "[]")),
"positions": sres.field[id].get("position_int", []),
}
assert isinstance(d["positions"], list)
assert len(d["positions"]) == 0 or (isinstance(d["positions"][0], list) and len(d["positions"][0]) == 5)
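
With `position_int` stored natively, the handler no longer JSON-decodes the old `position_list` string. A minimal sketch of the shape the new code expects, with made-up coordinates:

```python
# Hypothetical sample: position_int arrives as a list of 5-int rows,
# (page, left, right, top, bottom), so no json.loads is needed.
field = {"position_int": [[1, 72, 520, 90, 140], [2, 72, 520, 30, 80]]}

positions = field.get("position_int", [])
assert isinstance(positions, list)
assert all(len(p) == 5 for p in positions)
```
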
2 changes: 1 addition & 1 deletion api/apps/sdk/doc.py
@@ -846,7 +846,7 @@ def list_chunks(tenant_id, dataset_id, document_id):
"question_kwd": sres.field[id].get("question_kwd", []),
"img_id": sres.field[id].get("img_id", ""),
"available_int": sres.field[id].get("available_int", 1),
"positions": sres.field[id].get("position_int", "").split("\t"),
"positions": sres.field[id].get("position_int", []),
}
if len(d["positions"]) % 5 == 0:
poss = []
6 changes: 3 additions & 3 deletions conf/infinity_mapping.json
@@ -16,9 +16,9 @@
"content_with_weight": {"type": "varchar", "default": ""},
"content_ltks": {"type": "varchar", "default": ""},
"content_sm_ltks": {"type": "varchar", "default": ""},
"page_num_list": {"type": "varchar", "default": ""},
"top_list": {"type": "varchar", "default": ""},
"position_list": {"type": "varchar", "default": ""},
"page_num_int": {"type": "varchar", "default": ""},
"top_int": {"type": "varchar", "default": ""},
"position_int": {"type": "varchar", "default": ""},
"weight_int": {"type": "integer", "default": 0},
"weight_flt": {"type": "float", "default": 0.0},
"rank_int": {"type": "integer", "default": 0},
2 changes: 1 addition & 1 deletion graphrag/search.py
@@ -58,7 +58,7 @@ def merge_into_first(sres, title="") -> dict[str, str]:
matchDense = self.get_vector(qst, emb_mdl, 1024, req.get("similarity", 0.1))
q_vec = matchDense.embedding_data
src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
"doc_id", f"q_{len(q_vec)}_vec", "position_list", "name_kwd",
"doc_id", f"q_{len(q_vec)}_vec", "position_int", "name_kwd",
"available_int", "content_with_weight",
"weight_int", "weight_flt"
])
14 changes: 6 additions & 8 deletions rag/app/presentation.py
@@ -20,7 +20,6 @@
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read
import json


class Ppt(PptParser):
@@ -109,9 +108,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["page_num_list"] = json.dumps([pn + 1])
d["top_list"] = json.dumps([0])
d["position_list"] = json.dumps([(pn + 1, 0, img.size[0], 0, img.size[1])])
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
tokenize(d, txt, eng)
res.append(d)
return res
@@ -125,10 +124,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pn += from_page
if img:
d["image"] = img
d["page_num_list"] = json.dumps([pn + 1])
d["top_list"] = json.dumps([0])
d["position_list"] = json.dumps([
(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)])
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
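
The parser now emits plain Python lists and leaves serialization to the document store. A minimal sketch of the fields produced for one rendered page, assuming a PIL image `img` and a zero-based page index `pn`:

```python
from PIL import Image

img = Image.new("RGB", (960, 720))  # stand-in for a rendered slide/page
pn = 0

d = {
    "image": img,
    "page_num_int": [pn + 1],  # 1-based page number
    "top_int": [0],            # vertical offset on the page
    "position_int": [(pn + 1, 0, img.size[0], 0, img.size[1])],
}
# Previously each of these was a json.dumps(...) string; the encoding now
# happens in the storage connectors (es_conn.py / infinity_conn.py below).
```
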
19 changes: 9 additions & 10 deletions rag/nlp/__init__.py
@@ -22,7 +22,6 @@
from . import rag_tokenizer
import re
import copy
import json
import roman_numbers as r
from word2number import w2n
from cn2an import cn2an
@@ -311,16 +310,16 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
def add_positions(d, poss):
if not poss:
return
page_num_list = []
position_list = []
top_list = []
page_num_int = []
position_int = []
top_int = []
for pn, left, right, top, bottom in poss:
page_num_list.append(int(pn + 1))
top_list.append(int(top))
position_list.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
d["page_num_list"] = json.dumps(page_num_list)
d["position_list"] = json.dumps(position_list)
d["top_list"] = json.dumps(top_list)
page_num_int.append(int(pn + 1))
top_int.append(int(top))
position_int.append((int(pn + 1), int(left), int(right), int(top), int(bottom)))
d["page_num_int"] = page_num_int
d["position_int"] = position_int
d["top_int"] = top_int


def remove_contents_table(sections, eng=False):
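
A usage sketch of the reworked `add_positions`, with made-up coordinates; each row of `poss` is `(page, left, right, top, bottom)` with a zero-based page number:

```python
d = {}
poss = [(0, 72, 520, 90, 140), (1, 72, 520, 30, 80)]
add_positions(d, poss)

# d now holds native lists instead of JSON strings:
#   d["page_num_int"] == [1, 2]
#   d["top_int"]      == [90, 30]
#   d["position_int"] == [(1, 72, 520, 90, 140), (2, 72, 520, 30, 80)]
```
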
9 changes: 5 additions & 4 deletions rag/nlp/search.py
@@ -16,7 +16,6 @@

import logging
import re
import json
from dataclasses import dataclass

from rag.utils import rmSpace
@@ -74,14 +73,16 @@ def search(self, req, idx_names: str | list[str], kb_ids: list[str], emb_mdl=Non
offset, limit = pg * ps, (pg + 1) * ps

src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
"doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
"doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks",
"available_int", "content_with_weight", "pagerank_fea"])
kwds = set([])

qst = req.get("question", "")
q_vec = []
if not qst:
if req.get("sort"):
orderBy.asc("page_num_int")
orderBy.asc("top_int")
orderBy.desc("create_timestamp_flt")
res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
total=self.dataStore.getTotal(res)
@@ -340,7 +341,7 @@ def floor_sim(score):
chunk = sres.field[id]
dnm = chunk["docnm_kwd"]
did = chunk["doc_id"]
position_list = chunk.get("position_list", "[]")
position_int = chunk.get("position_int", [])
d = {
"chunk_id": id,
"content_ltks": chunk["content_ltks"],
@@ -354,7 +355,7 @@ def floor_sim(score):
"vector_similarity": vsim[i],
"term_similarity": tsim[i],
"vector": chunk.get(vector_column, zero_vector),
"positions": json.loads(position_list)
"positions": position_int,
}
if highlight and sres.highlight:
if id in sres.highlight:
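
With no question in the request and `sort` set, the store now returns chunks in reading order: ascending page, ascending vertical offset, newest first as the tiebreaker. A conceptual Python equivalent of the added `orderBy` (scalar values assumed for brevity; for multi-valued fields Elasticsearch sorts on their average, see es_conn.py below):

```python
chunks = [
    {"page_num_int": 2, "top_int": 40,  "create_timestamp_flt": 1.0},
    {"page_num_int": 1, "top_int": 300, "create_timestamp_flt": 2.0},
    {"page_num_int": 1, "top_int": 35,  "create_timestamp_flt": 3.0},
]
# asc page, asc top, desc timestamp
chunks.sort(key=lambda c: (c["page_num_int"], c["top_int"], -c["create_timestamp_flt"]))
```
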
6 changes: 3 additions & 3 deletions rag/svr/task_executor.py
@@ -211,9 +211,9 @@ def build_chunks(task, progress_callback):
if not d.get("image"):
_ = d.pop("image", None)
d["img_id"] = ""
d["page_num_list"] = json.dumps([])
d["position_list"] = json.dumps([])
d["top_list"] = json.dumps([])
d["page_num_int"] = []
d["position_int"] = []
d["top_int"] = []
docs.append(d)
continue

10 changes: 8 additions & 2 deletions rag/utils/es_conn.py
@@ -185,8 +185,14 @@ def search(self, selectFields: list[str], highlightFields: list[str], condition:
orders = list()
for field, order in orderBy.fields:
order = "asc" if order == 0 else "desc"
orders.append({field: {"order": order, "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}})
if field in ["page_num_int", "top_int"]:
order_info = {"order": order, "unmapped_type": "float",
"mode": "avg", "numeric_type": "double"}
elif field.endswith("_int") or field.endswith("_flt"):
order_info = {"order": order, "unmapped_type": "float"}
else:
order_info = {"order": order, "unmapped_type": "text"}
orders.append({field: order_info})
s = s.sort(*orders)

if limit > 0:
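
The rewritten branch keeps the avg-mode numeric sort only for the multi-valued `page_num_int`/`top_int` fields and falls back to a plain `unmapped_type` hint elsewhere. A standalone sketch of the clause built for each kind of field (mirrors the branch above, not the actual connector code):

```python
def es_order_info(field: str, order: str) -> dict:
    if field in ["page_num_int", "top_int"]:
        # multi-valued int lists: sort on the average of the values
        return {"order": order, "unmapped_type": "float",
                "mode": "avg", "numeric_type": "double"}
    if field.endswith("_int") or field.endswith("_flt"):
        return {"order": order, "unmapped_type": "float"}
    return {"order": order, "unmapped_type": "text"}

print(es_order_info("page_num_int", "asc"))
# {'order': 'asc', 'unmapped_type': 'float', 'mode': 'avg', 'numeric_type': 'double'}
print(es_order_info("weight_flt", "desc"))
# {'order': 'desc', 'unmapped_type': 'float'}
```
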
45 changes: 39 additions & 6 deletions rag/utils/infinity_conn.py
@@ -297,7 +297,7 @@ def search(
df_list.append(kb_res)
self.connPool.release_conn(inf_conn)
res = concat_dataframes(df_list, selectFields)
logger.debug("INFINITY search tables: " + str(table_list))
logger.debug(f"INFINITY search tables: {str(table_list)}, result: {str(res)}")
return res

def get(
@@ -307,15 +307,18 @@ def get(
db_instance = inf_conn.get_database(self.dbName)
df_list = list()
assert isinstance(knowledgebaseIds, list)
table_list = list()
for knowledgebaseId in knowledgebaseIds:
table_name = f"{indexName}_{knowledgebaseId}"
table_list.append(table_name)
table_instance = db_instance.get_table(table_name)
kb_res = table_instance.output(["*"]).filter(f"id = '{chunkId}'").to_pl()
if len(kb_res) != 0 and kb_res.shape[0] > 0:
df_list.append(kb_res)

self.connPool.release_conn(inf_conn)
res = concat_dataframes(df_list, ["id"])
logger.debug(f"INFINITY get tables: {str(table_list)}, result: {str(res)}")
res_fields = self.getFields(res, res.columns)
return res_fields.get(chunkId, None)

@@ -349,15 +352,22 @@ def insert(
for k, v in d.items():
if k.endswith("_kwd") and isinstance(v, list):
d[k] = " ".join(v)
if k == 'kb_id':
elif k == 'kb_id':
if isinstance(d[k], list):
d[k] = d[k][0] # since d[k] is a list, but we need a str
elif k == "position_int":
assert isinstance(v, list)
arr = [num for row in v for num in row]
d[k] = "_".join(f"{num:08x}" for num in arr)
elif k in ["page_num_int", "top_int", "position_int"]:
assert isinstance(v, list)
d[k] = "_".join(f"{num:08x}" for num in v)
ids = ["'{}'".format(d["id"]) for d in documents]
str_ids = ", ".join(ids)
str_filter = f"id IN ({str_ids})"
table_instance.delete(str_filter)
# for doc in documents:
# logger.info(f"insert position_list: {doc['position_list']}")
# logger.info(f"insert position_int: {doc['position_int']}")
# logger.info(f"InfinityConnection.insert {json.dumps(documents)}")
table_instance.insert(documents)
self.connPool.release_conn(inf_conn)
@@ -367,8 +377,8 @@ def insert(
def update(
self, condition: dict, newValue: dict, indexName: str, knowledgebaseId: str
) -> bool:
# if 'position_list' in newValue:
# logger.info(f"upsert position_list: {newValue['position_list']}")
# if 'position_int' in newValue:
# logger.info(f"update position_int: {newValue['position_int']}")
inf_conn = self.connPool.get_conn()
db_instance = inf_conn.get_database(self.dbName)
table_name = f"{indexName}_{knowledgebaseId}"
@@ -377,6 +387,16 @@ def update(
for k, v in newValue.items():
if k.endswith("_kwd") and isinstance(v, list):
newValue[k] = " ".join(v)
elif k == 'kb_id':
if isinstance(newValue[k], list):
newValue[k] = newValue[k][0] # since d[k] is a list, but we need a str
elif k == "position_int":
assert isinstance(v, list)
arr = [num for row in v for num in row]
newValue[k] = "_".join(f"{num:08x}" for num in arr)
elif k in ["page_num_int", "top_int"]:
assert isinstance(v, list)
newValue[k] = "_".join(f"{num:08x}" for num in v)
table_instance.update(filter, newValue)
self.connPool.release_conn(inf_conn)
return True
@@ -423,9 +443,22 @@ def getFields(self, res, fields: list[str]) -> list[str, dict]:
v = res[fieldnm][i]
if isinstance(v, Series):
v = list(v)
elif fieldnm == "important_kwd":
elif fieldnm.endswith("_kwd"):
assert isinstance(v, str)
v = v.split()
elif fieldnm == "position_int":
assert isinstance(v, str)
if v:
arr = [int(hex_val, 16) for hex_val in v.split('_')]
v = [arr[i:i + 5] for i in range(0, len(arr), 5)]  # 5 ints per position row, matching the writer in insert()
else:
v = []
elif fieldnm in ["page_num_int", "top_int"]:
assert isinstance(v, str)
if v:
v = [int(hex_val, 16) for hex_val in v.split('_')]
else:
v = []
else:
if not isinstance(v, str):
v = str(v)
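
Infinity keeps these columns as varchar (see conf/infinity_mapping.json above), so the connector flattens each int list and hex-encodes it with `_` separators on write, then reverses the transform in `getFields`. A standalone round-trip sketch (helper names are mine, not part of the PR):

```python
def encode_position_int(rows):
    # rows: list of (page, left, right, top, bottom) tuples
    arr = [num for row in rows for num in row]
    return "_".join(f"{num:08x}" for num in arr)

def decode_position_int(s):
    if not s:
        return []
    arr = [int(hex_val, 16) for hex_val in s.split("_")]
    return [arr[i:i + 5] for i in range(0, len(arr), 5)]  # 5 ints per position row

rows = [(1, 72, 520, 90, 140), (2, 72, 520, 30, 80)]
encoded = encode_position_int(rows)
# "00000001_00000048_00000208_0000005a_0000008c_00000002_..."
assert decode_position_int(encoded) == [list(r) for r in rows]
```
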