Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LGPMA表格处理 #172

Open
cqray1990 opened this issue Mar 12, 2024 · 0 comments
Open

LGPMA表格处理 #172

cqray1990 opened this issue Mar 12, 2024 · 0 comments

Comments

@cqray1990
Copy link

def _parse_span(tag, attr):
    """Return the integer value of the span attribute ``attr`` inside ``tag``.

    Handles values with any number of digits (e.g. ``' colspan="15"'`` or
    ``' rowspan="120"'``), with or without surrounding quotes.

    Args:
        tag(str): html token containing the attribute, e.g. ``' colspan="3"'``.
        attr(str): attribute name, ``"rowspan"`` or ``"colspan"``.

    Returns:
        int: the parsed span value.

    Raises:
        ValueError: if no number can be found in ``tag``.
    """
    import re  # local import so the block does not depend on file-level imports

    match = re.search(attr + r"\s*=\s*[\"']?\s*(\d+)", tag)
    if match is None:
        # Fall back to the last run of digits anywhere in the token.
        match = re.search(r"(\d+)\D*$", tag)
    if match is None:
        raise ValueError("cannot parse span value from tag: %r" % (tag,))
    return int(match.group(1))


def html_to_area(html_str, row_index, span_matrix):
    """Convert html to span matrix, a two-dimensional matrix representing table structure

    Every cell found in ``html_str`` gets a unique, increasing area index
    (starting at 1) that is written into all positions of ``span_matrix``
    covered by that cell. Positions equal to -1 are treated as unoccupied.

    Args:
        html_str(list): sequence of html tokens representing table structure;
            it is sliced per row and compared token by token
            (e.g. ``"<td>"``, ``' colspan="2"'``).
        row_index(list): index of each row in html. NOTE: ``len(html_str)`` is
            appended to this list in place as an end sentinel.
        span_matrix(np.array): a two-dimensional matrix representing table
            structure, filled in place. Unoccupied positions must be -1.

    Returns:
        int: status code of the conversion
            0 - the html is legal and ``span_matrix`` is completely filled,
            1 - the column number of some row differs from ``span_matrix``,
            2 - a spanning cell exceeds the table boundary,
            3 - cells overlap.

    Raises:
        ValueError: if a span token carries no parsable number.
    """
    num_row, num_col = span_matrix.shape[0], span_matrix.shape[1]

    status = 0  # whether the given html is illegal
    area_index = 1
    row_index.append(len(html_str))  # sentinel marking the end of the last row
    for i in range(num_row):
        col_index = 0  # record column number of the current row
        skip_next = False  # set when the next token was already consumed
        html_cur_row = html_str[row_index[i]:row_index[i + 1]]
        last_ind = len(html_cur_row) - 1

        for ind, tag in enumerate(html_cur_row):
            if skip_next:
                skip_next = False
                continue
            # if cur tag is not key information, continue
            if tag != "<td>" and "span" not in tag:
                continue

            if col_index > num_col - 1:
                return 1  # The column of current row exceeds the column of the first row
            # skip positions already taken by a row-spanning cell from above
            while span_matrix[i, col_index] != -1:
                if col_index == num_col - 1:
                    return 1
                col_index += 1

            # basic cell
            if tag == "<td>":
                span_matrix[i, col_index] = area_index
                col_index += 1
            # "rowspan" and "colspan" together
            elif "rowspan" in tag and ind != last_ind and "colspan" in html_cur_row[ind + 1]:
                row = _parse_span(tag, "rowspan")
                col = _parse_span(html_cur_row[ind + 1], "colspan")
                skip_next = True  # the following colspan token belongs to this cell
                if (span_matrix[i:i + row, col_index:col_index + col] != -1).any():
                    return 3  # Overlay between cells
                # Validate BEFORE writing: slicing silently clips at the array
                # boundary, so writing first would leave a truncated cell behind.
                if i + row > num_row or col_index + col > num_col:
                    return 2  # Spanning cell exceeds the table boundary
                span_matrix[i:i + row, col_index:col_index + col] = area_index
                col_index += col
            # only "colspan"
            elif "colspan" in tag:
                col = _parse_span(tag, "colspan")
                if col_index + col > num_col:
                    return 2
                if (span_matrix[i, col_index:col_index + col] != -1).any():
                    return 3
                span_matrix[i, col_index:col_index + col] = area_index
                col_index += col
            # only "rowspan"
            elif "rowspan" in tag:
                row = _parse_span(tag, "rowspan")
                if i + row > num_row:
                    return 2
                if (span_matrix[i:i + row, col_index] != -1).any():
                    return 3
                span_matrix[i:i + row, col_index] = area_index
                col_index += 1
            area_index += 1

    if (span_matrix == -1).any():
        status = 1  # The column number of some rows is smaller than the column number of the first row

    return status

函数中以下代码片段，如果正好两个跨列之和等于总列数，就被视为异常了，这是不是有问题？另外还有 tag[-3:-1] 的取值：如果是类似 " colspan="15"" 这种大于10 的值，只取了一个5，1没取到。

 # only "colspan"
        elif "colspan" in tag:
            col = int(tag[-3:-1]) if tag[-3:-1].isdigit() else int(tag[-2])
            if col_index + col > num_col:
                return 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant