You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
def _span_value(tag):
    """Return the integer span count carried by a span token.

    Args:
        tag(str): a token such as ' colspan="15"' or ' rowspan="2"'.

    Returns:
        int: the first run of digits found in the token.

    Raises:
        ValueError: if the token contains no digits.
    """
    import re  # local import: the file's top-level import block is not visible here

    match = re.search(r"\d+", tag)
    if match is None:
        raise ValueError("no span value found in tag: {!r}".format(tag))
    return int(match.group())


def html_to_area(html_str, row_index, span_matrix):
    """Fill ``span_matrix`` in place from an html structure token sequence.

    Every cell found in the html receives a fresh area id (1, 2, 3, ...) and
    that id is written to all matrix positions the cell covers.

    Args:
        html_str(list[str]): html structure tokens; each element is compared
            against "<td>" or searched for "rowspan"/"colspan".
        row_index(list): start index in ``html_str`` of each row. NOTE: the
            list is extended in place with ``len(html_str)`` so that the last
            row has an end bound.
        span_matrix(np.array): two-dimensional matrix (num_row x num_col).
            Positions equal to -1 are treated as not yet occupied; the matrix
            is modified in place.

    Returns:
        int: status code
            0 - the matrix was filled completely,
            1 - a row has more or fewer columns than the matrix,
            2 - a spanning cell exceeds the table boundary,
            3 - a spanning cell overlaps an already occupied position.

    Raises:
        ValueError: if a span token carries no numeric value.
    """
    num_row, num_col = span_matrix.shape[0], span_matrix.shape[1]
    area_index = 1
    # Sentinel end bound so that row i is html_str[row_index[i]:row_index[i + 1]].
    row_index.append(len(html_str))

    for i in range(num_row):
        col_index = 0  # column the next cell of this row will be placed in
        skip_next = False  # set when the next token was already consumed
        html_cur_row = html_str[row_index[i]:row_index[i + 1]]
        last = len(html_cur_row) - 1

        for ind, tag in enumerate(html_cur_row):
            if skip_next:
                skip_next = False
                continue
            # Only "<td>" and span tokens describe a cell.
            if tag != "<td>" and "span" not in tag:
                continue
            if col_index > num_col - 1:
                return 1  # this row has more columns than the matrix
            # Skip positions already taken by a row-spanning cell from above.
            while span_matrix[i, col_index] != -1:
                if col_index == num_col - 1:
                    return 1
                col_index += 1

            if tag == "<td>":
                # basic cell
                span_matrix[i, col_index] = area_index
                col_index += 1
            elif "rowspan" in tag and ind != last and "colspan" in html_cur_row[ind + 1]:
                # "rowspan" immediately followed by "colspan": one cell
                row = _span_value(tag)
                col = _span_value(html_cur_row[ind + 1])
                skip_next = True  # the colspan token belongs to this cell
                # Validate before writing: numpy slices silently clip, so an
                # out-of-range span would otherwise be stored truncated.
                if i + row > num_row or col_index + col > num_col:
                    return 2
                if (span_matrix[i:i + row, col_index:col_index + col] != -1).any():
                    return 3
                span_matrix[i:i + row, col_index:col_index + col] = area_index
                col_index += col
            elif "colspan" in tag:
                # only "colspan"
                col = _span_value(tag)
                if col_index + col > num_col:
                    return 2
                if (span_matrix[i, col_index:col_index + col] != -1).any():
                    return 3
                span_matrix[i, col_index:col_index + col] = area_index
                col_index += col
            elif "rowspan" in tag:
                # only "rowspan"
                row = _span_value(tag)
                if i + row > num_row:
                    return 2
                if (span_matrix[i:i + row, col_index] != -1).any():
                    return 3
                span_matrix[i:i + row, col_index] = area_index
                col_index += 1
            area_index += 1

    # Any position still at -1 means some row had too few columns.
    if -1 in span_matrix:
        return 1
    return 0
def html_to_area(html_str, row_index, span_matrix):
"""Convert html to span matrix, a two-dimensional matrix representing table structure
函数中以下代码片段:如果正好两个跨列之和等于总列数,就会被视为异常,这是不是有问题?另外,关于 tag[-3:-1] 的取值:如果是类似 colspan="15" 这种大于 10 的值,只取到了 5,1 没取到。
The text was updated successfully, but these errors were encountered: