leili.bib

@InProceedings{xu2024cmus,
  author       = {Xu, Xi and Ouyang, Siqi and Yan, Brian and Fernandes, Patrick and Chen, William and Li, Lei and Neubig, Graham and Watanabe, Shinji},
  booktitle    = {Proceedings of the 21st International Conference on Spoken Language Translation (IWSLT 2024)},
  title        = {{CMU}{'}s {IWSLT} 2024 Simultaneous Speech Translation System},
  year         = {2024},
  month        = aug,
  pages        = {154--159},
  abstract     = {This paper describes CMU{'}s submission to the IWSLT 2024 Simultaneous Speech Translation (SST) task for translating English speech to German text in a streaming manner. Our end-to-end speech-to-text (ST) system integrates the WavLM speech encoder, a modality adapter, and the Llama2-7B-Base model as the decoder. We employ a two-stage training approach: initially, we align the representations of speech and text, followed by full fine-tuning. Both stages are trained on MuST-c v2 data with cross-entropy loss. We adapt our offline ST model for SST using a simple fixed hold-n policy. Experiments show that our model obtains an offline BLEU score of 31.1 and a BLEU score of 29.5 under 2 seconds latency on the MuST-C-v2 tst-COMMON.},
  entrysubtype = {workshop},
  eprint       = {https://aclanthology.org/2024.iwslt-1.20},
}

@InProceedings{yan2024cmus,
  author       = {Yan, Brian and Fernandes, Patrick and Tian, Jinchuan and Ouyang, Siqi and Chen, William and Livescu, Karen and Li, Lei and Neubig, Graham and Watanabe, Shinji},
  booktitle    = {Proceedings of the 21st International Conference on Spoken Language Translation (IWSLT 2024)},
  title        = {{CMU}{'}s {IWSLT} 2024 Offline Speech Translation System: A Cascaded Approach For Long-Form Robustness},
  year         = {2024},
  month        = aug,
  pages        = {164--169},
  publisher    = {Association for Computational Linguistics},
  abstract     = {This work describes CMU{'}s submission to the IWSLT 2024 Offline Speech Translation (ST) Shared Task for translating English speech to German, Chinese, and Japanese text. We are the first participants to employ a long-form strategy which directly processes unsegmented recordings without the need for a separate voice-activity detection stage (VAD). We show that the Whisper automatic speech recognition (ASR) model has a hallucination problem when applied out-of-the-box to recordings containing non-speech noises, but a simple noisy fine-tuning approach can greatly enhance Whisper{'}s long-form robustness across multiple domains. Then, we feed English ASR outputs into fine-tuned NLLB machine translation (MT) models which are decoded using COMET-based Minimum Bayes Risk. Our VAD-free ASR+MT cascade is tested on TED talks, TV series, and workout videos and shown to outperform prior winning IWSLT submissions and large open-source models.},
  entrysubtype = {workshop},
  eprint       = {https://aclanthology.org/2024.iwslt-1.22},
}

@InProceedings{wang2024global,
  author    = {Danqing Wang and Antonis Antoniades and Kha-Dinh Luong and Edwin Zhang and Mert Kosan and Jiachen Li and Ambuj Singh and William Yang Wang and Lei Li},
  booktitle = {the 30th SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {Global Human-guided Counterfactual Explanations for Molecular Properties via Reinforcement Learning},
  year      = {2024},
  month     = aug,
  abstract  = {Counterfactual explanations of Graph Neural Networks (GNNs) offer a powerful way to understand data that can naturally be represented by a graph structure. Furthermore, in many domains, it is highly desirable to derive data-driven global explanations or rules that can better explain the high-level properties of the models and data in question. However, evaluating global counterfactual explanations is hard in real-world datasets due to a lack of human-annotated ground truth, which limits their use in areas like molecular sciences. Additionally, the increasing scale of these datasets provides a challenge for random search-based methods. In this paper, we develop a novel global explanation model RLHEX for molecular property prediction. It aligns the counterfactual explanations with human-defined principles, making the explanations more interpretable and easy for experts to evaluate. RLHEX includes a VAE-based graph generator to generate global explanations and an adapter to adjust the latent representation space to human-defined principles. Optimized by Proximal Policy Optimization (PPO), the global explanations produced by RLHEX cover 4.12% more input graphs and reduce the distance between the counterfactual explanation set and the input set by 0.47% on average across three molecular datasets. RLHEX provides a flexible framework to incorporate different human-designed principles into the counterfactual explanation generation process, aligning these explanations with domain expertise.},
  code      = {https://github.com/dqwang122/RLHEX},
  eprint    = {https://arxiv.org/abs/2406.13869},
}

@InProceedings{yuan2024how,
  author    = {Fei Yuan and Shuai Yuan and Zhiyong Wu and Lei Li},
  booktitle = {the 62nd Annual Meeting of the Association for Computational Linguistics - Findings (ACL-Findings)},
  title     = {How Vocabulary Sharing Facilitates Multilingualism in LLaMA?},
  year      = {2024},
  month     = aug,
  abstract  = {Large Language Models (LLMs), often show strong performance on English tasks, while exhibiting limitations on other languages. What is an LLM’s multilingual capability when it is trained only on certain languages? The underlying mechanism remains unclear. This study endeavors to examine the multilingual capability of LLMs from the vocabulary sharing perspective by conducting an exhaustive analysis across 101 languages. Through the investigation of the performance gap before and after embedding fine-tuning, we discovered four distinct quadrants. By delving into each quadrant we provide actionable and efficient guidelines for tuning these languages. Extensive experiments reveal that existing LLMs possess multilingual capabilities that surpass our expectations, and we can significantly improve the multilingual performance of LLMs based on these attributes of each quadrant},
  eprint    = {https://arxiv.org/abs/2311.09071},
}

@InProceedings{zhang2024hire,
  author    = {Kexun Zhang and Yee Man Choi and Zhenqiao Song and Taiqi He and William Yang Wang and Lei Li},
  booktitle = {the 62nd Annual Meeting of the Association for Computational Linguistics - Findings (ACL-Findings)},
  title     = {Hire a Linguist!: Learning Endangered Languages in LLMs with In-Context Linguistic Descriptions},
  year      = {2024},
  month     = aug,
  abstract  = {How can large language models (LLMs) process and translate endangered languages? Many languages lack a large corpus to train a decent LLM; therefore existing LLMs rarely perform well in unseen, endangered languages. On the contrary, we observe that 2000 endangered languages, though without a large corpus, have a grammar book or a dictionary. We propose LINGOLLM, a training-free approach to enable an LLM to process unseen languages that hardly occur in its pre-training. Our key insight is to demonstrate linguistic knowledge of an unseen language in an LLM’s prompt, including a dictionary, a grammar book, and morphologically analyzed input text. We implement LINGOLLM on top of two models, GPT-4 and Mixtral, and evaluate their performance on 5 tasks across 8 endangered or low-resource languages. Our results show that LINGOLLM elevates translation capability from GPT-4’s 0 to 10.5 BLEU for 10 language directions. Our findings demonstrate the tremendous value of linguistic knowledge in the age of LLMs for endangered languages. Our data, code, and model generations can be found at https://github.com/LeiLiLab/LingoLLM.},
  code      = {https://github.com/LeiLiLab/LingoLLM},
  eprint    = {https://arxiv.org/abs/2402.18025},
}

@InProceedings{xu2024pride,
  author    = {Wenda Xu and Guanglei Zhu and Xuandong Zhao and Liangming Pan and Lei Li and William Yang Wang},
  booktitle = {the 62nd Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Pride and Prejudice: LLM Amplifies Self-Bias in Self-Refinement},
  year      = {2024},
  month     = aug,
  note      = {2024},
  abstract  = {Recent studies show that large language models (LLMs) improve their performance through self-feedback on certain tasks while degrade on others. We discovered that such a contrary is due to LLM’s bias in evaluating their own output. In this paper, we formally define LLM’s self-bias -- the tendency to favor its own generation -- using two statistics. We analyze six LLMs (GPT-4, GPT-3.5, Gemini, LLaMA2, Mixtral and DeepSeek) on translation, constrained text generation, and mathematical reasoning tasks. We find that self-bias is prevalent in all examined LLMs across multiple languages and tasks. Our analysis reveals that while the self-refine pipeline improves the fluency and understandability of model outputs, it further amplifies self-bias. To mitigate such biases, we discover that larger model size and external feedback with accurate assessment can significantly reduce bias in the self-refine pipeline, leading to actual performance improvement in downstream tasks. The code and data are released at https://github. com/xu1998hz/llm_self_bias.},
  code      = {https://github. com/xu1998hz/llm_self_bias},
  eprint    = {https://arxiv.org/abs/2402.11436},
}

@InProceedings{duarte2024de,
  author    = {André V. Duarte and Xuandong Zhao and Arlindo L. Oliveira and Lei Li},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning (ICML)},
  title     = {DE-COP: Detecting Copyrighted Content in Language Models Training Data},
  year      = {2024},
  month     = jul,
  abstract  = {How can we detect if copyrighted content was used in the training process of a language model, considering that the training data is typically undisclosed? We are motivated by the premise that a language model is likely to identify verbatim excerpts from its training text. We propose DE-COP, a method to determine whether a piece of copyrighted content was included in training. DE-COP’s core approach is to probe an LLM with multiple-choice questions, whose options include both verbatim text and their paraphrases. We construct BookTection, a benchmark with excerpts from 165 books published prior and subsequent to a model’s training cutoff, along with their paraphrases. Our experiments show that DE-COP surpasses the prior best method by 9.6% in detection performance (AUC) on models with logits available. Moreover, DE-COP also achieves an average accuracy of 72% for detecting suspect books on fully black-box models where prior methods give approximately 4% accuracy. The code and datasets are available at https: //github.com/LeiLiLab/DE-COP.},
  code      = {https: //github.com/LeiLiLab/DE-COP},
  eprint    = {https://arxiv.org/abs/2402.09910},
}

@InProceedings{song2024generative,
  author    = {Zhenqiao Song and Yunlong Zhao and Wenxian Shi and Wengong Jin and Yang Yang and Lei Li},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning (ICML)},
  title     = {Generative Enzyme Design Guided by Functionally Important Sites and Small-Molecule Substrates},
  year      = {2024},
  month     = jul,
  abstract  = {Enzymes are genetically encoded biocatalysts capable of accelerating chemical reactions. How can we automatically design functional enzymes? In this paper, we propose EnzyGen, an approach to learn a unified model to design enzymes across all functional families. Our key idea is to generate an enzyme’s amino acid sequence and their three-dimensional (3D) coordinates based on functionally important sites and substrates corresponding to a desired catalytic function. These sites are automatically mined from enzyme databases. EnzyGen consists of a novel interleaving network of attention and neighborhood equivariant layers, which captures both long-range correlation in an entire protein sequence and local influence from nearest amino acids in 3D space. To learn the generative model, we devise a joint training objective, including a sequence generation loss, a position prediction loss and an enzyme- substrate interaction loss. We further construct EnzyBench, a dataset with 3157 enzyme families, covering all available enzymes within the protein data bank (PDB). Experimental results show that our EnzyGen consistently achieves the best performance across all 323 testing families, surpassing the best baseline by 10.79% in terms of substrate binding affinity. These findings demonstrate EnzyGen’s superior capability in designing well-folded and effective enzymes binding to specific substrates with high affinities. The code, model and dataset are released at https: //github.com/LeiLiLab/EnzyGen.},
  code      = {https: //github.com/LeiLiLab/EnzyGen},
  eprint    = {https://arxiv.org/abs/2405.08205},
}

@InProceedings{song2024surfpro,
  author    = {Zhenqiao Song and Tinglin Huang and Lei Li and Wengong Jin},
  booktitle = {Proceedings of the 41st International Conference on Machine Learning (ICML)},
  title     = {SurfPro: Functional Protein Design Based on Continuous Surface},
  year      = {2024},
  month     = jul,
  abstract  = {How can we design proteins with desired functions? We are motivated by a chemical intuition that both geometric structure and biochemical properties are critical to a protein’s function. In this paper, we propose SurfPro, a new method to generate functional proteins given a desired surface and its associated biochemical properties. SurfPro comprises a hierarchical encoder that progressively models the geometric shape and biochemical features of a protein surface, and an autoregressive decoder to produce an amino acid sequence. We evaluate SurfPro on a standard inverse folding benchmark CATH 4.2 and two functional protein design tasks: protein binder design and enzyme design. Our SurfPro consistently surpasses previous state-of-the-art inverse folding methods, achieving a recovery rate of 57.78% on CATH 4.2 and higher success rates in terms of protein-protein binding and enzyme-substrate interaction scores.},
  eprint    = {https://arxiv.org/abs/2405.06693},
}

@InProceedings{xu2024llmrefine,
  author    = {Wenda Xu and Daniel Deutsch and Mara Finkelstein and Juraj Juraska and Biao Zhang and Zhongtao Liu and William Yang Wang and Lei Li and Markus Freitag},
  booktitle = {Proceedings of 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL) - Findings},
  title     = {LLMRefine: Pinpointing and Refining Large Language Models via Fine-Grained Actionable Feedback},
  year      = {2024},
  month     = jun,
  abstract  = {Recent large language models (LLM) are leveraging human feedback to improve their generation quality. However, human feedback is costly to obtain, especially during inference. In this work, we propose LLMRefine, an inference time optimization method to refine LLM's output. The core idea is to use a learned fine-grained feedback model to pinpoint defects and guide LLM to refine them iteratively. Using original LLM as a proposal of edits, LLMRefine searches for defect-less text via simulated annealing, trading off the exploration and exploitation. We conduct experiments on three text generation tasks, including machine translation, long-form question answering (QA), and topical summarization. LLMRefine consistently outperforms all baseline approaches, achieving improvements up to 1.7 MetricX points on translation tasks, 8.1 ROUGE-L on ASQA, 2.2 ROUGE-L on topical summarization.},
  eprint    = {https://arxiv.org/abs/2311.09336},
}

@InProceedings{zhu2024multilingual,
  author    = {Wenhao Zhu and Hongyi Liu and Qingxiu Dong and Jingjing Xu and Lingpeng Kong and Jiajun Chen and Lei Li and Shujian Huang},
  booktitle = {Proceedings of 2024 Conference of the North American Chapter of the Association for Computational Linguistics (NAACL) - Findings},
  title     = {Multilingual Machine Translation with Large Language Models: Empirical Results and Analysis},
  year      = {2024},
  month     = jun,
  abstract  = {Large language models (LLMs) have demonstrated remarkable potential in handling multilingual machine translation (MMT). In this paper, we systematically investigate the advantages and challenges of LLMs for MMT by answering two questions: 1) How well do LLMs perform in translating a massive number of languages? 2) Which factors affect LLMs' performance in translation? We evaluate popular LLMs, including XGLM, OPT, BLOOMZ, and ChatGPT, on 102 languages. Our empirical results show that even the best model ChatGPT still lags behind the supervised baseline NLLB in 83.33% of translation directions. Through further analysis, we discover that LLMs exhibit new working patterns when used for MMT. First, prompt semantics can surprisingly be ignored when given in-context exemplars, where LLMs still show strong performance even with unreasonable prompts. Second, cross-lingual exemplars can provide better task instruction for low-resource translation than exemplars in the same language pairs. Third, we observe the overestimated performance of BLOOMZ on dataset Flores-101, indicating the potential risk when using public datasets for evaluation.},
  eprint    = {https://arxiv.org/abs/2304.04675},
}

@InProceedings{zhao2024provable,
  author    = {Xuandong Zhao and Prabhanjan Vijendra Ananth and Lei Li and Yu-Xiang Wang},
  booktitle = {International Conference on Learning Representations (ICLR)},
  title     = {Provable Robust Watermarking for AI-Generated Text},
  year      = {2024},
  month     = may,
  abstract  = {We study the problem of watermarking large language models (LLMs) generated text -- one of the most promising approaches for addressing the safety challenges of LLM usage. In this paper, we propose a rigorous theoretical framework to quantify the effectiveness and robustness of LLM watermarks. We propose a robust and high-quality watermark method, Unigram-Watermark, by extending an existing approach with a simplified fixed grouping strategy. We prove that our watermark method enjoys guaranteed generation quality, correctness in watermark detection, and is robust against text editing and paraphrasing. Experiments on three varying LLMs and two datasets verify that our Unigram-Watermark achieves superior detection accuracy and comparable generation quality in perplexity, thus promoting the responsible use of LLMs.},
  code      = {https://github.com/XuandongZhao/Unigram-Watermark},
  eprint    = {https://arxiv.org/abs/2306.17439},
  owner     = {lilei.02},
}

@Patent{li2024method,
  nationality = {US},
  number      = {11,954,455 B2},
  year        = {2024},
  yearfiled   = {2021},
  assignee    = {Beijing Bytedance Network Technology Co.},
  author      = {Lei Li and Jun Cao and Minguxan Wang and Zhou Qian},
  day         = {9},
  dayfiled    = {26},
  month       = apr,
  monthfiled  = {#feb#},
  title       = {Method for translating words in a picture, electronic device, and storage medium},
  type        = {patentus},
}

@InProceedings{jain2024where,
  author    = {Sameer Jain and Sedrick Scott Keh and Shova Chhetri and Karun Dewan and Pablo Izquierdo and Johanna Prussmann and Pooja Shrestha and César Suárez and Zheyuan Ryan Shi and Lei Li and Fei Fang},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Where It Really Matters: Few-Shot Environmental Conservation Media Monitoring for Low-Resource Languages},
  year      = {2024},
  month     = feb,
}

@Patent{li2024media,
  nationality = {US},
  number      = {11874869B2},
  year        = {2024},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Gen Li and Yi He and Lei Li and Yitan Li},
  day         = {16},
  dayfiled    = {29},
  month       = jan,
  monthfiled  = {#dec#},
  title       = {Media retrieval method and apparatus},
  owner       = {lilei.02},
}

@InProceedings{dong2023statistical,
  author    = {Qingxiu Dong and Jingjing Xu and Lingpeng Kong and Zhifang Sui and Lei Li},
  booktitle = {the 37th Conference on Neural Information Processing Systems (NeurIPS)},
  title     = {Statistical Knowledge Assessment for Large Language Models},
  year      = {2023},
  month     = dec,
  abstract  = {Given varying prompts regarding a factoid question, can a large language model (LLM) reliably generate factually correct answers? Existing LLMs may generate distinct responses for different prompts. In this paper, we study the problem of quantifying knowledge contained in an LLM regarding a given set of facts. We propose KaRR, a statistical approach to assess factual knowledge for LLMs. The main idea is to estimate the ratio of LLM generating text corresponding to the answer entity given diverse prompts of the subject and the querying relation, versus it generating by random chances. Our assessment suite contains a comprehensive set of 994,123 entities and 600 relations, with 1,395,905 text aliases. We use our method to evaluate 20 LLMs of various sizes, including LLaMA, Alpaca, OPT, etc. Experiments show that our results have a strong correlation (0.43 Kendall's τ) with the results of human assessment on LLMs. Our results reveal that the knowledge in LLMs with the same backbone architecture adheres to the scaling law, while tuning on instruction-following data sometimes compromises the model's capability to generate factually correct text reliably.},
  code      = {https://github.com/dqxiu/KAssess},
  eprint    = {https://arxiv.org/abs/2305.10519},
  owner     = {lilei.02},
}

@InProceedings{zhang2023algo,
  author    = {Kexun Zhang and Danqing Wang and Jingtao Xia and William Yang Wang and Lei Li},
  booktitle = {the 37th Conference on Neural Information Processing Systems (NeurIPS)},
  title     = {ALGO: Synthesizing Algorithmic Programs with Generated Oracle Verifiers},
  year      = {2023},
  month     = dec,
  abstract  = {Large language models (LLMs) excel at implementing code from functionality descriptions but struggle with algorithmic problems that require not only implementation but also identification of the suitable algorithm. Moreover, LLM-generated programs lack guaranteed correctness and require human verification. To address these challenges, we propose ALGO, a framework that synthesizes Algorithmic programs with LLM-Generated Oracles to guide the generation and verify their correctness. ALGO first generates a reference oracle by prompting an LLM to exhaustively enumerate all the combinations of relevant variables. This oracle is then utilized to guide an arbitrary search strategy in exploring the algorithm space and to verify the synthesized algorithms. Our study shows that the LLM-generated oracles are correct for 88% of the cases. With the oracles as verifiers, ALGO can be integrated with any existing code generation model in a model-agnostic manner to enhance its performance. Experiments show that when equipped with ALGO, we achieve an 8x better one-submission pass rate over the Codex model and a 2.6x better one-submission pass rate over CodeT, the current state-of-the-art model on CodeContests. We can also get 1.3x better pass rate over the ChatGPT Code Interpreter on unseen problems.},
  code      = {https://github.com/zkx06111/ALGO},
  eprint    = {https://arxiv.org/abs/2305.14591},
  owner     = {lilei.02},
}

@InProceedings{wang2023learning,
  author    = {Danqing Wang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Learning from Mistakes via Cooperative Study Assistant for Large Language Models},
  year      = {2023},
  month     = dec,
  abstract  = {Large language models (LLMs) have demonstrated their potential to refine their generation based on their own feedback. However, the feedback from LLM itself is often inaccurate, thereby limiting its benefits. In this paper, we propose Study Assistant for Large LAnguage Model (SALAM), a novel framework with an auxiliary agent to assist the main LLM in learning from mistakes through interactive cooperation. In the gathering phase, the student assistant agent probes the main LLM, analyzes its errors, and collects the interaction in a mistake memory. During the examination phase, the study assistant provides guidelines by retrieving relevant cases to help the main LLM anticipate and avoid similar errors. We first investigate the effectiveness of a general study assistant and then customize it to provide LLM-specific guidance through imitation learning from successful guidance experiences. Our experiments on three LLMs using two challenging frameworks demonstrate that SALAM can significantly boost LLMs by an accuracy margin of up to 6.6 on BBH and 12.6 on BBQ.},
  code      = {https://dqwang122.github.io/projects/SALAM},
  eprint    = {https://arxiv.org/abs/2305.13829},
}

@InProceedings{xu2023instructscore,
  author    = {Wenda Xu and Danqing Wang and Liangming Pan and Zhenqiao Song and Markus Freitag and William Yang Wang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {INSTRUCTSCORE: Explainable Text Generation Evaluation with Finegrained Feedback},
  year      = {2023},
  month     = dec,
  abstract  = {Automatically evaluating the quality of language generation is critical. Although recent learned metrics show high correlation with human judgement, these metrics can not explain their verdict or associate the scores with defects in generated text. To address this limitation, we present InstructScore, an explainable evaluation metric for text generation. By harnessing both explicit human instruction and the implicit knowledge of GPT-4, we fine-tune a text evaluation metric based on LLaMA, producing both a score for generated text and a human readable diagnostic report. We evaluate InstructScore on a variety of generation tasks, including translation, captioning, data-to-text and commonsense generation. Experiments show that our 7B model surpasses all other unsupervised metrics, including those based on 175B GPT-3 and GPT-4. Surprisingly, our InstructScore, even without direct supervision from human-rated data, achieves performance levels on par with state-of-the-art metrics like COMET22, which were fine-tuned on human ratings.},
  code      = {https://github.com/xu1998hz/InstructScore_SEScore3},
  eprint    = {https://arxiv.org/abs/2305.14282},
}

@InProceedings{ouyang2023autoplan,
  author    = {Siqi Ouyang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {AutoPlan: Automatic Planning of Interactive Decision-Making Tasks With Large Language Models},
  year      = {2023},
  month     = dec,
  abstract  = {Recent large language models (LLMs) are promising for making decisions in grounded environments. However, LLMs frequently fail in complex decision-making tasks due to the misalignment between the pre-trained knowledge in LLMs and the actual rules in the environment. Existing methods require either costly gradient computation or lengthy in-context demonstrations. In this paper, we propose AutoPlan, an approach to guide LLM-based agents to accomplish interactive decision-making tasks. AutoPlan augments the LLM prompt with a task-solving plan and optimizes it through iterative experience collection and reflection. Our experiments show that AutoPlan, though using no in-context demonstrations, achieves success rates on par with the baselines using human-written demonstrations on ALFWorld and even outperforms them by 8% on HotpotQA.},
  code      = {https://github.com/owaski/AutoPlan},
  eprint    = {https://arxiv.org/abs/2305.15064},
}

@InProceedings{wu2023extrapolating,
  author    = {Bohong Wu and Fei Yuan and Hai Zhao and Lei Li and Jingjing Xu},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Extrapolating Multilingual Understanding Models as Multilingual Generators},
  year      = {2023},
  month     = dec,
  abstract  = {Multilingual understanding models (or encoder-based), pre-trained via masked language modeling, have achieved promising results on many language understanding tasks (e.g., mBERT). However, these non-autoregressive (NAR) models still struggle to generate high-quality texts compared with autoregressive (AR) models. Considering that encoder-based models have the advantage of efficient generation and self-correction abilities, this paper explores methods to empower multilingual understanding models the generation abilities to get a unified model. Specifically, we start from a multilingual encoder (XLM-R) and propose a Semantic-Guided Alignment-then-Denoising (SGA) approach to adapt an encoder to a multilingual generator with a small number of new parameters. Experiments show that the proposed approach is an effective adaption method, outperforming widely-used initialization-based methods with gains of 9.4 BLEU on machine translation, 8.1 Rouge-L on question generation, and 5.5 METEOR on story generation on XLM-Rlarge. On the other hand, we observe that XLM-R is still inferior to mBART in supervised settings despite better results on zero-shot settings, indicating that more exploration is required to make understanding models strong generators.},
  eprint    = {https://arxiv.org/abs/2305.13140},
}

@Patent{du2023video,
  nationality = {US},
  number      = {11580314B2},
  year        = {2023},
  yearfiled   = {2022},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Yuzhang Du and Peihao Zhu and Yiming Chen and Chongxing Zhou and Mingxuan Wang and Lei Li},
  day         = {19},
  dayfiled    = {10},
  month       = sep,
  monthfiled  = {#aug#},
  title       = {Video translation method and apparatus, storage medium, and electronic device},
  comment     = {视频翻译方法和装置、存储介质和电子设备},
  owner       = {lilei.02},
}

@InProceedings{wang2023accelerating,
  author    = {Danqing Wang and Zeyu Wen and Fei Ye and Lei Li and Hao Zhou},
  booktitle = {the 29th SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {Accelerating Antimicrobial Peptide Discovery with Latent Structure},
  year      = {2023},
  month     = aug,
  abstract  = {Antimicrobial peptides (AMPs) are promising therapeutic approaches against drug-resistant pathogens. Recently, deep generative models are used to discover new AMPs. However, previous studies mainly
focus on peptide sequence attributes and do not consider crucial structure information. In this paper, we propose a latent sequence structure model for designing AMPs (LSSAMP). LSSAMP exploits multi-scale vector quantization in the latent space to represent secondary structures (e.g. alpha helix and beta sheet). By sampling in the latent space, LSSAMP can simultaneously generate peptides with ideal sequence attributes and secondary structures. Experimental results show that the peptides generated by LSSAMP have a high probability of antimicrobial activity. Our wet laboratory experiments verified that two of the 21 candidates exhibit strong antimicrobial activity. The code is released at https://github.com/dqwang122/LSSAMP.},
  code      = {https://github.com/dqwang122/LSSAMP},
  eprint    = {https://arxiv.org/abs/2212.09450},
}

@Patent{li2023interactive,
  nationality = {US},
  number      = {11704504B2},
  year        = {2023},
  yearfiled   = {2021},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Lei Li and Mingxuan Wang and Hao Zhou and Zewei Sun},
  day         = {18},
  dayfiled    = {16},
  month       = jul,
  monthfiled  = {#feb#},
  title       = {Interactive machine translation method, electronic device, and computer-readable storage medium},
  owner       = {lilei.02},
}

@InProceedings{song2023importance,
  author    = {Zhenqiao Song and Lei Li},
  booktitle = {Proceedings of the 40th International Conference on Machine Learning (ICML)},
  title     = {Importance Weighted Expectation-Maximization for Protein Sequence Design},
  year      = {2023},
  month     = jul,
  abstract  = {Designing protein sequences with desired biological function is crucial in biology and chemistry. Recent machine learning methods use a surrogate sequence-function model to replace the expensive wet-lab validation. How can we efficiently generate diverse and novel protein sequences with high fitness? In this paper, we propose IsEM-Pro, an approach to generate protein sequences towards a given fitness criterion. At its core, IsEM-Pro is a latent generative model, augmented by combinatorial structure features from a separately learned Markov random fields (MRFs). We develop an Monte Carlo Expectation-Maximization method (MCEM) to learn the model. During inference, sampling from its latent space enhances diversity while its MRFs features guide the exploration in high fitness regions. Experiments on eight protein sequence design tasks show that our IsEM-Pro outperforms the previous best methods by at least 55% on average fitness score and generates more diverse and novel protein sequences.},
  code      = {https://github.com/JocelynSong/IsEM-Pro},
  eprint    = {https://arxiv.org/abs/2305.00386},
}

@InProceedings{zhang2023redi,
  author    = {Kexun Zhang and Xianjun Yang and William Yang Wang and Lei Li},
  booktitle = {Proceedings of the 40th International Conference on Machine Learning (ICML)},
  title     = {{ReDi}: Efficient Learning-Free Diffusion Inference via Trajectory Retrieval},
  year      = {2023},
  month     = jul,
  abstract  = {Diffusion models show promising generation capability for a variety of data. Despite their high generation quality, the inference for diffusion models is still time-consuming due to the numerous sampling iterations required. To accelerate the inference, we propose ReDi, a simple yet learning-free Retrieval-based Diffusion sampling framework. From a precomputed knowledge base, ReDi retrieves a trajectory similar to the partially generated trajectory at an early stage of generation, skips a large portion of intermediate steps, and continues sampling from a later step in the retrieved trajectory. We theoretically prove that the generation performance of ReDi is guaranteed. Our experiments demonstrate that ReDi improves the model inference efficiency by 2x speedup. Furthermore, ReDi is able to generalize well in zero-shot cross-domain image generation such as image stylization.},
  code      = {https://github.com/zkx06111/ReDiffusion},
  eprint    = {https://arxiv.org/abs/2302.02285},
}

@InProceedings{zhao2023protecting,
  author    = {Xuandong Zhao and Yu-Xiang Wang and Lei Li},
  booktitle = {Proceedings of the 40th International Conference on Machine Learning (ICML)},
  title     = {Protecting Language Generation Models via Invisible Watermarking},
  year      = {2023},
  month     = jul,
  abstract  = {Language generation models have been an increasingly powerful enabler for many applications. Many such models offer free or affordable API access, which makes them potentially vulnerable to model extraction attacks through distillation. To protect intellectual property (IP) and ensure fair use of these models, various techniques such as lexical watermarking and synonym replacement have been proposed. However, these methods can be nullified by obvious countermeasures such as "synonym randomization". To address this issue, we propose {GINSEW}, a novel method to protect text generation models from being stolen through distillation. The key idea of our method is to inject secret signals into the probability vector of the decoding steps for each target token. We can then detect the secret message by probing a suspect model to tell if it is distilled from the protected one. Experimental results show that GINSEW can effectively identify instances of IP infringement with minimal impact on the generation quality of protected APIs. Our method demonstrates an absolute improvement of 19 to 29 points on mean average precision (mAP) in detecting suspects compared to previous methods against watermark removal attacks.},
  eprint    = {https://arxiv.org/abs/2302.03162},
}

@InProceedings{gu2023playground,
  author       = {Gu, Tianrui and Chen, Kaie and Ouyang, Siqi and Li, Lei},
  booktitle    = {Proceedings of the Workshop on Natural Language Processing for Indigenous Languages of the Americas (AmericasNLP)},
  title        = {{P}lay{G}round Low Resource Machine Translation System for the 2023 {A}mericas{NLP} Shared Task},
  year         = {2023},
  address      = {Toronto, Canada},
  editor       = {Mager, Manuel and Ebrahimi, Abteen and Oncevay, Arturo and Rice, Enora and Rijhwani, Shruti and Palmer, Alexis and Kann, Katharina},
  month        = jul,
  pages        = {173--176},
  publisher    = {Association for Computational Linguistics},
  abstract     = {This paper presents PlayGround{'}s submission to the AmericasNLP 2023 shared task on machine translation (MT) into indigenous languages. We finetuned NLLB-600M, a multilingual MT model pre-trained on Flores-200, on 10 low-resource language directions and examined the effectiveness of weight averaging and back translation. Our experiments showed that weight averaging, on average, led to a 0.0169 improvement in the ChrF++ score. Additionally, we found that back translation resulted in a 0.008 improvement in the ChrF++ score.},
  doi          = {10.18653/v1/2023.americasnlp-1.19},
  entrysubtype = {workshop},
  url          = {https://aclanthology.org/2023.americasnlp-1.19},
}

@InProceedings{yuan2023lego,
  author    = {Fei Yuan and Yinquan Lu and Wenhao Zhu and Lingpeng Kong and Lei Li and Yu Qiao and Jingjing Xu},
  booktitle = {the 61st Annual Meeting of the Association for Computational Linguistics - Findings (ACL-Findings)},
  title     = {{Lego-MT}: Learning Detachable Models for Massively Multilingual Machine Translation},
  year      = {2023},
  month     = jul,
  abstract  = {Traditional multilingual neural machine translation (MNMT) uses a single model to translate all directions. However, with the increasing scale of language pairs, simply using a single model for massive MNMT brings new challenges: parameter tension and large computations. In this paper, we revisit multi-way structures by assigning an individual branch for each language (group). Despite being a simple architecture, it is challenging to train de-centralized models due to the lack of constraints to align representations from all languages. We propose a localized training recipe to map different branches into a unified space, resulting in an efficient detachable model, Lego-MT. For a fair comparison, we collect data from OPUS and build the first large-scale open-source translation benchmark covering 7 language-centric data, each containing 445 language pairs. Experiments show that Lego-MT (1.2B) brings gains of more than 4 BLEU while outperforming M2M-100 (12B).},
  code      = {https://github.com/CONE-MT/Lego-MT},
  eprint    = {https://arxiv.org/abs/2212.10551},
}

@InProceedings{chen2023say,
  author    = {Jiangjie Chen and Wei Shi and Ziquan Fu and Sijie Cheng and Lei Li and Yanghua Xiao},
  booktitle = {the 61st Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Say What You Mean! Large Language Models Speak Too Positively about Negative Commonsense Knowledge},
  year      = {2023},
  month     = jul,
  abstract  = {Large language models (LLMs) have been widely studied for their ability to store and utilize positive knowledge. However, negative knowledge, such as “lions don’t live in the ocean”, is also ubiquitous in the world but rarely mentioned explicitly in the text. What do LLMs know about negative knowledge? This work examines the ability of LLMs to negative commonsense knowledge. We design a constrained keywords-to-sentence generation task (CG) and a Boolean question-answering task (QA) to probe LLMs. Our experiments reveal that LLMs frequently fail to generate valid sentences grounded in negative commonsense knowledge, yet they can correctly answer polar yes-or-no questions. We term this phenomenon the belief conflict of LLMs. Our further analysis shows that statistical shortcuts and negation reporting bias from language modeling pre-training cause this conflict.},
  code      = {https://github.com/jiangjiechen/uncommongen},
  eprint    = {https://arxiv.org/abs/2305.05976},
}

@InProceedings{ouyang2023waco,
  author    = {Siqi Ouyang and Rong Ye and Lei Li},
  booktitle = {the 61st Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {{WACO}: Word-Aligned Contrastive Learning for Speech Translation},
  year      = {2023},
  month     = jul,
  abstract  = {End-to-end Speech Translation (E2E ST) aims to translate source speech into target translation without generating the intermediate transcript. However, existing approaches for E2E ST degrade considerably when only limited ST data are available. We observe that an ST model's performance strongly correlates with its embedding similarity from speech and transcript. In this paper, we propose Word-Aligned COntrastive learning (WACO), a novel method for few-shot speech-to-text translation. Our key idea is bridging word-level representations for both modalities via contrastive learning. We evaluate WACO and other methods on the MuST-C dataset, a widely used ST benchmark. Our experiments demonstrate that WACO outperforms the best baseline methods by 0.7-8.5 BLEU points with only 1-hour parallel data.},
  code      = {https://github.com/owaski/WACO},
  eprint    = {https://arxiv.org/abs/2212.09359},
}

@InProceedings{xu2023sescore2,
  author    = {Wenda Xu and Xian Qian and Mingxuan Wang and Lei Li and William Yang Wang},
  booktitle = {the 61st Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {{SESCORE2}: Learning Text Generation Evaluation via Synthesizing Realistic Mistakes},
  year      = {2023},
  month     = jul,
  abstract  = {Is it possible to leverage large scale raw and raw parallel corpora to build a general learned metric? Existing learned metrics have gaps to human judgements, are model-dependent or are limited to the domains or tasks where human ratings are available. In this paper, we propose SEScore2, a model-based metric pretrained over million-scale synthetic dataset constructed by our novel retrieval augmented data synthesis pipeline. SEScore2 achieves high correlation to human judgements without any human rating supervisions. Importantly, our unsupervised SEScore2 can outperform supervised metrics, which are trained on the News human ratings, at the TED domain. We evaluate SEScore2 over four text generation tasks across three languages. SEScore2 outperforms all prior unsupervised evaluation metrics in machine translation, speech translation, data-to-text and dialogue generation, with average Kendall improvements 0.158. SEScore2 even outperforms SOTA supervised BLEURT at data-to-text, dialogue generation and overall correlation.data.},
  code      = {https://github.com/xu1998hz/SEScore2},
  eprint    = {https://arxiv.org/abs/2212.09305},
}

@InProceedings{zhao2023pre,
  author    = {Xuandong Zhao and Siqi Ouyang and Zhiguo Yu and Ming Wu and Lei Li},
  booktitle = {the 61st Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Pre-trained Language Models can be Fully Zero-Shot Learners},
  year      = {2023},
  month     = jul,
  abstract  = {How can we extend a pre-trained model to many language understanding tasks, without labeled or additional unlabeled data? Pre-trained language models (PLMs) have been effective for a wide range of NLP tasks. However, existing approaches either require fine-tuning on downstream labeled datasets or manually constructing proper prompts. In this paper, we propose nonparametric prompting PLM (NPPrompt) for fully zero-shot language understanding. Unlike previous methods, NPPrompt uses only pre-trained language models and does not require any labeled data or additional raw corpus for further fine-tuning, nor does it rely on humans to construct a comprehensive set of prompt label words. We evaluate NPPrompt against previous major few-shot and zero-shot learning methods on diverse NLP tasks: including text classification, text entailment, similar text retrieval, and paraphrasing. Experimental results demonstrate that our NPPrompt outperforms the previous best fully zero-shot method by big margins, with absolute gains of 12.8% in accuracy on text classification and 18.9% on the GLUE benchmark.},
  code      = {https://github.com/XuandongZhao/NPPrompt},
  eprint    = {https://arxiv.org/abs/2212.06950},
}

@Patent{du2023document,
  nationality = {US},
  number      = {11580314B2},
  year        = {2023},
  yearfiled   = {2022},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Yuzhang Du and Peihao Zhu and Chongxing Zhou and Yiming Chen and Mingxuan Wang and Lei Li},
  day         = {14},
  dayfiled    = {25},
  month       = feb,
  monthfiled  = {#jul#},
  title       = {Document translation method and apparatus, storage medium, and electronic device},
  owner       = {lilei.02},
}

@Patent{he2023method,
  nationality = {US},
  number      = {11593582B2},
  year        = {2023},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Li, Lei and Yang, Cheng and Li, Gen and Li, Yitan},
  day         = {28},
  dayfiled    = {29},
  month       = feb,
  monthfiled  = {#dec#},
  title       = {Method and device for comparing media features},
  owner       = {lilei.02},
}

@Patent{wang2023speech,
  nationality = {US},
  number      = {11,586,831 B2},
  year        = {2023},
  yearfiled   = {2021},
  assignee    = {Beijing ByteDance Network Technology Co Ltd},
  author      = {Mingxuan Wang and Qianqian Dong and Lei Li},
  day         = {21},
  dayfiled    = {26},
  month       = feb,
  monthfiled  = {#feb#},
  title       = {Speech translation method electronic device and computer-readable storage medium using SEQ2SEQ for determining alternative translated speech segments},
  owner       = {lilei.02},
}

@InProceedings{chen2023converge,
  author    = {Jiangjie Chen and Rui Xu and Wenxuan Zeng and Changzhi Sun and Lei Li and Yanghua Xiao},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Converge to the Truth: Factual Error Correction via Iterative Constrained Editing},
  year      = {2023},
  month     = feb,
  abstract  = {Given a possibly false claim sentence, how can we automatically correct it with minimal editing? Existing methods either require a large number of pairs of false and corrected claims for supervised training or do not handle well errors spanning over multiple tokens within an utterance. In this paper, we propose VENCE, a novel method for factual error correction (FEC) with minimal edits. VENCE formulates the FEC problem as iterative sampling editing actions with respect to a target density function. We carefully design the target function with predicted truthfulness scores from an offline trained fact verification model. VENCE samples the most probable editing positions based on back-calculated gradients of the truthfulness score concerning input tokens and the editing actions using a distantly-supervised language model (T5). Experiments on a public dataset show that VENCE improves the well-adopted SARI metric by 5.3 (or a relative improvement of 11.8%) over the previous best distantly-supervised methods.},
  code      = {https://github.com/jiangjiechen/VENCE},
  eprint    = {https://arxiv.org/abs/2211.12130},
}

@InBook{li2023deep,
  author    = {Lei Li},
  chapter   = {3},
  editor    = {Honglin Li and Mingyue Zheng and Feng Zhu and Fang Bai},
  pages     = {78-94},
  publisher = {Chemical Industry Press},
  title     = {Deep Generative Models},
  year      = {2023},
  isbn      = {978-7-122-42928-5},
  note      = {in Chinese},
  booktitle = {Artificial Intelligence for Drug Discovery},
}

@InProceedings{dong2022calibrating,
  author    = {Qingxiu Dong and Damai Dai and Yifan Song and Jingjing Xu and Zhifang Sui and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Calibrating Factual Knowledge in Pretrained Language Models},
  year      = {2022},
  month     = dec,
  abstract  = {Previous literature has proved that Pretrained Language Models (PLMs) can store factual knowledge. However, we find that facts stored in the PLMs are not always correct. It motivates us to explore a fundamental question: How do we calibrate factual knowledge in PLMs without re-training from scratch? In this work, we propose a simple and lightweight method CaliNet to achieve this goal. To be specific, we first detect whether PLMs can learn the right facts via a contrastive score between right and fake facts. If not, we then use a lightweight method to add and adapt new parameters to specific factual texts. Experiments on the knowledge probing task show the calibration effectiveness and efficiency. In addition, through closed-book question answering, we find that the calibrated PLM possesses knowledge generalization ability after fine-tuning. Beyond the calibration performance, we further investigate and visualize the knowledge calibration mechanism.},
  code      = {https://github.com/dqxiu/calinet},
  eprint    = {https://arxiv.org/abs/2210.03329},
}

@InProceedings{xu2022not,
  author    = {Wenda Xu and Yilin Tuan and Yujie Lu and Michael Saxon and Lei Li and William Yang Wang},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Not All Errors are Equal: Learning Text Generation Metrics using Stratified Error Synthesis},
  year      = {2022},
  month     = dec,
  abstract  = {Is it possible to build a general and automatic natural language generation (NLG) evaluation metric? Existing learned metrics either perform unsatisfactorily or are restricted to tasks where large human rating data is already available. We introduce SESCORE, a model-based metric that is highly correlated with human judgements without requiring human annotation, by utilizing a novel, iterative error synthesis and severity scoring pipeline. This pipeline applies a series of plausible errors to raw text and assigns severity labels by simulating human judgements with entailment. We evaluate SESCORE against existing metrics by comparing how their scores correlate with human ratings. SESCORE outperforms all prior unsupervised metrics on multiple diverse NLG tasks including machine translation, image captioning, and WebNLG text generation. For WMT 20/21 En-De and Zh-En, SESCORE improve the average Kendall correlation with human judgement from 0.154 to 0.195. SESCORE even achieves comparable performance to the best supervised metric COMET, despite receiving no human-annotated training data.},
  eprint    = {https://arxiv.org/abs/2210.05035},
}

@InProceedings{zhao2022distillation,
  author    = {Xuandong Zhao and Lei Li and Yu-Xiang Wang},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Distillation-Resistant Watermarking for Model Protection in NLP},
  year      = {2022},
  month     = dec,
  abstract  = {How can we protect the intellectual property of trained NLP models? Modern NLP models are prone to stealing by querying and distilling from their publicly exposed APIs. However, existing protection methods such as watermarking only work for images but are not applicable to text. We propose Distillation-Resistant Watermarking (DRW), a novel technique to protect NLP models from being stolen via distillation. DRW protects a model by injecting watermarks into the victim's prediction probability corresponding to a secret key and is able to detect such a key by probing a suspect model. We prove that a protected model still retains the original accuracy within a certain bound. We evaluate DRW on a diverse set of NLP tasks including text classification, part-of-speech tagging, and named entity recognition. Experiments show that DRW protects the original model and detects stealing suspects at 100% mean average precision for all four tasks while the prior method fails on two.},
  code      = {https://github.com/xuandongzhao/drw},
  eprint    = {https://arxiv.org/abs/2210.03312},
}

  abstract = {Separating 3D point clouds into individual instances is an important task for 3D vision. It is challenging due to the unknown and varying number of instances in a scene. Existing deep learning based works focus on a two-step pipeline: first learn a feature embedding and then cluster the points. Such a two-step pipeline leads to disconnected intermediate objectives. In this paper, we propose an integrated reformulation of 3D instance segmentation as a per-point classification problem. We propose ICM-3D, a single-step method to segment 3D instances via instantiated categorization. The augmented category information is automatically constructed from 3D spatial positions. We conduct extensive experiments to verify the effectiveness of ICM-3D and show that it obtains inspiring performance across multiple frameworks, backbones and benchmarks.},
  eprint   = {https://arxiv.org/abs/2108.11771},
}

@Article{wang2022solo,
  author  = {Xinlong Wang and Rufeng Zhang and Chunhua Shen and Tao Kong and Lei Li},
  journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)},
  title   = {SOLO: A Simple Framework for Instance Segmentation},
  year    = {2022},
  month   = nov,
  number  = {11},
  pages   = {8587-8601},
  volume  = {44},
  code    = {https://github.com/aim-uofa/AdelaiDet/},
  eprint  = {https://arxiv.org/abs/2106.15947},
}

@InProceedings{wang2022lightseq2,
  author    = {Xiaohui Wang and Yang Wei and Ying Xiong and Guyue Huang and Xian Qian and Yufei Ding and Mingxuan Wang and Lei Li},
  booktitle = {Proceedings of The International Conference for High Performance Computing, Networking, Storage and Analysis ({SC}'22)},
  title     = {{LightSeq2}: Accelerated Training for Transformer-based Models on {GPUs}},
  year      = {2022},
  month     = nov,
  code      = {https://github.com/bytedance/lightseq},
  eprint    = {https://arxiv.org/abs/2110.05722},
}

@Patent{he2022video,
  nationality = {US},
  number      = {11,455,802 B2},
  year        = {2022},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Li, Lei and Yang, Cheng and Li, Gen and Li, Yitan},
  day         = {27},
  dayfiled    = {29},
  month       = sep,
  monthfiled  = {#dec#},
  title       = {Video Feature Extraction Method and Device},
  comment     = {一种视频特征提取方法及装置 CN 201810271774.6  March 29, 2018},
  owner       = {lilei.02},
}

@Patent{yang2022method,
  nationality = {US},
  number      = {11403835B2},
  year        = {2022},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Yang, Cheng and He, Yi and Li, Lei},
  day         = {2},
  dayfiled    = {12},
  month       = aug,
  monthfiled  = {#sep#},
  title       = {Method and device for processing feature point of image},
  comment     = {用于处理图像的特征点的方法和装置},
  owner       = {lilei.02},
}

@InProceedings{lu2022uncovering,
  author    = {Yunfei Lu and Peng Cui and Linyun Yu and Lei Li and Wenwu Zhu},
  booktitle = {the 28th SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {Uncovering the Heterogeneous Effects of Preference Diversity on User Activeness: A Dynamic Mixture Model},
  year      = {2022},
  month     = aug,
}

@Patent{cao2022method,
  nationality = {US},
  number      = {11379664B2},
  year        = {2022},
  yearfiled   = {2020},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Cao, Jun and Li, Lei and Wang, Mingxuan and Zhu, Peihao},
  day         = {5},
  dayfiled    = {28},
  month       = jul,
  monthfiled  = {#feb#},
  title       = {Method for acquiring a parallel corpus, electronic device, and storage medium},
  comment     = {平行语料获取方法、装置、电子设备、及存储
介质},
  owner       = {lilei.02},
}

@InProceedings{chen2022mtg,
  author    = {Chen, Yiran and Song, Zhenqiao and Wu, Xianze and Wang, Danqing and Xu, Jingjing and Chen, Jiaze and Zhou, Hao and Li, Lei},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT Findings)},
  title     = {{MTG}: A Benchmark Suite for Multilingual Text Generation},
  year      = {2022},
  month     = jul,
  publisher = {Association for Computational Linguistics},
  abstract  = {We introduce MTG, a new benchmark suite for training and evaluating multilingual text generation. It is the first and largest multilingual multiway text generation benchmark with 400k human-annotated data for four generation tasks (story generation, question generation, title generation and text summarization) across five languages (English, German, French, Spanish and Chinese). Its multiway characteristic makes it possible to achieve direct cross-lingual generation between any two languages, thus facilitating knowledge transfer. Based on MTG, we set various evaluation scenarios and conduct deep analyses of several popular multilingual generation models from different aspects. Our benchmark suite can foster model performance enhancement with more human-annotated parallel data and encourage model evaluation with more diverse generation scenarios.},
  eprint    = {https://arxiv.org/abs/2108.07140},
  owner     = {lilei.02},
  url       = {https://mtg-benchmark.netlify.app},
}

@InProceedings{ye2022cross,
  author    = {Ye, Rong and Wang, Mingxuan and Li, Lei},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)},
  title     = {Cross-modal Contrastive Learning for Speech Translation},
  year      = {2022},
  month     = jul,
  publisher = {Association for Computational Linguistics},
  abstract  = {How to learn similar representations for spoken utterances and their written text? We believe a unified and aligned representation of speech and text will lead to improvement in speech translation. To this end, we propose ConST, a cross-modal contrastive learning method for end-to-end speech-to-text translation. We evaluate ConST and a variety of previous baselines on multiple language directions (En-De/Fr/Ru) of a popular benchmark MuST-C. Experiments show that the proposed ConST consistently outperforms all previous methods, and achieves the state-of-the-art average BLEU of 28.5. The analysis further verifies that ConST indeed closes the representation gap of different modalities --- its learned representation improves the accuracy of cross-modal text retrieval from 4% to 88%.},
  code      = {https://github.com/ReneeYe/ConST},
  eprint    = {https://arxiv.org/abs/2205.02444},
  owner     = {lilei.02},
}

@InProceedings{zhao2022provably,
  author    = {Zhao, Xuandong and Li, Lei and Wang, Yu-Xiang},
  booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)},
  title     = {Provably Confidential Language Modelling},
  year      = {2022},
  month     = jul,
  publisher = {Association for Computational Linguistics},
  abstract  = {Large language models are shown to memorize privacy information such as social security numbers in training data. Given the sheer scale of the training corpus, it is challenging to screen and filter these privacy data, either manually or automatically. In this paper, we propose Confidentially Redacted Training (CRT), a method to train language generation models while protecting the confidential segments. We borrow ideas from differential privacy (which solves a related but distinct problem) and show that our method is able to provably prevent unintended memorization by randomizing parts of the training process. Moreover, we show that redaction with an approximately correct screening policy amplifies the confidentiality guarantee. We implement the method for both LSTM and GPT language models. Our experimental results show that the models trained by CRT obtain almost the same perplexity while preserving strong confidentiality.},
  eprint    = {https://arxiv.org/abs/2205.01863},
  owner     = {lilei.02},
}

@InProceedings{huang2022learning,
  author    = {Fei Huang and Tianhua Tao and Hao Zhou and Lei Li and Minlie Huang},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning (ICML)},
  title     = {On the Learning of Non-autoregressive Transformers},
  year      = {2022},
  month     = jul,
}

@InProceedings{li2022learning,
  author    = {Yunfei Li and Tao Kong and Lei Li and Yi Wu},
  booktitle = {{IEEE} International Conference on Robotics and Automation ({ICRA})},
  title     = {Learning Design and Construction with Varying-Sized Materials via Prioritized Memory Resets},
  year      = {2022},
  month     = may,
  abstract  = {Can a robot autonomously learn to design and construct a bridge from varying-sized blocks without a blueprint? It is a challenging task with long horizon and sparse reward – the robot has to figure out physically stable design schemes and feasible actions to manipulate and transport blocks. Due to diverse block sizes, the state space and action trajectories are vast to explore. In this paper, we propose a hierarchical approach for this problem. It consists of a reinforcement-learning designer to propose high-level building instructions and a motion-planning-based action generator to manipulate blocks at the low level. For high-level learning, we develop a novel technique, prioritized memory resetting (PMR) to improve exploration. PMR adaptively resets the state to those most critical configurations from a replay buffer so that the robot can resume training on partial architectures instead of from scratch. Furthermore, we augment PMR with auxiliary training objectives and fine-tune the designer with the locomotion generator. Our experiments in simulation and on a real deployed robotic system demonstrate that it is able to effectively construct bridges with blocks of varying sizes at a high success rate. Demos can be found at https://sites.google. com/view/bridge-pmr.},
  code      = {https://github.com/IrisLi17/bridge_construction},
  eprint    = {https://arxiv.org/abs/2204.05509},
  thumbnail = {li2022bridge_robot.jpg},
  url       = {https://sites.google.com/view/bridge-pmr},
}

@InProceedings{ouyang2022impact,
  author       = {Ouyang, Siqi and Ye, Rong and Li, Lei},
  booktitle    = {Proceedings of the 19th International Conference on Spoken Language Translation (IWSLT 2022)},
  title        = {On the Impact of Noises in Crowd-Sourced Data for Speech Translation},
  year         = {2022},
  month        = may,
  pages        = {92--97},
  publisher    = {Association for Computational Linguistics},
  abstract     = {Training speech translation (ST) models requires large and high-quality datasets. MuST-C is one of the most widely used ST benchmark datasets. It contains around 400 hours of speech-transcript-translation data for each of the eight translation directions. This dataset passes several quality-control filters during creation. However, we find that MuST-C still suffers from three major quality issues: audiotext misalignment, inaccurate translation, and unnecessary speaker{'}s name. What are the impacts of these data quality issues for model development and evaluation? In this paper, we propose an automatic method to fix or filter the above quality issues, using English-German (En-De) translation as an example. Our experiments show that ST models perform better on clean test sets, and the rank of proposed models remains consistent across different test sets. Besides, simply removing misaligned data points from the training set does not lead to a better ST model.},
  entrysubtype = {workshop},
}

@InProceedings{bao2022latent,
  author    = {Yu Bao and Hao Zhou and Shujian Huang and Dongqi Wang and Lihua Qian and Xinyu Dai and Jiajun Chen and Lei Li},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {latent-{GLAT}: Glancing at Latent Variables for Parallel Text Generation},
  year      = {2022},
  month     = may,
  abstract  = {Recently, parallel text generation has received widespread attention due to its success in generation efficiency. Although many advanced techniques are proposed to improve its generation quality, they still need the help of an autoregressive model for training to overcome the one-to-many multi-modal phenomenon in the dataset, limiting their applications. In this paper, we propose latent-GLAT, which employs the discrete latent variables to capture word categorical information and invoke an advanced curriculum learning technique, alleviating the multi-modality problem. Experiment results show that our method outperforms strong baselines without the help of an autoregressive model, which further broadens the application scenarios of the parallel decoding paradigm.},
  code      = {https://github.com/baoy-nlp/Latent-GLAT},
  eprint    = {https://openreview.net/forum?id=y4xCe0MSoWx},
}

@InProceedings{dong2022learning,
  author    = {Qianqian Dong and Yaoming Zhu and Mingxuan Wang and Lei Li},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Learning When to Translate for Streaming Speech},
  year      = {2022},
  month     = may,
  abstract  = {How to find proper moments to generate partial sentence translation given a streaming speech input? Existing approaches waiting-and-translating for a fixed duration often break the acoustic units in speech, since the boundaries between acoustic units in speech are not even. In this paper, we propose MoSST, a simple yet effective method for translating streaming speech content. Given a usually long speech sequence, we develop an efficient monotonic segmentation module inside an encoder-decoder model to accumulate acoustic information incrementally and detect proper speech unit boundaries for the input in speech translation task. Experiments on multiple translation directions of the MuST-C dataset show that MoSST outperforms existing methods and achieves the best trade-off between translation quality (BLEU) and latency.},
  code      = {https://github.com/dqqcasia/mosst},
  eprint    = {https://openreview.net/forum?id=mBz73IzOI6},
}

@InProceedings{fang2022stemm,
  author    = {Qingkai Fang and Rong Ye and Lei Li and Yang Feng and Mingxuan Wang},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {{STEMM}: Self-learning with Speech-text Manifold Mixup for Speech Translation},
  year      = {2022},
  month     = may,
  abstract  = {How to learn a better speech representation for end-to-end speech-to-text translation (ST) with limited labeled data? Existing techniques often attempt to transfer powerful machine translation (MT) capabilities to ST, but neglect the representation discrepancy across modalities. In this paper, we propose the Speech-TExt Manifold Mixup (STEMM) method to calibrate such discrepancy. Specifically, we mix up the representation sequences of different modalities, and take both unimodal speech sequences and multimodal mixed sequences as input to the translation model in parallel, and regularize their output predictions with a self-learning framework. Experiments on MuST- C speech translation benchmark and further analysis show that our method effectively alleviates the cross-modal representation discrepancy, and achieves significant improvements over a strong baseline on eight translation directions.},
  code      = {https://github.com/ictnlp/STEMM},
  eprint    = {https://openreview.net/forum?id=kazCgft9cCH},
}

@InProceedings{fu2022contextual,
  author    = {Zhiyi Fu and Wangchunshu Zhou and Jingjing Xu and Hao Zhou and Lei Li},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Contextual Representation Learning beyond Masked Language Modeling},
  year      = {2022},
  month     = may,
  abstract  = {How do masked language models (MLMs) such as BERT learn contextual representations? In this work, we analyze the learning dynamics of MLMs. We find that MLMs adopt sampled embeddings as anchors to estimate and inject contextual semantics to representations, which limits the efficiency and effectiveness of MLMs. To address these issues, we propose TACO, a simple yet effective representation learning approach to directly model global semantics. TACO extracts and aligns contextual semantics hidden in contextualized representations to encourage models to attend global semantics when generating contextualized representations. Experiments on the GLUE benchmark show that TACO achieves up to 5x speedup and up to 1.2 points average improvement over existing MLMs.},
  code      = {https://github.com/FUZHIYI/TACO},
  eprint    = {https://openreview.net/forum?id=KWL_ElhUejN},
}

@InProceedings{chen2022e,
  author    = {Jiangjie Chen and Rui Xu and Ziquan Fu and Wei Shi and Zhongqiao Li and Xinbo Zhang and Changzhi Sun and Lei Li and Yanghua Xiao and Hao Zhou},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {{E-KAR}: A Benchmark for Rationalizing Natural Language Analogical Reasoning},
  year      = {2022},
  month     = may,
  abstract  = {The ability to recognize analogies is fundamental to human cognition. Existing benchmarks to test word analogy do not reveal the underneath process of analogical reasoning of neural models. Holding the belief that models capable of reasoning should be right for the right reasons, we propose a first-of-its- kind Explainable Knowledge-intensive Analogical Reasoning benchmark (E-KAR). Our benchmark consists of 1,655 (in Chinese) and 1,251 (in English) problems sourced from the Civil Service Exams, which require intensive background knowledge to solve. More importantly, we design a free-text explanation scheme to explain whether an analogy should be drawn, and manually annotate them for each and every question and candidate answer. Empirical results suggest that this benchmark is very challenging for some state-of-the-art models for both explanation generation and analogical question answering tasks, which invites further research in this area. Project page of E-KAR can be found at https:// ekar-leaderboard.github.io.},
  eprint    = {https://openreview.net/forum?id=9kXOFRtrEj},
  url       = {https://ekar-leaderboard.github.io},
}

@InProceedings{sun2022rethinking,
  author    = {Zewei Sun and Mingxuan Wang and Hao Zhou and Chengqi Zhao and Shujian Huang and Jiajun Chen and Lei Li},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {Rethinking Document-level Neural Machine Translation},
  year      = {2022},
  month     = may,
  abstract  = {This paper does not aim at introducing a novel model for document-level neural machine translation. Instead, we head back to the original Transformer model and hope to answer the following question: Is the capacity of current models strong enough for document-level translation? Interestingly, we observe that the original Transformer with appropriate training techniques can achieve strong results for document translation, even with a length of 2000 words. We evaluate this model and several recent approaches on nine document-level datasets and two sentence-level datasets across six languages. Experiments show that document-level Transformer models outperforms sentence-level ones and many previous methods in a comprehensive set of metrics, including BLEU, four lexical indices, three newly proposed assistant linguistic indicators, and human evaluation. Our new datasets and evaluation scripts are in https://github. com/sunzewei2715/Doc2Doc_NMT.},
  code      = {https://github.com/sunzewei2715/Doc2Doc_NMT},
  eprint    = {https://openreview.net/forum?id=sU9fYzNZ3xX},
}

@InProceedings{zhao2022compressing,
  author    = {Xuandong Zhao and Zhiguo Yu and Ming Wu and Lei Li},
  booktitle = {the 60th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {Compressing Sentence Representation via Homomorphic Projective Distillation},
  year      = {2022},
  month     = may,
  code      = {https://github.com/XuandongZhao/HPD},
  eprint    = {https://openreview.net/forum?id=n3cvM4Phez9},
}

@InProceedings{song2022switch,
  author    = {Zhenqiao Song and Hao Zhou and Lihua Qian and Jingjing Xu and Shanbo Cheng and Mingxuan Wang and Lei Li},
  booktitle = {International Conference on Learning Representations (ICLR)},
  title     = {{switch-GLAT}: Multilingual Parallel Machine Translation via Code-switch Decoder},
  year      = {2022},
  month     = apr,
  abstract  = {Multilingual machine translation aims to develop a single model for multiple language directions. However, existing multilingual models based on Transformer are limited in terms of both translation performance and inference speed. In this paper, we propose switch-GLAT, a non-autoregressive multilingual machine translation model with a code-switch decoder. It can generate contextual code- switched translations for a given source sentence, and perform code-switch back- translation, greatly boosting multilingual translation performance. In addition, its inference is highly efficient thanks to its parallel decoder. Experiments show that our proposed switch-GLAT outperform the multilingual Transformer with as much as 1.16 BLEU improvement and 6.6x faster decoding speed in inference.},
  eprint    = {https://openreview.net/forum?id=5HvpvYd68b},
  owner     = {lilei.02},
}

@InProceedings{yang2022enhancing,
  author    = {Huiyun Yang and Huadong Chen and Hao Zhou and Lei Li},
  booktitle = {International Conference on Learning Representations (ICLR)},
  title     = {Enhancing Cross-lingual Transfer by Manifold Mixup},
  year      = {2022},
  month     = apr,
  code      = {https://github.com/yhy1117/X-Mixup},
  eprint    = {https://openreview.net/forum?id=OjPmfr9GkVv},
  owner     = {lilei.02},
}

@Patent{he2022method,
  nationality = {US},
  number      = {11,265,598 B2},
  year        = {2022},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Li, Lei and Yang, Cheng and Li, Gen and Li, Yitan},
  day         = {1},
  dayfiled    = {29},
  month       = mar,
  monthfiled  = {#mar#},
  title       = {Method and device for determining duplicate video},
  comment     = {一种重复视频的判断方法及装置},
  owner       = {lilei.02},
}

@InProceedings{chen2022loren,
  author    = {Jiangjie Chen and Qiaoben Bao and Changzhi Sun and Xinbo Zhang and Jiaze Chen and Hao Zhou and Yanghua Xiao and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {{LOREN}: Logic-Regularized Reasoning for Interpretable Fact Verification},
  year      = {2022},
  month     = feb,
  abstract  = {Given a natural language statement, how to verify its veracity against a large-scale textual knowledge source like Wikipedia? Most existing neural models make predictions without giving clues about which part of a false claim goes wrong. In this paper, we propose LOREN, an approach for interpretable fact verification. We decompose the verification of the whole claim at phrase-level, where the veracity of the phrases serves as explanations and can be aggregated into the final verdict according to logical rules. The key insight of LOREN is to represent claim phrase veracity as three-valued latent variables, which are regularized by aggregation logical rules. The final claim verification is based on all latent variables. Thus, LOREN enjoys the additional benefit of interpretability -- it is easy to explain how it reaches certain results with claim phrase veracity. Experiments on a public fact verification benchmark show that LOREN is competitive against previous approaches while enjoying the merit of faithful and accurate interpretability.},
  code      = {https://github.com/jiangjiechen/LOREN},
  eprint    = {https://arxiv.org/abs/2012.13577},
  url       = {https://huggingface.co/spaces/Jiangjie/loren-fact-checking},
}

@InProceedings{chen2022unsupervised,
  author    = {Jiangjie Chen and Chun Gan and Sijie Cheng and Hao Zhou and Yanghua Xiao and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Unsupervised Editing for Counterfactual Stories},
  year      = {2022},
  month     = feb,
  abstract  = {Creating what-if stories requires reasoning about prior statements and possible outcomes of the changed conditions. One can easily generate coherent endings under new conditions, but it would be challenging for current systems to do it with minimal changes to the original story. Therefore, one major challenge is the trade-off between generating a logical story and rewriting with minimal-edits. In this paper, we propose EDUCAT, an editing-based unsupervised approach for counterfactual story rewriting. EDUCAT includes a target position detection strategy based on estimating causal effects of the what-if conditions, which keeps the causal invariant parts of the story. EDUCAT then generates the stories under fluency, coherence and minimal-edits constraints. We also propose a new metric to alleviate the shortcomings of current automatic metrics and better evaluate the trade-off. We evaluate EDUCAT on a public counterfactual story rewriting benchmark. Experiments show that EDUCAT achieves the best trade-off over unsupervised SOTA methods according to both automatic and human evaluation.},
  code      = {https://github.com/jiangjiechen/EDUCAT},
  eprint    = {https://arxiv.org/abs/2112.05417},
}


@InProceedings{huang2022non,
  author    = {Chenyang Huang and Hao Zhou and Osmar Zaiane and Lili Mou and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Non-Autoregressive Translation with Layer-Wise Prediction and Deep Supervision},
  year      = {2022},
  month     = feb,
  abstract  = {How do we perform efficient inference while retaining high translation quality? Existing neural machine translation models, such as Transformer, achieve high performance, but they decode words one by one, which is inefficient. Recent non-autoregressive translation models speed up the inference, but their quality is still inferior. In this work, we propose DSLP, a highly efficient and high-performance model for machine translation. The key insight is to train a non-autoregressive Transformer with Deep Supervision and feed additional Layer-wise Predictions. We conducted extensive experiments on four translation tasks (both directions of WMT'14 EN-DE and WMT'16 EN-RO). Results show that our approach consistently improves the BLEU scores compared with respective base models. Specifically, our best variant outperforms the autoregressive model on three translation tasks, while being 14.8 times more efficient in inference.},
  code      = {https://github.com/chenyangh/DSLP},
  eprint    = {https://arxiv.org/abs/2110.07515},
}

@Article{chu2022icm,
  author  = {Ruihang Chu and Yukang Chen and Tao Kong and Lu Qi and Lei Li},
  journal = {IEEE Robotics and Automation Letters (RA-L)},
  title   = {{ICM-3D}: Instantiated Category Modeling for 3D Instance Segmentation},
  year    = {2022},
  month   = jan,
  number  = {1},
  pages   = {57-64},
  volume  = {7},
  doi     = {10.1109/LRA.2021.3108483},
}

@InProceedings{zheng2021duplex,
  author    = {Zaixiang Zheng and Hao Zhou and Shujian Huang and Jiajun Chen and Jingjing Xu and Lei Li},
  booktitle = {the 35th Conference on Neural Information Processing Systems (NeurIPS)},
  title     = {Duplex Sequence-to-Sequence Learning for Reversible Machine Translation},
  year      = {2021},
  month     = dec,
  abstract  = {Sequence-to-sequence learning naturally has two directions. How to effectively utilize supervision signals from both directions? Existing approaches either require two separate models, or a multitask-learned model but with inferior performance. In this paper, we propose REDER (REversible Duplex TransformER), a parameter-efficient model and apply it to machine translation. Either end of REDER can simultaneously input and output a distinct language. Thus REDER enables reversible machine translation by simply flipping the input and output ends. Experiments verify that REDER achieves the first success of reversible machine translation, which helps outperform its multitask-trained baselines up to 1.3 BLEU.},
  code      = {https://github.com/zhengzx-nlp/REDER},
  eprint    = {https://arxiv.org/abs/2105.03458},
  owner     = {lilei.02},
}

@Patent{li2021audioa,
  nationality = {US},
  number      = {11,182,426 B2},
  year        = {2021},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Gen Li and Lei Li and Yi He},
  day         = {23},
  dayfiled    = {29},
  month       = nov,
  monthfiled  = {#dec#},
  title       = {Audio Retrieval and Identification Method and Device},
  comment     = {一种音频检索识别方法及装置},
  owner       = {lilei.02},
}

@InProceedings{qian2021volctrans,
  author       = {Lihua Qian and Yi Zhou and Zaixiang Zheng and Yaoming Zhu and Zehui Lin and Jiangtao Feng and Shanbo Cheng and Lei Li and Mingxuan Wang and Hao Zhou},
  booktitle    = {Sixth Conference on Machine Translation (WMT21)},
  title        = {The {Volctrans} {GLAT} System: Non-autoregressive Translation Meets {WMT21}},
  year         = {2021},
  month        = nov,
  abstract     = {This paper describes the Volctrans' submission to the WMT21 news translation shared task for German->English translation. We build a parallel (i.e., non-autoregressive) translation system using the Glancing Transformer, which enables fast and accurate parallel decoding in contrast to the currently prevailing autoregressive models. To the best of our knowledge, this is the first parallel translation system that can be scaled to such a practical scenario like WMT competition. More importantly, our parallel translation system achieves the best BLEU score (35.0) on German->English translation task, outperforming all strong autoregressive counterparts.},
  entrysubtype = {workshop},
  eprint       = {https://arxiv.org/abs/2109.11247},
}

@InProceedings{jiang2021learning,
  author    = {Qingnan Jiang and Mingxuan Wang and Jun Cao and Shanbo Cheng and Shujian Huang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Learning Kernel-Smoothed Machine Translation with Retrieved Examples},
  year      = {2021},
  month     = nov,
  abstract  = {How to effectively adapt neural machine translation  (NMT)  models  according  to  emergingcases  without  retraining?    Despite  the  greatsuccess  of  neural  machine  translation,  updating the deployed models online remains a challenge. Existing  non-parametric  approachesthat retrieve similar examples from a databaseto guide the translation process are promisingbut are prone to overfit the retrieved examples. However,  non-parametric  methods  are  proneto overfit the retrieved examples. In this work,we propose to learn Kernel-Smoothed Translation with Example Retrieval (KSTER), an effective approach to adapt neural machine translation models online.  Experiments on domainadaptation and multi-domain machine translation  datasets  show  that  even  without  expensive retraining, KSTER is able to achieve im-provement  of  1.1  to  1.5  BLEU  scores  overthe  best  existing  online  adaptation  methods. The  code  and  trained  models  are  released  at https://github.com/jiangqn/KSTER.},
  code      = {https://github.com/jiangqn/KSTER},
  eprint    = {https://arxiv.org/abs/2109.09991},
  video     = {https://underline.io/lecture/38697-learning-kernel-smoothed-machine-translation-with-retrieved-examples},
}

@InProceedings{ru2021learning,
  author    = {Dongyu Ru and Changzhi Sun and Jiangtao Feng and Lin Qiu and Hao Zhou and Weinan Zhang and Yong Yu and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Learning Logic Rules for Document-level Relation Extraction},
  year      = {2021},
  month     = nov,
  abstract  = {Document-level relation extraction aims to identify relations between entities in a whole document. Prior efforts to capture long-range dependencies have relied heavily on implicitly powerful representations learned through (graph) neural networks, which makes the model less transparent. To tackle this challenge, in this paper, we propose LogiRE, a novel probabilistic model for document-level relation extraction by learning logic rules. LogiRE treats logic rules as latent variables and consists of two modules: a rule generator and a relation extractor. The rule generator is to generate logic rules potentially contributing to final predictions, and the relation extractor outputs final predictions based on the generated logic rules. Those two modules can be efficiently optimized with the expectation--maximization (EM) algorithm. By introducing logic rules into neural networks, LogiRE can explicitly capture long-range dependencies as well as enjoy better interpretation. Empirical results show that LogiRE significantly outperforms several strong baselines in terms of relation performance (∼1.8 F1 score) and logical consistency (over 3.3 logic score). Our code is available at https://github. com/rudongyu/LogiRE.},
  code      = {https://github.com/rudongyu/LogiRE},
  eprint    = {https://arxiv.org/abs/2111.05407},
  video     = {https://underline.io/lecture/38055-learning-logic-rules-for-document-level-relation-extraction},
}

@InProceedings{zeng2021gradient,
  author    = {Zhiyuan Zeng and Jiaze Chen and Weiran Xu and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Gradient-based Adversarial Factual Consistency Evaluation for Abstractive Summarization},
  year      = {2021},
  month     = nov,
  abstract  = {Neural abstractive summarization systems have gained significant progress in recent years. However, abstractive summarization often produce inconsisitent statements or false facts. How to automatically generate highly abstract yet factually correct summaries? In this paper, we proposed an efficient weak-supervised adversarial data augmentation approach to form the factual consistency dataset. Based on the artificial dataset, we train an evaluation model that can not only make accurate and robust factual consistency discrimination but is also capable of making interpretable factual errors tracing by backpropagated gradient distribution on token embeddings. Experiments and analysis conduct on public annotated summarization and factual consistency datasets demonstrate our approach effective and reasonable.},
  code      = {https://github.com/parZival27/GrAdualCC},
  eprint    = {https://aclanthology.org/2021.emnlp-main.337/},
}

@InProceedings{sun2021multilingual,
  author    = {Zewei Sun and Mingxuan Wang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Multilingual Translation via Grafting Pre-trained Language Models},
  year      = {2021},
  month     = nov,
  abstract  = {Can pre-trained BERT for one language and GPT for another be glued together to translate texts? Self-supervised training using only monolingual data has led to the success of pre-trained (masked) language models in many NLP tasks. However, directly connecting BERT as an encoder and GPT as a decoder can be challenging in machine translation, for GPT-like models lack a cross-attention component that is needed in seq2seq decoders. In this paper, we propose Graformer to graft separately pre-trained (masked) language models for machine translation. With monolingual data for pre-training and parallel data for grafting training, we maximally take advantage of the usage of both types of data. Experiments on 60 directions show that our method achieves average improvements of 5.8 BLEU in x2en and 2.9 BLEU in en2x directions comparing with the multilingual Transformer of the same size.},
  code      = {https://github.com/sunzewei2715/Graformer},
  eprint    = {https://arxiv.org/abs/2109.05256},
}

@InProceedings{wang2021secoco,
  author    = {Tao Wang and Chengqi Zhao and Mingxuan Wang and Lei Li and Hang Li and Deyi Xiong},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Secoco: Self-Correcting Encoding for Neural Machine Translation},
  year      = {2021},
  month     = nov,
  abstract  = {Different from previous robust approaches, Secoco enables NMT to explicitly correct noisy inputs and delete specific errors simultaneously with the translation decoding process. Secoco is able to achieve significant improvements of 1.6 BLEU points over strong baselines on two real-world test sets and a benchmark WMT dataset  with good interpretability.
The code and dataset are publicly available at \url{https://github.com/rgwt123/Secoco}.},
  code      = {https://github.com/rgwt123/Secoco},
  eprint    = {https://arxiv.org/abs/2108.12137},
}

@InProceedings{zhu2021counter,
  author    = {Yaoming Zhu and Jiangtao Feng and Chengqi Zhao and Mingxuan Wang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Counter-Interference Adapter for Multilingual Machine Translation},
  year      = {2021},
  month     = nov,
  abstract  = {Developing  a  unified  multilingual  model  haslong  been  a  pursuit  for  machine  translation. However, existing approaches suffer from performance degradation — a single multilingualmodel  is  inferior  to  separately  trained  bilingual ones on rich-resource languages. We conjecture  that such  a phenomenon  is due  to interference  caused  by  joint  training  with  multiple  languages.   To  accommodate  the  issue,we  propose  CIAT,  an  adapted  Transformermodel  with  a  small  parameter  overhead  formultilingual  machine  translation.   We  evaluate CIAT on multiple benchmark datasets, including  IWSLT,  OPUS-100,  and  WMT.  Experiments  show  that  CIAT  consistently  outperforms strong multilingual baselines on 64 of  total  66  language  directions,  42  of  whichsee  above  0.5  BLEU  improvement. Our code  is  available  at https://github.com/Yaoming95/CIAT.},
  code      = {https://github.com/Yaoming95/CIAT},
  eprint    = {https://arxiv.org/abs/2104.08154},
}

@InProceedings{wang2021cnewsum,
  author    = {Danqing Wang and Jiaze Chen and Xianze Wu and Hao Zhou and Lei Li},
  booktitle = {The 10th CCF International Conference on Natural Language Processing and Chinese Computing (NLPCC)},
  title     = {{CNewSum}: A Large-scale Chinese News Summarization Dataset with Human-annotated Adequacy and Deducibility Level},
  year      = {2021},
  address   = {Qingdao, China},
  month     = oct,
  abstract  = {Automatic text summarization aims to produce a brief but crucial summary for the input documents. Both extractive and abstractive methods have witnessed great success in English datasets in recent years. However, there has been a minimal exploration of text summarization in Chinese, limited by the lack of large-scale datasets. In this paper, we present a large-scale Chinese news summarization dataset CNewSum, which consists of 304,307 documents and human-written summaries for the news feed. It has long documents with high-abstractive summaries, which can encourage document-level understanding and generation for current summarization models. An additional distinguishing feature of CNewSum is that its test set contains adequacy and deducibility annotations for the summaries. The adequacy level measures the degree of summary information covered by the document, and the deducibility indicates the reasoning ability the model needs to generate the summary. These annotations can help researchers analyze and target their model performance bottleneck. We examine recent methods on CNewSum and release our dataset to provide a solid testbed for automatic Chinese summarization research.},
  eprint    = {https://arxiv.org/abs/2110.10874},
  url       = {https://dqwang122.github.io/projects/CNewSum/},
}

@InProceedings{li2021learning,
  author    = {Yunfei Li and Tao Kong and Lei Li and Yifeng Li and Yi Wu},
  booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
  title     = {Learning to Design and Construct Bridge without Blueprint},
  year      = {2021},
  month     = sep,
  abstract  = {Autonomous assembly has been a desired functionality of many intelligent robot systems. We study a new challenging assembly task, designing and constructing a bridge without a blueprint. In this task, the robot needs to first design a feasible bridge architecture for arbitrarily wide cliffs and then manipulate the blocks reliably to construct a stable bridge according to the proposed design. In this paper, we propose a bi-level approach to tackle this task. At the high level, the system learns a bridge blueprint policy in a physical simulator using deep reinforcement learning and curriculum learning. A policy is represented as an attention-based neural network with object-centric input, which enables generalization to different numbers of blocks and cliff widths. For low-level control, we implement a motion-planning-based policy for real-robot motion control, which can be directly combined with a trained blueprint policy for real-world bridge construction without tuning. In our field study, our bi-level robot system demonstrates the capability of manipulating blocks to construct a diverse set of bridges with different architectures.},
  eprint    = {https://arxiv.org/abs/2108.02439},
}

@InProceedings{li2021simultaneous,
  author    = {Yiming Li and Tao Kong and Ruihang Chu and Yifeng Li and Peng Wang and Lei Li},
  booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
  title     = {Simultaneous Semantic and Collision Learning for 6-DoF Grasp Pose Estimation},
  year      = {2021},
  month     = sep,
  abstract  = {Grasping in cluttered scenes has always been a great challenge for robots, due to the requirement of the ability to well understand the scene and object information. Previous works usually assume that the geometry information of the objects is available, or utilize a step-wise, multi-stage strategy to predict the feasible 6-DoF grasp poses. In this work, we propose to formalize the 6-DoF grasp pose estimation as a simultaneous multi-task learning problem. In a unified framework, we jointly predict the feasible 6-DoF grasp poses, instance semantic segmentation, and collision information. The whole framework is jointly optimized and end-to-end differentiable. Our model is evaluated on large-scale benchmarks as well as the real robot system. On the public dataset, our method outperforms prior state-of-the-art methods by a large margin (+4.08 AP). We also demonstrate the implementation of our model on a real robotic platform and show that the robot can accurately grasp target objects in cluttered scenarios with a high success rate.},
  eprint    = {https://arxiv.org/abs/2108.02425},
}

@InProceedings{shi2021follow,
  author    = {Wenxian Shi and Yuxuan Song and Hao Zhou and Bohan Li and Lei Li},
  booktitle = {Proc. of ECML-PKDD},
  title     = {Follow Your Path: a Progressive Method for Knowledge Distillation},
  year      = {2021},
  month     = sep,
  abstract  = {Deep neural networks often have a huge number of parameters, which posts challenges in deployment in application scenarios with limited memory and computation capacity. Knowledge distillation is one approach to derive compact models from bigger ones. However, it has been observed that a converged heavy teacher model is strongly constrained for learning a compact student network and could make the optimization subject to poor local optima. In this paper, we propose ProKT, a new model-agnostic method by projecting the supervision signals of a teacher model into the student's parameter space. Such projection is implemented by decomposing the training objective into local intermediate targets with an approximate mirror descent technique. The proposed method could be less sensitive with the quirks during optimization which could result in a better local optimum. Experiments on both image and text datasets show that our proposed ProKT consistently achieves superior performance compared to other existing knowledge distillation methods.},
  eprint    = {https://arxiv.org/abs/2107.09305},
}

@InProceedings{ye2021end,
  author    = {Rong Ye and Mingxuan Wang and Lei Li},
  booktitle = {Proc. of INTERSPEECH},
  title     = {End-to-end Speech Translation via Cross-modal Progressive Training},
  year      = {2021},
  month     = aug,
  abstract  = {End-to-end speech translation models have become a new trend in research due to their potential of reducing error propagation. However, these models still suffer from the challenge of data scarcity. How to effectively use unlabeled or other parallel corpora from machine translation is promising but still an open problem. In this paper, we propose Cross Speech-Text Network (XSTNet), an end-to-end model for speech-to-text translation. XSTNet takes both speech and text as input and outputs both transcription and translation text. The model benefits from its three key design aspects: a self-supervised pre-trained sub-network as the audio encoder, a multi-task training objective to exploit additional parallel bilingual text, and a progressive training procedure. We evaluate the performance of XSTNet and baselines on the MuST-C En-X and LibriSpeech En-Fr datasets. In particular, XSTNet achieves state-of-the-art results on all language directions with an average BLEU of 28.8, outperforming the previous best method by 3.2 BLEU. Code, models, cases, and more detailed analysis are available at.},
  code      = {https://github.com/ReneeYe/XSTNet},
  eprint    = {https://arxiv.org/abs/2104.10380},
}

@InProceedings{lin2021learning,
  author    = {Zehui Lin and Liwei Wu and Mingxuan Wang and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Learning Language Specific Sub-network for Multilingual Machine Translation},
  year      = {2021},
  month     = aug,
  abstract  = {Multilingual neural machine translation aimsat learning a single translation model for muliple  languages.  These  jointly  trained  mod-els often suffer from performance degradationon rich-resource language pairs. We attributethis degeneration to parameter interference. Inthis paper, we propose LaSS to jointly train asingle  unified  multilingual  MT  model.  LaSS learns Language Secific Sub-network (LaSS)for  each  language  pair  to  counter  parameterinterference.  Comprehensive  experiments  on IWSLT and WMT datasets with various Transformer  architectures  show  that  LaSS  obtainsgains on 36 language pairs by up to 1.2 BLEU.Besides, LaSS shows its strong generalization performance  at  easy  adaptation  to  new  lan-guage  pairs  and  zero-shot  translation.  LaSS boosts  zero-shot  translation  with  an  averageof  8.3  BLEU  on  30  language  pairs.},
  code      = {https://github.com/NLP-Playground/LaSS},
  eprint    = {https://arxiv.org/abs/2105.09259},
  timestamp = {2020-05-01},
}

@InProceedings{pan2021contrastive,
  author    = {Xiao Pan and Liwei Wu and Mingxuan Wang and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Contrastive Learning for Many-to-many Multilingual Neural Machine Translation},
  year      = {2021},
  month     = aug,
  abstract  = {Existing multilingual machine translation approaches mainly focus on English-centric directions, while the non-English directions still lag behind. In this work, we aim to build a many-to-many translation system with an emphasis on the quality of non-English language directions. Our intuition is based on the hypothesis that a universal cross-language representation leads to better multilingual translation performance. To this end, we propose mRASP2, a training method to obtain a single unified multilingual translation model. mRASP2 is empowered by two techniques: a) a contrastive learning scheme to close the gap among representations of different languages, and b) data augmentation on both multiple parallel and monolingual data to further align token representations. For English-centric directions, mRASP2 outperforms existing best unified model and achieves competitive or even better performance than the pre-trained and fine-tuned model mBART on tens of WMT's translation directions. For non-English directions, mRASP2 achieves an improvement of average 10+ BLEU compared with the multilingual Transformer baseline.},
  code      = {https://github.com/PANXiao1994/mRASP2},
  eprint    = {https://arxiv.org/abs/2105.09501},
  slides    = {pubs/mRASP2_ACL2021.pdf},
  timestamp = {2020-05-01},
  url       = {https://medium.com/@panxiao1994/mrasp2-multilingual-nmt-advances-via-contrastive-learning-ac8c4c35d63},
  video     = {https://underline.io/lecture/25372-contrastive-learning-for-many-to-many-multilingual-neural-machine-translation},
}

@InProceedings{qian2021glancing,
  author    = {Lihua Qian and Hao Zhou and Yu Bao and Mingxuan Wang and Lin Qiu and Weinan Zhang and Yong Yu and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Glancing Transformer for Non-Autoregressive Neural Machine Translation},
  year      = {2021},
  month     = aug,
  abstract  = {Recent work on non-autoregressive neural machine translation (NAT) aims at improving the efficiency by parallel decoding without sacrificing the quality. However, existing NAT methods are either inferior to Transformer or require multiple decoding passes, leading to reduced speedup. We propose the Glancing Language Model (GLM), a method to learn word interdependency for single-pass parallel generation models. With GLM, we develop Glancing Transformer (GLAT) for machine translation. With only single-pass parallel decoding, GLAT is able to generate high-quality translation with 8-15 times speedup. Experiments on multiple WMT language directions show that GLAT outperforms all previous single pass non-autoregressive methods, and is nearly comparable to Transformer, reducing the gap to 0.25-0.9 BLEU points.},
  code      = {https://github.com/FLC777/GLAT},
  comment   = {The main algorithm that achieves top 1 BLEU scores in WMT21 En-De and De-En machine translation contest.},
  eprint    = {https://arxiv.org/abs/2008.07905},
  timestamp = {2020-05-01},
}

@InProceedings{wang2021unire,
  author    = {Yijun Wang and Changzhi Sun and Yuanbin Wu and Hao Zhou and Lei Li and Junchi Yan},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {UniRE: A Unified Label Space for Entity Relation Extraction},
  year      = {2021},
  month     = aug,
  code      = {https://github.com/Receiling/UniRE},
  eprint    = {https://arxiv.org/abs/2107.04292},
  timestamp = {2020-05-01},
}

@InProceedings{xu2021document,
  author    = {Runxin Xu and Tianyu Liu and Lei Li and Baobao Chang},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Document-level Event Extraction via Heterogeneous Graph-based Interaction Model with a Tracker},
  year      = {2021},
  month     = aug,
  code      = {https://github.com/RunxinXu/GIT},
  eprint    = {https://arxiv.org/abs/2105.14924},
  timestamp = {2020-05-01},
}

@InProceedings{xu2021vocabulary,
  author    = {Jingjing Xu and Hao Zhou and Chun Gan and Zaixiang Zheng and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Vocabulary Learning via Optimal Transport for Neural Machine Translation},
  year      = {2021},
  month     = aug,
  abstract  = {The choice of token vocabulary affects the performance of machine translation. This paper aims to figure out what is a good vocabulary and whether one can find the optimal vocabulary without trial training. To answer these questions, we first provide an alternative understanding of the role of vocabulary from the perspective of information theory. Motivated by this, we formulate the quest of vocabularization -- finding the best token dictionary with a proper size -- as an optimal transport (OT) problem. We propose VOLT, a simple and efficient solution without trial training. Empirical results show that VOLT outperforms widely-used vocabularies in diverse scenarios, including WMT-14 English-German and TED's 52 translation directions. For example, VOLT achieves almost 70% vocabulary size reduction and 0.5 BLEU gain on English-German translation. Also, compared to BPE-search, VOLT reduces the search time from 384 GPU hours to 30 GPU hours on English-German translation.},
  addendum  = {ACL Best Paper Award (1/3350)},
  blog      = {https://jingjing-nlp.github.io/volt-blog/},
  code      = {https://github.com/Jingjing-NLP/VOLT},
  eprint    = {https://arxiv.org/abs/2012.15671},
  timestamp = {2020-05-01},
  video     = {https://underline.io/lecture/25691-vocabulary-learning-via-optimal-transport-for-neural-machine-translation},
}

@InProceedings{han2021learning,
  author    = {Chi Han and Mingxuan Wang and Heng Ji and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {Learning Shared Semantic Space for Speech-to-Text Translation},
  year      = {2021},
  month     = aug,
  code      = {https://github.com/Glaciohound/Chimera-ST},
  eprint    = {https://arxiv.org/abs/2105.03095},
  timestamp = {2020-05-01},
  video     = {https://www.youtube.com/watch?v=jkrl0gjVIyQ},
}

@InProceedings{sun2021probabilistic,
  author    = {Changzhi Sun and Xinbo Zhang and Jiangjie Chen and Chun Gan and Yuanbin Wu and Jiaze Chen and Hao Zhou and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {Probabilistic Graph Reasoning for Natural Proof Generation},
  year      = {2021},
  month     = aug,
  code      = {https://github.com/changzhisun/PRobr/},
  eprint    = {https://arxiv.org/abs/2107.02418},
  timestamp = {2020-05-01},
}

@InProceedings{wang2021contrastive,
  author    = {Danqing Wang and Jiaze Chen and Hao Zhou and Xipeng Qiu and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {Contrastive Aligned Joint Learning for Multilingual Summarization},
  year      = {2021},
  month     = aug,
  code      = {https://github.com/dqwang122/CALMS},
  timestamp = {2020-05-01},
  url       = {https://dqwang122.github.io/projects/CALMS/},
  video     = {https://underline.io/lecture/26333-contrastive-aligned-joint-learning-for-multilingual-summarization},
}

@InProceedings{wu2021language,
  author    = {Liwei Wu and Shanbo Cheng and Mingxuan Wang and Lei Li},
  booktitle = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL) - Findings},
  title     = {Language Tags Matter for Zero-Shot Neural Machine Translation},
  year      = {2021},
  month     = aug,
  eprint    = {https://arxiv.org/abs/2106.07930},
  timestamp = {2020-05-01},
}

@InProceedings{zhao2021neurst,
  author       = {Chengqi Zhao and Mingxuan Wang and Qianqian Dong and Rong Ye and Lei Li},
  booktitle    = {the 59th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations},
  title        = {{NeurST}: Neural Speech Translation Toolkit},
  year         = {2021},
  month        = aug,
  code         = {https://github.com/bytedance/neurst},
  entrysubtype = {demo},
  eprint       = {https://arxiv.org/abs/2012.10018},
  timestamp    = {2020-05-01},
}

@InProceedings{zhao2021volctrans,
  author       = {Chengqi Zhao and Zhicheng Liu and Jian Tong and Tao Wang and Mingxuan Wang and Rong Ye and Qianqian Dong and Jun Cao and Lei Li},
  booktitle    = {The International Conference on Spoken Language Translation (IWSLT)},
  title        = {The {Volctrans} Neural Speech Translation System for {IWSLT} 2021},
  year         = {2021},
  month        = aug,
  entrysubtype = {workshop},
  eprint       = {https://arxiv.org/abs/2105.07319},
}

@Patent{he2021video,
  nationality = {US},
  number      = {11,055,536 B2},
  year        = {2021},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Li, Lei and Yang, Cheng and Li, Gen and Li, Yitan},
  day         = {6},
  dayfiled    = {29},
  month       = jul,
  monthfiled  = {#dec#},
  title       = {Video Feature Extraction Method and Device},
  comment     = {一种视频特征提取方法及装置},
  owner       = {lilei.02},
}

@InProceedings{jing2021adversarial,
  author    = {Mingxuan Jing and Wenbing Huang and Fuchun Sun and Xiaojian Ma and Tao Kong and Chuang Gan and Lei Li},
  booktitle = {Proceedings of the 38th International Conference on Machine Learning (ICML)},
  title     = {Adversarial Option-Aware Hierarchical Imitation Learning},
  year      = {2021},
  month     = jul,
  eprint    = {https://arxiv.org/abs/2106.05530},
}

@InProceedings{long2021generative,
  author    = {Long, Quanyu and Wang, Mingxuan and Li, Lei},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)},
  title     = {Generative Imagination Elevates Machine Translation},
  year      = {2021},
  address   = {Online},
  month     = jun,
  pages     = {5738--5748},
  publisher = {Association for Computational Linguistics},
  abstract  = {There are common semantics shared across text and images. Given a sentence in a source language, whether depicting the visual scene helps translation into a target language? Existing multimodal neural machine translation methods (MNMT) require triplets of bilingual sentence - image for training and tuples of source sentence - image for inference. In this paper, we propose ImagiT, a novel machine translation method via visual imagination. ImagiT first learns to generate visual representation from the source sentence, and then utilizes both source sentence and the {``}imagined representation{''} to produce a target translation. Unlike previous methods, it only needs the source sentence at the inference time. Experiments demonstrate that ImagiT benefits from visual imagination and significantly outperforms the text-only neural machine translation baselines. Further analysis reveals that the imagination process in ImagiT helps fill in missing information when performing the degradation strategy.},
  eprint    = {https://arxiv.org/abs/2009.09654},
  owner     = {lilei.02},
  timestamp = {2021-03-11},
}

@InProceedings{wang2021autocorrect,
  author    = {Wang, Tao and Zhao, Chengqi and Wang, Mingxuan and Li, Lei and Xiong, Deyi},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers (NAACL-HLT)},
  title     = {Autocorrect in the Process of Translation {---} Multi-task Learning Improves Dialogue Machine Translation},
  year      = {2021},
  address   = {Online},
  month     = jun,
  pages     = {105--112},
  publisher = {Association for Computational Linguistics},
  abstract  = {Automatic translation of dialogue texts is a much needed demand in many real life scenarios. However, the currently existing neural machine translation delivers unsatisfying results. In this paper, we conduct a deep analysis of a dialogue corpus and summarize three major issues on dialogue translation, including pronoun dropping (), punctuation dropping (), and typos (). In response to these challenges, we propose a joint learning method to identify omission and typo, and utilize context to translate dialogue utterances. To properly evaluate the performance, we propose a manually annotated dataset with 1,931 Chinese-English parallel utterances from 300 dialogues as a benchmark testbed for dialogue translation. Our experiments show that the proposed method improves translation quality by 3.2 BLEU over the baselines. It also elevates the recovery rate of omitted pronouns from 26.09{\%} to 47.16{\%}.},
  eprint    = {https://arxiv.org/abs/2103.16189},
  owner     = {lilei.02},
}

@InProceedings{wang2021cross,
  author    = {Wang, Mingxuan and Bai, Hongxiao and Zhao, Hai and Li, Lei},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers (NAACL-HLT)},
  title     = {Cross-lingual Supervision Improves Unsupervised Neural Machine Translation},
  year      = {2021},
  address   = {Online},
  month     = jun,
  pages     = {89--96},
  publisher = {Association for Computational Linguistics},
  abstract  = {We propose to improve unsupervised neural machine translation with cross-lingual supervision (), which utilizes supervision signals from high resource language pairs to improve the translation of zero-source languages. Specifically, for training En-Ro system without parallel corpus, we can leverage the corpus from En-Fr and En-De to collectively train the translation from one language into many languages under one model. {\%} is based on multilingual models which require no changes to the standard unsupervised NMT. Simple and effective, significantly improves the translation quality with a big margin in the benchmark unsupervised translation tasks, and even achieves comparable performance to supervised NMT. In particular, on WMT{'}14 -tasks achieves 37.6 and 35.18 BLEU score, which is very close to the large scale supervised setting and on WMT{'}16 -tasks achieves 35.09 BLEU score which is even better than the supervised Transformer baseline.},
  eprint    = {https://arxiv.org/abs/2004.03137},
  owner     = {lilei.02},
}

@InProceedings{wang2021lightseq,
  title = "{L}ight{S}eq: A High Performance Inference Library for Transformers",
    author = "Wang, Xiaohui  and
      Xiong, Ying  and
      Wei, Yang  and
      Wang, Mingxuan  and
      Li, Lei",
    booktitle = "Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Industry Papers (NAACL-HLT)",
    month = jun,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    pages = "113--120",
    abstract = "Transformer and its variants have achieved great success in natural language processing. Since Transformer models are huge in size, serving these models is a challenge for real industrial applications. In this paper, we propose , a highly efficient inference library for models in the Transformer family. includes a series of GPU optimization techniques to both streamline the computation of Transformer layers and reduce memory footprint. supports models trained using PyTorch and Tensorflow. Experimental results on standard machine translation benchmarks show that achieves up to 14x speedup compared with TensorFlow and 1.4x speedup compared with , a concurrent CUDA implementation. The code will be released publicly after the review.",
  code      = {https://github.com/bytedance/lightseq},
  eprint    = {https://arxiv.org/abs/2010.13887},
  owner     = {lilei.02},
}

@InProceedings{chen2021scale,
  author    = {Yukang Chen and Yanwei Li and Tao Kong and Lu Qi and Ruihang Chu and Lei Li and Jiaya Jia},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Scale-aware Automatic Augmentation for Object Detection},
  year      = {2021},
  month     = jun,
  code      = {https://github.com/dvlab-research/SA-AutoAug},
  eprint    = {https://arxiv.org/abs/2103.17220},
  owner     = {lilei.02},
}

@InProceedings{jing2021locate,
  author    = {Ya Jing and Tao Kong and Wei Wang and Liang Wang and Lei Li and Tieniu Tan},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Locate then Segment: A Strong Pipeline for Referring Image Segmentation},
  year      = {2021},
  month     = jun,
  eprint    = {https://arxiv.org/abs/2103.16284},
  owner     = {lilei.02},
}

@InProceedings{sun2021sparse,
  author    = {Peize Sun and Rufeng Zhang and Yi Jiang and Tao Kong and Chenfeng Xu and Wei Zhan and Masayoshi Tomizuka and Lei Li and Zehuan Yuan and Changhu Wang and Ping Luo},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Sparse {R-CNN}: End-to-End Object Detection with Learnable Proposals},
  year      = {2021},
  month     = jun,
  code      = {https://github.com/PeizeSun/SparseR-CNN},
  eprint    = {https://arxiv.org/abs/2011.12450},
  owner     = {lilei.02},
}

@InProceedings{wang2021dense,
  author    = {Xinlong Wang and Rufeng Zhang and Chunhua Shen and Tao Kong and Lei Li},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Dense Contrastive Learning for Self-Supervised Visual Pre-Training},
  year      = {2021},
  month     = jun,
  abstract  = {To date, most existing self-supervised learning methods are designed and optimized for image classification. These pre-trained models can be sub-optimal for dense prediction tasks due to the discrepancy between image-level prediction and pixel-level prediction. To fill this gap, we aim to design an effective, dense self-supervised learning method that directly works at the level of pixels (or local features) by taking into account the correspondence between local features. We present dense contrastive learning, which implements self-supervised learning by optimizing a pairwise contrastive (dis)similarity loss at the pixel level between two views of input images. Compared to the baseline method MoCo-v2, our method introduces negligible computation overhead (only <1% slower), but demonstrates consistently superior performance when transferring to downstream dense prediction tasks including object detection, semantic segmentation and instance segmentation; and outperforms the state-of-the-art methods by a large margin. Specifically, over the strong MoCo-v2 baseline, our method achieves significant improvements of 2.0% AP on PASCAL VOC object detection, 1.1% AP on COCO object detection, 0.9% AP on COCO instance segmentation, 3.0% mIoU on PASCAL VOC semantic segmentation and 1.8% mIoU on Cityscapes semantic segmentation.},
  addendum  = {Oral},
  code      = {https://github.com/WXinlong/DenseCL},
  eprint    = {https://arxiv.org/abs/2011.09157},
  owner     = {lilei.02},
}

@Patent{li2021target,
  nationality = {CN},
  number      = {201811010095.X},
  year        = {2021},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co. Ltd},
  author      = {Li, Gen and Xu, Shikun and Zhu, Yandong and Li, Lei and Wang, Changhu},
  day         = {8},
  dayfiled    = {31},
  month       = may,
  monthfiled  = aug,
  title       = {Target Object Image Detection Method and Device},
  comment     = {目标对象的图像确定方法、装置、设备及存储介质},
  owner       = {lilei.02},
}

@InProceedings{xie2021mars,
  author    = {Yutong Xie and Chence Shi and Hao Zhou and Yuwei Yang and Weinan Zhang and Yong Yu and Lei Li},
  booktitle = {International Conference on Learning Representations (ICLR)},
  title     = {{MARS}: Markov Molecular Sampling for Multi-objective Drug Discovery},
  year      = {2021},
  month     = may,
  abstract  = {Searching for novel molecules with desired chemical properties is crucial in drug discovery.  Existing work focuses on developing deep generative models to generate either sequences or chemical molecular graphs. However, it remains a great challenge to find novel and diverse compounds satisfying many properties. In this paper, we propose MARS, a method for multi-objective drug molecule discovery. MARS is based on the idea of generating the chemical candidates by iterative editing fragments of molecular graphs. To search for the best candidates, it employs an annealing scheme together with Markov chain Monte Carlo sampling (MCMC) on molecules. To further improve sample efficiency, MARS is equipped with a graph neural network (GNN) as the proposal for candidate edits on molecules, while the GNN is trained on-the-fly utilizing the sample paths in MCMC. Our experiments show that MARS achieves state-of-the-art performance in various multi-objective settings where molecular bio-activity, drug-likeness, and synthesizability are simultaneously considered.  In the most challenging setting where four objectives – bio-activities to two different targets, drug-likeness and synthesizability – are simultaneously considered, our method outperforms the state-of-the-art significantly in a comprehensive evaluation.},
  addendum  = {Spotlight, 5.6\% acceptance rate},
  code      = {https://github.com/yutxie/mars},
  eprint    = {https://openreview.net/forum?id=kHSu4ebxFXY},
  owner     = {lilei.02},
  thumbnail = {xie2021mars_molecule_editing.jpg},
}

@Patent{chen2021methoda,
  nationality = {US},
  number      = {10,984,542 B2},
  year        = {2021},
  yearfiled   = {2018},
  author      = {Chen, Yangyu and He, Yi and Li, Lei},
  day         = {20},
  dayfiled    = {20},
  month       = apr,
  monthfiled  = {#nov#},
  title       = {Method and Device for determining geometric transformation relation for images},
  comment     = {用于确定图像间的几何变换关系的方法和装置},
}

@Patent{he2021image,
  nationality = {CN},
  number      = {201910498629.6},
  year        = {2021},
  yearfiled   = {2019},
  author      = {He, Yi and Li, Gen and Li, Lei},
  day         = {29},
  dayfiled    = {10},
  month       = apr,
  monthfiled  = jun,
  title       = {Image processing method and device},
  comment     = {图像处理方法、装置、电子设备及计算机可读存储介质(一种概率的图片哈希表达方法)},
}

@Patent{huang2021target,
  nationality = {CN},
  number      = {202010057296.6},
  year        = {2021},
  yearfiled   = {2020},
  author      = {Huang, Xunpeng and Liu, Zhengyang and Li, Lei},
  day         = {20},
  dayfiled    = {17},
  month       = apr,
  monthfiled  = jan,
  title       = {Target Object Classification method and device},
  comment     = {目标对象的分类方法、装置和电子设备( 一种快速的近似二阶牛顿优化算法 )},
}

@Patent{li2021method,
  nationality = {CN},
  number      = {201811455645.9},
  year        = {2021},
  yearfiled   = {2019},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Li, Lei and Chen, Jiaze and Chen, Jiamin and Ma, Weiying and Hua, Lifeng},
  day         = {13},
  dayfiled    = {1},
  month       = apr,
  monthfiled  = feb,
  title       = {Method and Device for generating information},
  comment     = {用于处理文本的方法和装置},
  owner       = {lilei.02},
}

@InProceedings{wang2021enpar,
  author    = {Yijun Wang and Changzhi Sun and Yuanbin Wu and Hao Zhou and Lei Li and Junchi Yan},
  booktitle = {Proceedings of European Chapter of the Association for Computational Linguistics (EACL)},
  title     = {{ENPAR}: Enhancing Entity and Entity Pair Representations for Joint Entity Relation Extraction},
  year      = {2021},
  month     = apr,
  owner     = {lilei.02},
}

@Patent{li2021audio,
  nationality = {US},
  number      = {10,950,255 B2},
  year        = {2021},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Gen Li and Lei Li and Yi He},
  day         = {16},
  dayfiled    = {29},
  month       = mar,
  monthfiled  = {#dec#},
  title       = {Audio Fingerprint Extraction Method and Device},
  comment     = {一种音频指纹提取方法及装置},
  owner       = {lilei.02},
}

@Patent{she2021method,
  nationality = {CN},
  number      = {201811562666.0},
  year        = {2021},
  yearfiled   = {2018},
  author      = {She, Heng and Wang, Yang and Guo, Yinuo and Zhang, Huiru and Li, Yitan and Li, Lei and Li, Hang},
  day         = {19},
  dayfiled    = {18},
  month       = mar,
  monthfiled  = dec,
  title       = {Method and Device for Push-Notifying Information},
  comment     = {用于推送信息的方法和装置},
}

@InProceedings{song2021triangular,
  author    = {Zhenqiao Song and Jiaze Chen and Hao Zhou and Lei Li},
  booktitle = {Proceedings of the 14th International Conference on Web Search and Data Mining (WSDM)},
  title     = {Triangular Bidword Generation for Sponsored Search Auction},
  year      = {2021},
  month     = mar,
  abstract  = {Sponsored search auction is a crucial component of modern search engines. It requires a set of candidate bidwords that advertisers can place bids on. Existing methods generate bidwords from search queries or advertisement content. However, they suffer from the data noise in <query, bidword> and <advertisement, bidword> pairs. In this paper, we propose a triangular bidword generation model (TRIDENT), which takes the high-quality data of paired <query, advertisement> as a supervision signal to indirectly guide the bidword generation process. Our proposed model is simple yet effective: by using bidword as the bridge between search query and advertisement, the generation of search query, advertisement and bidword can be jointly learned in the triangular training framework. This alleviates the problem that the training data of bidword may be noisy. Experimental results, including automatic and human evaluations, show that our proposed TRIDENT can generate relevant and diverse bidwords for both search queries and advertisements. Our evaluation on online real data validates the effectiveness of the TRIDENT’s generated bidwords for product search.},
  eprint    = {https://arxiv.org/abs/2101.11349},
  owner     = {lilei.02},
  thumbnail = {song2021triangular_search_bidding_example.jpg},
}

@InProceedings{dong2021consecutive,
  author    = {Qianqian Dong and Mingxuan Wang and Hao Zhou and Shuang Xu and Bo Xu and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Consecutive Decoding for Speech-to-text Translation},
  year      = {2021},
  month     = feb,
  code      = {https://github.com/dqqcasia/st},
  eprint    = {https://arxiv.org/abs/2009.09737},
  owner     = {lilei.02},
  url       = {https://dqqcasia.github.io/projects/COSTT/},
}

@InProceedings{dong2021listen,
  author    = {Qianqian Dong and Rong Ye and Mingxuan Wang and Hao Zhou and Shuang Xu and Bo Xu and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Listen, Understand and Translate: Triple Supervision Decouples End-to-end Speech-to-text Translation},
  year      = {2021},
  month     = feb,
  code      = {https://github.com/dqqcasia/st},
  eprint    = {https://arxiv.org/abs/2009.09704},
  owner     = {lilei.02},
  url       = {https://dqqcasia.github.io/projects/LUT/},
}

@InProceedings{huang2021acmo,
  author    = {Xunpeng Huang and Runxin Xu and Hao Zhou and Zhe Wang and Zhengyang Liu and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {ACMo: Angle-Calibrated Moment Methods for Stochastic Optimization},
  year      = {2021},
  month     = feb,
  abstract  = {Stochastic gradient descent (SGD) is a widely used method for its outstanding generalization ability and simplicity.  daptive gradient methods have been proposed to further accelerate the optimization process.  n this paper, we revisit existing adaptive gradient optimization methods with a new interpretation. 
Such new perspective leads to a refreshed understanding of the roles of second moments in stochastic optimization.  Based on this, we propose Angle-Calibration Moment method (ACMo), a novel stochastic optimization method. It enjoys the benefits of second moments with only first moment updates. Theoretical analysis shows that ACMo is able to achieve the same convergence rate as mainstream adaptive methods. Experiments on a variety of CV and NLP tasks demonstrate that ACMo has a comparable convergence to state-of-the-art Adam-type optimizers, and even a better generalization performance in most cases. The code is available at https://github.com/Xunpeng746/ACMo.},
  code      = {https://github.com/Xunpeng746/ACMo},
  eprint    = {https://arxiv.org/abs/2006.07065},
  owner     = {lilei.02},
  thumbnail = {huang2021acmo_illustration.jpg},
  url       = {https://xunpeng746.github.io/projects/ACMo/ACMo.html},
}

@InProceedings{liang2021finding,
  author    = {Jianze Liang and Chengqi Zhao and Mingxuan Wang and Xipeng Qiu and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Finding Sparse Structure for Domain Specific Neural Machine Translation},
  year      = {2021},
  month     = feb,
  code      = {https://github.com/ohlionel/Prune-Tune},
  eprint    = {https://arxiv.org/abs/2012.10586},
  owner     = {lilei.02},
  thumbnail = {liang2021finding_prune_tune_diagram.svg},
  url       = {https://ohlionel.github.io/project/Prune-Tune/},
}

@InProceedings{wu2021textgail,
  author    = {Qingyang Wu and Lei Li and Zhou Yu},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {{TextGAIL}: Generative Adversarial Imitation Learning for Text Generation},
  year      = {2021},
  month     = feb,
  eprint    = {https://arxiv.org/abs/2004.13796},
}

@InProceedings{zhang2021taxonomy,
  author    = {Jieyu Zhang and Xiangchen Song and Ying Zeng and Jiaze Chen and Jiaming Shen and Yuning Mao and Lei Li},
  booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {Taxonomy Completion via Triplet Matching Network},
  year      = {2021},
  month     = feb,
  code      = {https://github.com/JieyuZ2/TMN},
  eprint    = {https://arxiv.org/abs/2101.01896},
  owner     = {lilei.02},
  thumbnail = {zhang2021taxonomy_completion_task.svg},
}

@Patent{chen2021method,
  nationality = {CN},
  number      = {201811457980.2},
  year        = {2021},
  yearfiled   = {2018},
  author      = {Chen, Jiaze and Li, Lei and Zeng, Ying and Ma, Weiying},
  day         = {29},
  dayfiled    = {2},
  month       = jan,
  monthfiled  = nov,
  title       = {Method and Device for generation product description information},
  comment     = {生成物品描述信息的方法和装置},
}

@Patent{deng2021method,
  nationality = {CN},
  number      = {201810910344.4},
  year        = {2021},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co. Ltd},
  author      = {Deng, Jiangdong and Li, Lei and Ma, Weiying},
  day         = {1},
  dayfiled    = {10},
  month       = jan,
  monthfiled  = aug,
  title       = {Method and device for stock selection},
  comment     = {股票的筛选方法及装置、计算机设备及可读存储介质},
  owner       = {lilei.02},
}

@Patent{he2021duplicate,
  nationality = {CN},
  number      = {201810273706.3},
  year        = {2021},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Yang, Cheng and Li, Gen and Li, Yitan and Li, Lei},
  day         = {19},
  dayfiled    = {29},
  month       = jan,
  monthfiled  = {#mar#},
  title       = {Duplicate video detection method and device},
  comment     = {一种重复视频的判断方法及装置},
  owner       = {lilei.02},
}

@InProceedings{wang2020solov2,
  author    = {Wang, Xinlong and Zhang, Rufeng and Kong, Tao and Li, Lei and Shen, Chunhua},
  booktitle = {the 34th Conference on Neural Information Processing Systems (NeurIPS)},
  title     = {{SOLOv2}: Dynamic and Fast Instance Segmentation},
  year      = {2020},
  month     = dec,
  abstract  = {In this work, we design a simple, direct, and fast framework for instance segmentation with strong performance. To this end, we propose a novel and effective approach, termed SOLOv2, following the principle of the SOLO method. First, our new framework is empowered by an efficient and holistic instance mask representation scheme, which dynamically segments each instance in the image, without resorting to bounding box detection. Specifically, the object mask generation is decoupled into a mask kernel prediction and mask feature learning, which are responsible for generating convolution kernels and the feature maps to be convolved with, respectively. Second, SOLOv2 significantly reduces inference overhead with our novel matrix non-maximum suppression (NMS) technique. Our Matrix NMS performs NMS with parallel matrix operations in one shot, and yields better results. We demonstrate that the proposed SOLOv2 achieves the state-of-the- art performance with high efficiency, making it suitable for both mobile and cloud applications. A light-weight version of SOLOv2 executes at 31.3 FPS and yields 37.1% AP on COCO test-dev. Moreover, our state-of-the-art results in object detection (from our mask byproduct) and panoptic segmentation show the potential of SOLOv2 to serve as a new strong baseline for many instance-level recognition tasks.},
  code      = {https://github.com/WXinlong/SOLO},
  eprint    = {https://arxiv.org/abs/2003.10152},
  owner     = {lilei.02},
  thumbnail = {wang2020solov2_case.png},
}

@InProceedings{wu2020volctrans,
  author       = {Liwei Wu and Xiao Pan and Zehui Lin and Yaoming Zhu and Mingxuan Wang and Lei Li},
  booktitle    = {Proceedings of the Fifth Conference on Machine Translation (Volume 2: Shared Task Papers)},
  title        = {The Volctrans Machine Translation System for WMT20},
  year         = {2020},
  month        = nov,
  comment      = {Winner in WMT20 Machine Translation Contest of Chinese-English, German-English, French-German,  English-Khmer, and English-Pashto languages.},
  entrysubtype = {workshop},
  eprint       = {https://arxiv.org/abs/2010.14806},
  owner        = {lilei.02},
  thumbnail    = {volctrans-logo-eng.png},
}

@InProceedings{xu2020volctrans,
  author       = {Runxin Xu and Zhuo Zhi and Jun Cao and Mingxuan Wang and Lei Li},
  booktitle    = {Proceedings of the Fifth Conference on Machine Translation (Volume 2: Shared Task Papers)},
  title        = {Volctrans Parallel Corpus Filtering System for WMT 2020},
  year         = {2020},
  month        = nov,
  comment      = {Winner of WMT20 Parallel Corpus Filtering tasks on Pashto and Khmer languages.},
  entrysubtype = {workshop},
  eprint       = {https://arxiv.org/abs/2010.14029},
  owner        = {lilei.02},
}

@InProceedings{li2020sentence,
  author    = {Bohan Li and Hao Zhou and Junxian He and Mingxuan Wang and Yiming Yang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {On the Sentence Embeddings from Pre-trained Language Models},
  year      = {2020},
  month     = nov,
  abstract  = {Pre-trained contextual representations like BERT have achieved great success in natural language processing. However, the sentence embeddings from the pre-trained language models without fine-tuning have been found to poorly capture semantic meaning of sentences. In this paper, we argue that the semantic information in the BERT embeddings is not fully exploited. We first reveal the theoretical connection between the masked language model pre-training objective and the semantic similarity task theoretically, and then analyze the BERT sentence embeddings empirically. We find that BERT always induces a non-smooth anisotropic semantic space of sentences, which harms its performance of semantic similarity. To address this issue, we propose to transform the anisotropic sentence embedding distribution to a smooth and isotropic Gaussian distribution through normalizing flows that are learned with an unsupervised objective. Experimental results show that our proposed BERT-flow method obtains significant performance gains over the state-of-the-art sentence embeddings on a variety of semantic textual similarity tasks.},
  code      = {https://github.com/bohanli/BERT-flow},
  eprint    = {https://arxiv.org/abs/2011.05864},
  owner     = {lilei.02},
}

@InProceedings{lin2020pre,
  author    = {Zehui Lin and Xiao Pan and Mingxuan Wang and Xipeng Qiu and Jiangtao Feng and Hao Zhou and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Pre-training Multilingual Neural Machine Translation by Leveraging Alignment Information},
  year      = {2020},
  month     = nov,
  abstract  = {We investigate the following question for machine translation (MT): can we develop a single universal MT model to serve as the common seed and obtain derivative and improved models on arbitrary language pairs? We propose mRASP, an approach to pre-train a universal multilingual neural machine translation model. Our key idea in mRASP is its novel technique of random aligned substitution, which brings words and phrases with simlar meanings across multiple languages closer in the representation space. We pre-train a mRASP model on 32 language pairs jointly with only public datasets. The model is then fine-tuned on downstream language pairs to obtain specialized MT models. We carry out extensive experiments on 42 translation directions across a diverse settings, including low, medium, rich resource, and as well as transferring to exotic language pairs. Experimental results demonstrate that mRASP achieves significant performance improvement compared to directly training on those target pairs. It is the first time to verify that multiple low-esource language pairs can be utilized to improve rich resource MT. Surprisingly, mRASP is even able to improve the translation quality on exotic languages that never occur in the pre- training corpus.},
  code      = {https://github.com/linzehui/mRASP},
  comment   = {This version improves over the EMNLP published one. Table 1 and 3 are updated with better hyperparameter tuning for both baselines and mRASP. The conclusion remains valid.},
  eprint    = {https://arxiv.org/abs/2010.03142},
  thumbnail = {lin2020pre_mrasp_diagram.png},
}

@InProceedings{zeng2020double,
  author    = {Shuang Zeng and Runxin Xu and Baobao Chang and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Double Graph Based Reasoning for Document-level Relation Extraction},
  year      = {2020},
  month     = nov,
  abstract  = {Document-level relation extraction aims to extract relations among entities within a docuent. Different from sentence-level relation extraction, it requires reasoning over multiple sentences across paragraphs. In this paper, we propose Graph Aggregation-and-Inference Network (GAIN), a method to recognize such relations for long paragraphs. GAIN constructs two graphs, a heterogeneous mention- level graph (MG) and an entity-level graph (EG). The former captures complex interaction among different mentions and the latter aggregates mentions underlying for the same entities. Based on the graphs we propose a novel path reasoning mechanism to infer relations between entities. Experiments on the public dataset, DocRED, show GAIN achieves a significant performance improvement (2.85 on F1) over the previous state-of-the-art.},
  code      = {https://github.com/PKUnlp-icler/GAIN},
  eprint    = {https://arxiv.org/abs/2009.13752},
  owner     = {lilei.02},
}

@InProceedings{ru2020active,
  author    = {Dongyu Ru and Jiangtao Feng and Lin Qiu and Hao Zhou and Mingxuan Wang and Weinan Zhang and Yong Yu and Lei Li},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Active Sentence Learning by Adversarial Uncertainty Sampling in Discrete Space},
  year      = {2020},
  month     = nov,
  abstract  = {Active learning for sentence understanding aims at discovering informative unlabeled data for annotation and therefore reducing the demand for labeled data. We argue that the typical uncertainty sampling method for active learning is time-consuming and can hardly work in real-time, which may lead to ineffective sample selection. We propose adversarial uncertainty sampling in discrete space (AUSDS) to retrieve informative unlabeled samples more efficiently. AUSDS maps sentences into latent space generated by the popuar pre-trained language models, and discover informative unlabeled text samples for annotation via adversarial attack. The proposed approach is extremely efficient compared with traditional uncertainty sampling with more than 10x speedup. Experimental results on five datasets show that AUSDS outperforms strong baselines on effectiveness.},
  eprint    = {https://arxiv.org/abs/2004.08046},
}

@InProceedings{zhang2020language,
  author    = {Maosen Zhang and Nan Jiang and Lei Li and Yexiang Xue},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP) - Findings},
  title     = {Language Generation via Combinatorial Constraint Satisfaction: A Tree Search Enhanced {Monte}-{Carlo} Approach},
  year      = {2020},
  month     = nov,
  abstract  = {Generating natural language under complex constraints is a principled formulation towards controllable text generation. We present a framework to allow specification of combinatorial constraints for sentence generation. We propose TSMH1, an efficient method to generate high likelihood sentences with respect to a pre-trained language model while satisfying the constraints. Our approach is highly flexible, requires no task-specific training, and leverages efficient constraint satisfaction solving techniques. To better handle the combinatorial constraints, a tree search algorithm is embedded into the proposal process of the Markov chain Monte Carlo (MCMC) to explore candidates that satisfy more constraints. Compared to existing MCMC approaches, our sampling approach has a better mixing performance. Experiments show that TSMH achieves consistent and significant improvement on multiple language generation tasks.},
  code      = {https://github.com/Milozms/TSMH},
  eprint    = {https://arxiv.org/abs/2011.12334},
  owner     = {lilei.02},
}

@Patent{deng2020sentiment,
  nationality = {CN},
  number      = {201810909879.X},
  year        = {2020},
  yearfiled   = {2018},
  author      = {Jiangdong Deng and Lei Li and Weiying Ma},
  day         = {18},
  dayfiled    = {10},
  month       = sep,
  monthfiled  = {#aug#},
  title       = {Sentiment Prediction Method and Device},
  type        = {patent},
  comment     = {舆情指数的预测方法及装置、计算机设备和可读存储介质},
  owner       = {lilei.02},
}

@InProceedings{song2020improving,
  author    = {Yuxuan Song and Ning Miao and Hao Zhou and Lantao Yu and Mingxuan Wang and Lei Li},
  booktitle = {The 23rd International Conference on Artificial Intelligence and Statistics (AISTATS)},
  title     = {Improving Maximum Likelihood Training for Text Generation with Density Ratio Estimation},
  year      = {2020},
  month     = aug,
  abstract  = {Auto-regressive  sequence  generative  models trained by Maximum Likelihood Estimation suffer  the  exposure  bias  problem  in  practical finite sample scenarios.  The crux is that the number of training samples for Maximum Likelihood Estimation is usually limited and the input data distributions are different at training and inference stages.  Many method shave been proposed to solve the above problem (Yu et al., 2017; Lu et al., 2018), which relies  on  sampling  from  the  non-stationary model distribution and suffers from high variance  or  biased  estimations.   In  this  paper, we  proposeψ-MLE,  a  new  training  scheme for auto-regressive sequence generative models, which is effective and stable when operating at large sample space encountered in text generation.   We  derive  our  algorithm  from a  new  perspective  of  self-augmentation  and introduce  bias  correction  with  density  ratio estimation.   Extensive  experimental  results on  synthetic  data  and  real-world  text  generation  tasks  demonstrate  that  our  method stably outperforms Maximum Likelihood Estimation and other state-of-the-art sequence generative  models  in  terms  of  both  quality and diversity.},
  eprint    = {https://arxiv.org/abs/2007.06018},
  timestamp = {2020-01-07},
}

@InProceedings{wang2020solo,
  author    = {Xinlong Wang and Tao Kong and Chunhua Shen and Yuning Jiang and Lei Li},
  booktitle = {The European Conference on Computer Vision (ECCV)},
  title     = {{SOLO}: {S}egmenting {O}bjects by {L}ocations},
  year      = {2020},
  month     = aug,
  abstract  = {We present a new, embarrassingly simple approach to instance segmentation in images. Compared to many other dense prediction tasks, e.g., semantic segmentation, it is the arbitrary number of instances that have made instance segmentation much more challenging. In order to predict a mask for each instance, mainstream approaches either follow the 'detect-thensegment' strategy as used by Mask R-CNN, or predict category masks first then use clustering techniques to group pixels into individual instances. We view the task of instance segmentation from a completely new perspective by introducing the notion of "instance categories", which assigns categories to each pixel within an instance according to the instance's location and size, thus nicely converting instance mask segmentation into a classification-solvable problem. Now instance segmentation is decomposed into two classification tasks. We demonstrate a much simpler and flexible instance segmentation framework with strong performance, achieving on par accuracy with Mask R-CNN and outperforming recent singleshot instance segmenters in accuracy. We hope that this very simple and strong framework can serve as a baseline for many instance-level recognition tasks besides instance segmentation.},
  code      = {https://github.com/WXinlong/SOLO},
  eprint    = {https://arxiv.org/abs/1912.04488},
  thumbnail = {wang2020solo_result_663.jpg},
}

@Patent{he2020method,
  nationality = {CN},
  number      = {201810271774.6},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Li, Lei and Yang, Cheng and Li, Gen and Li, Yitan},
  day         = {7},
  dayfiled    = {29},
  month       = jul,
  monthfiled  = {#oct#},
  title       = {A Method and Device for Video Feature Extraction},
  comment     = {一种视频特征提取方法及装置},
  owner       = {lilei.02},
}

@Patent{zhou2020machine,
  nationality = {CN},
  number      = {201910105606.4},
  year        = {2020},
  yearfiled   = {2019},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Zhou, Hao and Li, Lei},
  day         = {10},
  dayfiled    = {1},
  month       = jul,
  monthfiled  = feb,
  title       = {Machine Translation Method and Device},
  comment     = {机器翻译方法和装置},
  owner       = {lilei.02},
}

@InProceedings{shi2020dispersed,
  author    = {Wenxian Shi and Hao Zhou and Ning Miao and Lei Li},
  booktitle = {Proceedings of the 37th International Conference on Machine Learning (ICML)},
  title     = {Dispersed Exponential Family Mixture {VAE}s for Interpretable Text Generation},
  year      = {2020},
  month     = jul,
  abstract  = {Deep generative models are commonly used for generating images and text. Interpretability of these models is one important pursuit, other than the generation quality. Variational auto-encoder (VAE) with Gaussian distribution as prior has been successfully applied in text generation, but it is hard to interpret the meaning of the latent variable. To enhance the controllability and interpretability, one can replace the Gaussian prior with a mixture of Gaussian distributions (GM-VAE), whose mixture components could be related to hidden semantic aspects of data. In this paper, we generalize the practice and introduce DEM-VAE, a class of models for text generation using VAEs with a mixture distribution of exponential family. Unfortunately, a standard variational training algorithm fails due to the mode-collapse problem. We theoretically identify the root cause of the problem and propose an effective algorithm to train DEM-VAE. Our method penalizes the training with an extra dispersion term to induce a well-structured latent space. Experimental results show that our approach does obtain a meaningful space, and it outperforms strong baselines in text generation benchmarks. The code is available at https://github.com/wenxianxian/demvae.},
  code      = {https://github.com/wenxianxian/demvae},
  eprint    = {https://arxiv.org/abs/1906.06719},
  thumbnail = {shi2020dispersed_dgmvae_latent.png},
  video     = {https://slideslive.com/38928051},
}

@InProceedings{ru2020quachie,
  author       = {Dongyu Ru and Zhenghui Wang and Lin Qiu and Hao Zhou and Lei Li and Weinan Zhang and Yong Yu},
  booktitle    = {the 43rd International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR) - System Demonstrations},
  title        = {{QuAChIE}: Question Answering based {Chinese} Information Extraction System},
  year         = {2020},
  month        = jul,
  entrysubtype = {demo},
  owner        = {lilei.02},
}

@InProceedings{miao2020do,
  author    = {Ning Miao and Yuxuan Song and Hao Zhou and Lei Li},
  booktitle = {the 58th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers},
  title     = {Do you have the right scissors? Tailoring Pre-trained Language Models via {Monte}-{Carlo} Methods},
  year      = {2020},
  month     = jul,
  abstract  = {It has been a common approach to pre-train a language model on a large corpus and fine-tune it on task-specific data.  In practice, we observe that fine-tuning a pre-trained model on a small dataset may lead to  over- and/or under-estimation problem.  In this paper, we propose MC-Taylor, a novel method to alleviate the above issue in text generation tasks by truncating and transferring the probability mass from over-estimated regions to under-estimated ones. Experiments on a variety of text generation datasets show that MC-Taylor consistently and significantly outperforms the fine-tuning approach. Our code is available at \url{https://github.com/NingMiao/MC-tailor}.},
  code      = {https://github.com/NingMiao/MC-tailor},
  eprint    = {https://arxiv.org/abs/2007.06162},
  timestamp = {2020-05-01},
  video     = {http://slideslive.com/38928919},
}

@InProceedings{xu2020xiaomingbot,
  author       = {Runxin Xu and Jun Cao and Mingxuan Wang and Jiaze Chen and Hao Zhou and Ying Zeng and Yuping Wang and Li Chen and Xiang Yin and Xijin Zhang and Songcheng Jiang and Yuxuan Wang and Lei Li},
  booktitle    = {the 58th Annual Meeting of the Association for Computational Linguistics (ACL): System Demonstrations},
  title        = {Xiaomingbot: A Multilingual Robot News Reporter},
  year         = {2020},
  month        = jul,
  abstract     = {This paper proposes the building of Xiaomingbot, an intelligent, multilingual and multi-modal software robot equipped with four integral capabilities: news generation, news translation, news reading and avatar animation. Its system summarizes Chinese news that it automatically generates from data tables. Next, it translates the summary or the full article into multiple languages, and reads the multilingual rendition through synthesized speech. Notably, Xiaomingbot utilizes a voice cloning technology to synthesize the speech trained from a real person’s voice data in one input language. The proposed system enjoys several merits: it has an animated avatar, and is able to generate and read multilingual news. Since it was put into practice, Xiaomingbot has written over 600,000 articles, and gained over 150,000 followers on social media platforms.},
  entrysubtype = {demo},
  timestamp    = {2020-05-01},
  url          = {https://xiaomingbot.github.io},
}

@Article{kong2020foveabox,
  author   = {Tao {Kong} and Fuchun {Sun} and Huaping {Liu} and Yuning {Jiang} and Lei {Li} and Jianbo {Shi}},
  journal  = {IEEE Transactions on Image Processing},
  title    = {{FoveaBox}: Beyound Anchor-based Object Detection},
  year     = {2020},
  issn     = {1057-7149},
  month    = jun,
  pages    = {7389-7398},
  volume   = {29},
  abstract = {We present FoveaBox, an accurate, flexible, and completely anchor-free framework for object detection. While almost all state-of-the-art object detectors utilize predefined anchors to enumerate possible locations, scales and aspect ratios for the search of the objects, their performance and generalization ability are also limited to the design of anchors. Instead, FoveaBox directly learns the object existing possibility and the bounding box coordinates without anchor reference. This is achieved by: (a) predicting category-sensitive semantic maps for the object existing possibility, and (b) producing category-agnostic bounding box for each position that potentially contains an object. The scales of target boxes are naturally associated with feature pyramid representations. In FoveaBox, an instance is assigned to adjacent feature levels to make the model more accurate.We demonstrate its effectiveness on standard benchmarks and report extensive experimental analysis. Without bells and whistles, FoveaBox achieves state-of-the-art single model performance on the standard COCO and Pascal VOC object detection benchmark. More importantly, FoveaBox avoids all computation and hyper-parameters related to anchor boxes, which are often sensitive to the final detection performance. We believe the simple and effective approach will serve as a solid baseline and help ease future research for object detection. The code has been made publicly available at https://github.com/taokong/FoveaBox.},
  code     = {https://github.com/taokong/FoveaBox},
  doi      = {https://doi.org/10.1109/TIP.2020.3002345},
  eprint   = {https://arxiv.org/abs/1904.03797},
  url      = {http://www.taokong.org/projects/FoveaBox/},
}

@Article{wu2020towards,
  author  = {Wu, Fei and Lu, Cewu and Zhu, Mingjie and Chen, Hao and Zhu, Jun and Yu, Kai and Li, Lei and Li, Ming and Chen, Qianfeng and Li, Xi and Cao, Xudong and Wang, Zhongyuan and Zha, Zhengjun and Zhuang, Yueting and Pan, Yunhe},
  journal = {Nature Machine Intelligence},
  title   = {Towards a new generation of artificial intelligence in {China}},
  year    = {2020},
  month   = jun,
  pages   = {312-316},
  volume  = {2},
  doi     = {https://doi.org/10.1038/s42256-020-0183-4},
  eprint  = {https://rdcu.be/b5vk7},
}

@InProceedings{hua2020xref,
  author    = {Xinyu Hua and Lei Li and Lifeng Hua and Lu Wang},
  booktitle = {Automated Knowledge Base Construction (AKBC)},
  title     = {{XREF}: Entity Linking for {Chinese} News Comments with Supplementary Article Reference},
  year      = {2020},
  month     = jun,
  abstract  = {Automatic identification of mentioned entities in social media posts facilitates quick digestion of trending topics and popular opinions. Nonetheless, this remains a challenging task due to limited context and diverse name variations. In this paper, we study the problem of entity linking for Chinese news comments given mentions’ spans. We hypothesize that comments often refer to entities in the corresponding news article, as well as topics involving the entities. We therefore propose a novel model, XREF, that leverages attention mechanisms to (1) pinpoint relevant context within comments, and (2) detect supporting entities from the news article. To improve training, we make two contributions: (a) we propose a supervised attention loss in addition to the standard cross entropy, and (b) we develop a weakly supervised training scheme to utilize the large-scale unlabeled corpus. Two new datasets in entertainment and product domains are collected and annotated for experiments. Our proposed method outperforms previous methods on both datasets.},
  url       = {https://xinyuhua.github.io/Resources/akbc20/},
}

@Patent{he2020video,
  nationality = {CN},
  number      = {201810271773.1},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {He, Yi and Li, Lei and Yang, Cheng and Li, Gen and Li, Yitan},
  day         = {29},
  dayfiled    = {29},
  month       = may,
  monthfiled  = {#dec#},
  title       = {Video Feature Extraction Method and Device},
  comment     = {一种视频特征提取方法及装置},
  owner       = {lilei.02},
}

@Patent{zhou2020method,
  nationality = {CN},
  number      = {201910105002.X},
  year        = {2020},
  yearfiled   = {2019},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Zhou, Hao and Li, Lei and Miao, Ning},
  day         = {22},
  dayfiled    = {1},
  month       = may,
  monthfiled  = feb,
  title       = {Method and Device for Generating Text},
  comment     = {用于生成文本的方法和装置},
  owner       = {lilei.02},
}

@Patent{li2020target,
  nationality = {CN},
  number      = {201811010092.6},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co. Ltd},
  author      = {Li, Gen and Xu, Shikun and Zhu, Yandong and Li, Lei and Wang, Changhu},
  day         = {24},
  dayfiled    = {31},
  month       = apr,
  monthfiled  = aug,
  title       = {Target Object Image Detection Method and Device},
  comment     = {目标对象的图像确定方法、装置、设备及存储介质},
  owner       = {lilei.02},
}

@Patent{yu2020method,
  nationality = {CN},
  number      = {201810668219.7},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Yu, Linyun and Li, Lei and Yin, Haibin and Zhu, Wenjia and Jiang, Dong},
  day         = {28},
  dayfiled    = {26},
  month       = apr,
  monthfiled  = jun,
  title       = {Method and Apparatus for generating image},
  comment     = {用于生成图像的方法和装置},
  owner       = {lilei.02},
}

@InProceedings{ye2020variational,
  author    = {Rong Ye and Wenxian Shi and Hao Zhou and Zhongyu Wei and Lei Li},
  booktitle = {International Conference on Learning Representations (ICLR)},
  title     = {Variational Template Machine for Data-to-Text Generation},
  year      = {2020},
  month     = apr,
  abstract  = {How to generate descriptions from structured data organized in tables? Existing approaches using neural encoder-decoder models often suffer from lacking diversity. We claim that an open set of templates is crucial for enriching the phrase constructions and realizing varied generations.Learning such templates is prohibitive since it often requires a large paired <table,description>, which is seldom available. This paper explores the problem of automatically learning reusable "templates" from paired and non-paired data. We propose the variational template machine (VTM), a novel method to generate text descriptions from data tables. Our contributions include:  a) we carefully devise a specific model architecture and losses to explicitly disentangle text template and semantic content information, in the latent spaces, and b) we utilize both small parallel data and large raw text without aligned tables to enrich the template learning. Experiments on datasets from a variety of different domains show that VTM is able generate more diversely while keeping a good fluency and quality.},
  code      = {https://github.com/ReneeYe/VariationalTemplateMachine},
  eprint    = {https://openreview.net/forum?id=HkejNgBtPB},
  video     = {https://iclr.cc/virtual_2020/poster_HkejNgBtPB.html},
}

@InProceedings{zheng2020mirror,
  author    = {Zaixiang Zheng and Hao Zhou and Shujian Huang and Lei Li and Xinyu Dai and Jiajun Chen},
  booktitle = {International Conference on Learning Representations (ICLR)},
  title     = {Mirror Generative Models for Neural Machine Translation},
  year      = {2020},
  month     = apr,
  abstract  = {Training neural machine translation models (NMT) requires a large amount of parallel corpus, which is scarce for many language pairs. However, raw non-parallel corpora are often easy to obtain. Existing approaches have not exploited the full potential of non-parallel bilingual data either in training or decoding. In this paper, we propose the mirror-generative NMT (MGNMT), a single unified architecture that simultaneously integrates the source to target translation model, the target to source translation model, and two language models. Both translation models and language models share the same latent semantic space, therefore both translation directions can learn from non-parallel data more effectively. Besides, the translation models and language models can collaborate together during decoding. Our experiments show that the proposed MGNMT consistently outperforms existing approaches in all a variety of scenarios and language pairs, including resource-rich and low-resource languages.},
  addendum  = {Oral, 1.9\% acceptance rate},
  eprint    = {https://openreview.net/forum?id=HkxQRTNYPH},
  owner     = {lilei.02},
  video     = {https://iclr.cc/virtual_2020/poster_HkxQRTNYPH.html},
}

@Patent{deng2020method,
  nationality = {CN},
  number      = {201811074033.5},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co. Ltd},
  author      = {Deng, Jiangdong and Peng, Qu and Li, Lei and Ma, Weiying},
  day         = {27},
  dayfiled    = {4},
  month       = mar,
  monthfiled  = sep,
  title       = {A method for outputing information},
  comment     = {用于输出信息的方法和装置},
  owner       = {lilei.02},
}

@Patent{li2020systems,
  nationality = {US},
  number      = {10606846B2},
  year        = {2020},
  yearfiled   = {2016},
  assignee    = {Baidu USA LLC},
  author      = {Li, Lei and Dai, Zihang and Xu, Wei},
  day         = {31},
  dayfiled    = {23},
  month       = mar,
  monthfiled  = {#may#},
  title       = {Systems and methods for human inspired simple question answering (HISQA)},
  owner       = {lilei.02},
}

@Patent{zhou2020methoda,
  nationality = {CN},
  number      = {201910105241.5},
  year        = {2020},
  yearfiled   = {2019},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Zhou, Hao and Li, Lei and Chen, Jiaze and Shi, Haoyue},
  day         = {31},
  dayfiled    = {1},
  month       = mar,
  monthfiled  = feb,
  title       = {Method and Device for generating information},
  comment     = {用于生成信息的方法和装置},
  owner       = {lilei.02},
}

@Patent{zhou2020methodb,
  nationality = {CN},
  number      = {201910105235.X},
  year        = {2020},
  yearfiled   = {2019},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Zhou, Hao and Li, Lei},
  day         = {17},
  dayfiled    = {1},
  month       = mar,
  monthfiled  = feb,
  title       = {Method and Device for generating information},
  comment     = {用于生成信息的方法和装置},
  owner       = {lilei.02},
}

@InProceedings{huang2020span,
  author    = {Xunpeng Huang and Xianfeng Liang and Zhengyang Liu and Lei Li and Yue Yu and Yitan Li},
  booktitle = {the 34th {AAAI} Conference on Artificial Intelligence (AAAI)},
  title     = {{SPAN}: A Stochastic Projected Approximate Newton Method},
  year      = {2020},
  month     = feb,
  abstract  = {Second-order optimization methods have desirable convergence properties. However, the exact Newton method requires expensive computation for the Hessian and its inverse. In this paper, we propose SPAN, a novel approximate and fast Newton method. SPAN computes the inverse of the Hessian matrix via low-rank approximation and stochastic Hessian-vector products. Our experiments on multiple benchmark datasets demonstrate that SPAN outperforms existing first-order and second-order optimization methods in terms of the convergence wall-clock time. Furthermore, we provide a theoretical analysis of the per-iteration complexity, the approximation error, and the convergence rate. Both the theoretical analysis and experimental results show that our proposed method achieves a better trade-off between the convergence rate and the per-iteration efficiency.},
  eprint    = {https://arxiv.org/abs/2002.03687},
  url       = {https://xunpeng746.github.io/projects/SPAN/SPAN.html},
}

@InProceedings{wang2020task,
  author    = {Xinlong Wang and Wei Yin and Tao Kong and Yuning Jiang and Lei Li and Chunhua Shen},
  booktitle = {the 34th {AAAI} Conference on Artificial Intelligence (AAAI)},
  title     = {Task-Aware Monocular Depth Estimation for {3D} Object Detection},
  year      = {2020},
  month     = feb,
  abstract  = {Monocular depth estimation enables 3D perception from a single 2D image, thus attracting much research attention for years. Almost all methods treat foreground and background regions (``things and stuff'') in an image equally. However, not all pixels are equal. Depth of foreground objects plays a crucial role in 3D object recognition and localization. To date how to boost the depth prediction accuracy of foreground objects is rarely discussed. In this paper, we first analyse the data distributions and interaction of foreground and background, then propose the foreground-background separated monocular depth estimation (ForeSeE) method, to estimate the foreground depth and background depth using separate optimization objectives and depth decoders. Our method significantly improves the depth estimation performance on foreground objects. Applying ForeSeE to 3D object detection, we achieve 7.5 AP gains and set new state-of-the-art results among other monocular methods.},
  addendum  = {Oral},
  code      = {https://github.com/WXinlong/ForeSeE},
  eprint    = {https://arxiv.org/abs/1909.07701},
}

@InProceedings{wu2020importance,
  author    = {Qingyang Wu and Lei Li and Hao Zhou and Ying Zeng and Zhou Yu},
  booktitle = {the 34th {AAAI} Conference on Artificial Intelligence (AAAI)},
  title     = {Importance-Aware Learning for Neural Headline Editing},
  year      = {2020},
  month     = feb,
  abstract  = {Many social media news writers are not professionally trained. Therefore, social media platforms have to hire professional editors to adjust amateur headlines to attract more readers. We propose to automate this headline editing process through neural network models to provide more immediate writing support for these social media news writers. To train such a neural headline editing model, we collected a dataset which contains articles with original headlines and professionally edited headlines. However, it is expensive to collect a large number of professionally edited headlines. To solve this low-resource problem, we design an encoder-decoder model which leverages large scale pre-trained language models. We further improve the pre-trained model's quality by introducing a headline generation task as an intermediate task before the headline editing task. Also, we propose Self Importance-Aware (SIA) loss to address the different levels of editing in the dataset by down-weighting the importance of easily classified tokens and sentences. With the help of Pre-training, Adaptation, and SIA, the model learns to generate headlines in the professional editor's style. Experimental results show that our method significantly improves the quality of headline editing comparing against previous methods.},
  eprint    = {https://arxiv.org/abs/1912.01114},
}

@InProceedings{yang2020towards,
  author    = {Jiacheng Yang and Mingxuan Wang and Hao Zhou and Chengqi Zhao and Weinan Zhang and Yong Yu and Lei Li},
  booktitle = {the 34th {AAAI} Conference on Artificial Intelligence ({AAAI})},
  title     = {Towards Making the Most of {BERT} in Neural Machine Translation},
  year      = {2020},
  month     = feb,
  abstract  = {GPT-2 and BERT demonstrate the effectiveness of using pretrained language models (LMs) on various natural language processing tasks. However, LM fine-tuning often suffers from catastrophic forgetting when applied to resource-rich tasks. In this work, we introduce a concerted training framework (CTNMT) that is the key to integrate the pre-trained LMs to neural machine translation (NMT). Our proposed CTNMT consists of three techniques: a) asymptotic distillation to ensure that the NMT model can retain the previous pre-trained knowledge; b) a dynamic switching gate to avoid catastrophic forgetting of pre-trained knowledge; and c) a strategy to adjust the learning paces according to a scheduled policy. Our experiments in machine translation show CTNMT gains of up
to 3 BLEU score on the WMT14 English-German language pair which even surpasses the previous state-of-the-art pretraining aided NMT by 1.4 BLEU score. While for the large WMT14 English-French task with 40 millions of sentencepairs, our base model still significantly improves upon the state-of-the-art Transformer big model by more than 1 BLEU score.},
  code      = {https://github.com/bytedance/neurst/tree/master/examples/ctnmt},
  eprint    = {https://arxiv.org/abs/1908.05672},
}

@Patent{he2020methoda,
  nationality = {CN},
  number      = {201811060981.3},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co. Ltd},
  author      = {He, Yi and Li, Lei and Zong, Xianzi and Tang, Hao and Zheng, Guangguo},
  day         = {4},
  dayfiled    = {12},
  month       = jan,
  monthfiled  = sep,
  title       = {Method and Device for searching information},
  comment     = {用于搜索信息的方法和装置},
  owner       = {lilei.02},
}

@Patent{yu2020methoda,
  nationality = {CN},
  number      = {201810669838.8},
  year        = {2020},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Tech Co. Ltd},
  author      = {Yu, Linyun and Li, Lei and Yin, Haibin and Jiang, Dong},
  day         = {7},
  dayfiled    = {26},
  month       = jan,
  monthfiled  = jun,
  title       = {Method and apparatus for generating image},
  comment     = {用于生成图像的方法和装置},
  owner       = {lilei.02},
}

@InProceedings{miao2019kernelized,
  author    = {Miao, Ning and Zhou, Hao and Zhao, Chengqi and Shi, Wenxian and Li, Lei},
  booktitle = {the 33rd Conference on Neural Information Processing Systems (NeurIPS)},
  title     = {Kernelized {Bayesian} Softmax for Text Generation},
  year      = {2019},
  month     = dec,
  abstract  = {Neural models for text generation require a softmax layer with proper word embeddings during the decoding phase. Most existing approaches adopt single point embedding for each word. However, a word may have multiple senses according to different context, some of which might be distinct. In this paper, we propose KerBS, a novel approach for learning better embeddings for text generation. KerBS embodies two advantages: a) it employs a Bayesian composition of embeddings for words with multiple senses; b) it is adaptive to semantic variances of words and
robust to rare sentence context by imposing learned kernels to capture the closeness of words (senses) in the embedding space. Empirical studies show that KerBS significantly boosts the performance of several text generation tasks.},
  code      = {https://github.com/NingMiao/KerBS},
  eprint    = {https://arxiv.org/abs/1911.00274},
}

@Patent{chen2019method,
  nationality = {CN},
  number      = {201811060837.X},
  year        = {2019},
  yearfiled   = {2018},
  assignee    = {Beijing Bytedance Network Tech Co. Ltd},
  author      = {Chen, Yangyu and He, Yi and Li, Lei},
  day         = {12},
  dayfiled    = {12},
  month       = nov,
  monthfiled  = sep,
  title       = {A method and apparatus for determining a geometric transformation relationship between images},
  comment     = {用于确定图像间的几何变换关系的方法和装置},
  owner       = {lilei.02},
}

@InProceedings{zhao2019what,
  author    = {Zhao, Zhichen and Li, Lei and Zhang, Bowen and Wang, Meng and Jiang, Yuning and Xu, Li and Wang, Fengkun and Ma, Weiying},
  booktitle = {the 28th ACM International Conference on Information and Knowledge Management (CIKM)},
  title     = {What You Look Matters: Offline Evaluation of Advertising Creatives for Cold Start Problem},
  year      = {2019},
  month     = nov,
  abstract  = {Modern online-auction-based advertising systems utilize user and item features to automatically place ads. In order to train a model to rank the most profitable ads, new ad creatives have to be placed online for hours to receive sufficient user-click data. This corresponds to the cold-start stage. Random strategy lead to inefficiency and inferior selections of potential ads. In this paper, we analyze the effectiveness of content-based selection during the cold-start stage. Specifically, we propose Pre Evaluation of Ad Creative Model (PEAC), a novel method to evaluate and select ad creatives offline before being placed online. Our proposed PEAC utilizes the automatically extracted deep feature from ad content to predict and rank their potential online placement performance. It does not rely on any user-click data, which is scarce during the cold-starting phase. A large-scale system based on our method has been deployed in a real online advertising platform. The online A/B testing shows the ads system with PEAC pre-ranking obtains significant improvement in revenue gain compared to the prior system. Furthermore, we provide detailed analyses on what the model learned, which gives further suggestions to improve ad creative design.},
}

@InProceedings{wang2019towards,
  author    = {Wang, Mingxuan and Xie, Jun and Tan, Zhixing and Su, Jinsong and Xiong, Deyi and Li, Lei},
  booktitle = {the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {Towards Linear Time Neural Machine Translation with {Capsule} Networks},
  year      = {2019},
  month     = nov,
  abstract  = {In this study, we first investigate a novel capsule network with dynamic routing for linear time Neural Machine Translation (NMT), referred as CAPSNMT. CAPSNMT uses an aggregation mechanism to map the source sentence into a matrix with pre-determined size, and then applys a deep LSTM network to decode the target sequence from the source representation. Unlike the previous work (Sutskever et al., 2014) to store the source sentence with a passive and bottom-up way, the dynamic routing policy encodes the source sentence with an iterative process to decide the credit attribution between nodes from lower and higher layers. CAPSNMT has two core properties: it runs in time that is linear in the length of the sequences and provides a more flexible way to aggregate the part-whole information of the source sentence. On WMT14 English-German task and a larger WMT14 English-French task, CAPSNMT achieves comparable results with the Transformer system. We also devise new hybrid architectures intended to combine the strength of CAPSNMT and the RNMT model. Our hybrid models obtain state-of-the-arts results on both benchmark datasets. To the best of our knowledge, this is the first work that capsule networks have been empirically investigated for sequence to sequence problems},
  eprint    = {https://arxiv.org/abs/1811.00287},
}

@InProceedings{jiang2019svd,
  author    = {Jiang, Qing-Yuan and He, Yi and Li, Gen and Lin, Jian and Li, Lei and Li., Wu-Jun},
  booktitle = {International Conference on Computer Vision (ICCV)},
  title     = {{SVD}: A Large-Scale Short Video Dataset for Near Duplicate Video Retrieval.},
  year      = {2019},
  month     = oct,
  abstract  = {With the explosive growth of video data in real applications, near-duplicate video retrieval (NDVR) has become indispensable and challenging, especially for short videos. However, all existing NDVR datasets are introduced for long videos. Furthermore, most of them are small-scale and lack of diversity due to the high cost of collecting and labeling near-duplicate videos. In this paper, we introduce a large-scale short video dataset, called SVD, for the NDVR task. SVD contains over 500,000 short videos and over 30,000 labeled videos of near-duplicates. We use multiple video mining techniques to construct positive/negative pairs. Furthermore, we design temporal and spatial transformations to mimic user-attack behavior in real applications for constructing more difficult variants of SVD. Experiments show that existing state-of-the-art NDVR methods, including real-value based and hashing based methods, fail to achieve satisfactory performance on this challenging dataset. The release of SVD dataset will foster research and system engineering in the NDVR area. The SVD dataset is available at https://svdbase.github.io.},
  url       = {https://svdbase.github.io},
}

@InProceedings{wang2019vatex,
  author    = {Wang, Xin and Wu, Jiawei and Chen, Junkun and Li, Lei and Wang, Yuan-Fang and Wang, William Yang},
  booktitle = {International Conference on Computer Vision (ICCV)},
  title     = {{VATEX}: A Large-Scale, High-Quality Multilingual Dataset for Video-and-Language Research},
  year      = {2019},
  month     = oct,
  abstract  = {We present a new large-scale multilingual video description dataset, VATEX, which contains over 41,250 videos and 825,000 captions in both English and Chinese. Among the captions, there are over 206,000 English-Chinese parallel translation pairs. Compared to the widely-used MSR-VTT dataset, VATEX is multilingual, larger, linguistically complex, and more diverse in terms of both video and natural language descriptions. We also introduce two tasks for video-and-language research based on VATEX: (1) Multilingual Video Captioning, aimed at describing a video in various languages with a compact unified captioning model, and (2) Video-guided Machine Translation, to translate a source language description into the target language using the video information as additional spatiotemporal context. Extensive experiments on the VATEX dataset show that, first, the unified multilingual model can not only produce both English and Chinese descriptions for a video more efficiently, but also offer improved performance over the monolingual models. Furthermore, we demonstrate that the spatiotemporal video context can be effectively utilized to align source and target languages and thus assist machine translation. In the end, we discuss the potentials of using VATEX for other video-and-language research.},
  addendum  = {Oral},
  url       = {https://eric-xw.github.io/vatex-website/index.html},
}

@InProceedings{fu2019rethinking,
  author    = {Fu, Yao and Zhou, Hao and Chen, Jiaze and Li, Lei},
  booktitle = {the 12th International Conference on Natural Language Generation (INLG)},
  title     = {Rethinking Text Attribute Transfer: A Lexical Analysis},
  year      = {2019},
  month     = oct,
  abstract  = {Text attribute transfer is modifying certain linguistic attributes (e.g. sentiment, style, authorship, etc.) of a sentence and transforming them from one type to another. In this paper, we aim to analyze and interpret what is changed during the transfer process. We start from the observation that in many existing models and datasets, certain words within a sentence play important roles in determining the sentence attribute class. These words are referred to as the Pivot Words. Based on these pivot words, we propose a lexical analysis framework, the Pivot Analysis, to quantitatively analyze the effects of these words in text attribute classification and transfer. We apply this framework to existing datasets and models, and show that: (1) the pivot words are strong features for the classification of sentence attributes; (2) to change the attribute of a sentence, many datasets only requires to change certain pivot words; (3) consequently, many transfer models only perform the lexical-level modification, while leaving higher-level sentence structures unchanged. Our work provides an in-depth understanding of linguistic attribute transfer and further identifies the future requirements and challenges of this task.},
  code      = {https://github.com/FranxYao/pivot_analysis},
}

@InProceedings{lu2019uncovering,
  author    = {Lu, Yunfei and Yu, Linyun and Cui, Peng and Zang, Chengxi and Xu, Renzhe and Liu, Yihao and Li, Lei and Zhu, Wenwu},
  booktitle = {the 25th SIGKDD Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {Uncovering the Co-driven Mechanism of Social and Content Links in User Churn Phenomena},
  year      = {2019},
  address   = {New York, NY, USA},
  month     = aug,
  publisher = {ACM},
  abstract  = {Recent years witness the merge of social networks and user-generatedcontent (UGC) platforms. In these new platforms, users establishlinks to others not only driven by their social relationships in thephysical world but also driven by the contents published by others.During this merging process, social networks gradually integrateboth social and content links and become unprecedentedly complicated,with the motivation to exploit both the advantages of socialviscosity and content attractiveness to reach the best customerretention situation. However, due to the lack of fine-grained datarecording such merging phenomena, the co-driven mechanism ofsocial and content links in churn phenomena remains unexplored.How do social and content factors jointly influence customers’churn? What is the best ratio of social and content links for a user’sretention? Is there a model to capture this co-driven mechanism inusers’ churn phenomena?In this paper, we collect a real-world dataset with more than 5.77million users and 925 million links, with each link being tagged asa social one or a content one. We find that both social and contentlinks have a significant impact on users’ churn and theywork jointlyas a complicated mixture effect. As a result, we propose a novelsurvival model, which incorporates both social and content factors,to predict churn probability over time. Our model successfully fitsthe churn distribution in reality and accurately predicts the churnrate of different subpopulations in the future. By analyzing themodeling parameters, we try to strike a balance between socialdrivenand content-driven links in a user’s social network to reachthe lowest churn rate. Our model and findings may have potentialimplications for the design of future social media.},
  owner     = {leili},
  timestamp = {2019-04-30},
}

@InProceedings{sun2019graspsnooker,
  author       = {Sun, Zhaoyue and Chen, Jiaze and Zhou, Hao and Zhou, Deyu and Li, Lei and Jiang, Mingmin},
  booktitle    = {the 28th International Joint Conference on Artificial Intelligence ({IJCAI})},
  title        = {{GraspSnooker}: Automatic {Chinese} Commentary Generation for Snooker Videos},
  year         = {2019},
  month        = aug,
  pages        = {6569--6571},
  addendum     = {Demo},
  doi          = {https://doi.org/10.24963/ijcai.2019/959},
  entrysubtype = {demo},
}

@InProceedings{weng2019correct,
  author    = {Weng, Rongxiang and Zhou, Hao and Huang, Shujian and Li, Lei and Xia, Yifan and Chen, Jiajun},
  booktitle = {the 28th International Joint Conference on Artificial Intelligence (IJCAI)},
  title     = {{Correct-and-Memorize}: Learning to Translate from Interactive Revisions},
  year      = {2019},
  month     = aug,
  abstract  = {State-of-the-art machine translation models are stillnot on a par with human translators. Previous worktakes human interactions into the neural machine translation process to obtain improved results in target languages. However, not all model-translation errors are equal some are critical while others are minor. In the mean while, same translation mistakes occur repeatedly in similar context. To solve bothissues, we propose CAMIT, a novel method for translating in an interactive environment. Our proposed method works with critical revision instructions,therefore allows human to correct arbitrary words in model-translated sentences. In addition,CAMIT learns from and softly memorizes revision actions based on the context, alleviating the issue of repeating mistakes. Experiments in both ideal and real interactive translation settings demonstrate that our proposed CAMIT enhances machine translation results significantly while requires fewer revision instructions from human compared to previous methods.},
  addendum  = {Oral},
  code      = {https://github.com/wengrx/CAMIT},
}

@InProceedings{bao2019generating,
  author    = {Bao, Yu and Zhou, Hao and Huang, Shujian and Li, Lei and Mou, Lili and Vechtomova, Olga and Dai, Xinyu and Chen, Jiajun},
  booktitle = {the 57th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Generating Sentences from Disentangled Syntactic and Semantic Spaces},
  year      = {2019},
  month     = jul,
  abstract  = {Variational auto-encoders (VAEs) are widely used in natural language generation due to the regularization of the latent space. However, generating sentences from the continuous latent space does not explicitly model the syntactic information. In this paper, we propose to generate sentences from disentangled syntactic and semantic spaces. Our proposed method explicitly models syntactic information in the VAE’s latent space by using the linearized tree sequence, leading to better performance of language generation. Additionally, the advantage of sampling in the disentangled syntactic and semantic latent spaces enables us to perform novel applications, such as the unsupervised paraphrase generation and syntax-transfer generation. Experimental results show that our proposed model achieves similar or better performance in various tasks, compared with state-of-the-art related work.},
  code      = {https://github.com/baoy-nlp/DSS-VAE},
}

@InProceedings{qiu2019dynamically,
  author    = {Qiu, Lin and Xiao, Yunxuan and Qu, Yanru and Zhou, Hao and Li, Lei and Zhang, Weinan and Yu, Yong},
  booktitle = {the 57th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {Dynamically Fused Graph Network for Multi-hop Reasoning},
  year      = {2019},
  month     = jul,
  abstract  = {Text-based question answering (TBQA) has been studied extensively in recent years. Most existing approaches focus on finding the answer to a question within a single paragraph. However, many difficult questions require multiple supporting evidence from scattered text across two or more documents. In this paper, we propose the Dynamically Fused Graph Network (DFGN), a novel method to answer those questions requiring multiple scattered evidence and reasoning over them. Inspired by human’s step-by-step reasoning behavior, DFGN includes a dynamic fusion layer that starts from the entities mentioned in the given query, explores along the entity graph dynamically built from the text, and gradually finds relevant supporting entities from the given documents. We evaluate DFGN on HotpotQA, a public TBQA dataset requiring multi-hop reasoning. DFGN achieves competitive results on the public board. Furthermore, our analy- sis shows DFGN could produce interpretable reasoning chains.},
  code      = {https://github.com/woshiyyya/DFGN-pytorch},
}

@InProceedings{zhang2019generating,
  author    = {Zhang, Huangzhao and Zhou, Hao and Miao, Ning and Li, Lei},
  booktitle = {the 57th Annual Meeting of the Association for Computational Linguistics (ACL) - short papers},
  title     = {Generating Fluent Adversarial Examples for Natural Languages},
  year      = {2019},
  month     = jul,
  abstract  = {Efficiently building an adversarial attacker fornatural language processing (NLP) tasks is areal challenge. Firstly, as the sentence spaceis discrete, it is difficult to make small perturbations along the direction of gradients. Secondly,the fluency of the generated examples can not be guaranteed. In this paper, we propose MHA, which addresses both problemsby performing Metropolis-Hastings sampling,whose proposal is designed with the guidanceof gradients. Experiments on IMDB and SNLIshow that our proposed MHA outperforms thebaseline model on attacking capability. Adversarial training with MHA also leads to better robustness and performance.},
}

@InProceedings{wu2019unified,
  author    = {Wu, Hao and Mao, Jiayuan and Zhang, Yufeng and Jiang, Yuning and Li, Lei and Sun, Weiwei and Ma, Weiying},
  booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
  title     = {Unified Visual-Semantic Embeddings: Bridging Vision and Language with Structured Meaning Representations},
  year      = {2019},
  month     = jun,
  abstract  = {We propose Unified Visual-Semantic Embeddings (VSE)
for learning a joint space for scene representation and textual
semantics. It unifies the embeddings of concepts at different
levels: objects, attributes, relations and full scenes. We
view the sentential semantics as a combination of different
semantic components such as object or relational descriptors,
and align their embeddings with different regions of a
scene. A contrastive learning approach is proposed for the
effective learning of such fine-grained alignment from only
image-caption pairs. We also present a simple yet effective
approach that enforces the coverage of caption embeddings
on the semantic components that appear in the sentence. We
demonstrate that the Unified VSE outperforms other baselines
on cross-modal retrieval tasks and the enforcement
of the semantic coverage improves models’ robustness in
defending text-domain adversarial attacks. Moreover, such
robustness empowers the use of visual cues to accurately
resolve word dependencies in novel sentences.},
  addendum  = {Oral},
}

@Patent{jiang2019construction,
  nationality = {CN},
  number      = {201710388497.2},
  year        = {2019},
  yearfiled   = {2017},
  assignee    = {Beijing Bytedance Network Tech Co.},
  author      = {Jiang, Dong and Zhao, Yanbin and Hou, Shuang and Xia, Xuhong and Li, Lei and Hong, Dingkun},
  day         = {5},
  dayfiled    = {27},
  month       = mar,
  monthfiled  = may,
  title       = {Construction method and device of voice classification model.},
  comment     = {一种语音分类模型的构建方法及装置},
  owner       = {lilei.02},
}

@InProceedings{miao2019cgmh,
  author    = {Miao, Ning and Zhou, Hao and Mou, Lili and Yan, Rui and Li, Lei},
  booktitle = {the 33rd {AAAI} Conference on Artificial Intelligence (AAAI)},
  title     = {{CGMH}: Constrained Sentence Generation by Metropolis-Hastings Sampling},
  year      = {2019},
  month     = jan,
  abstract  = {In real-world applications of natural language generation,
there are often constraints on the target sentences in addition
to fluency and naturalness requirements. Existing language
generation techniques are usually based on recurrent
neural networks (RNNs). However, it is non-trivial to impose
constraints on RNNs while maintaining generation quality,
since RNNs generate sentences sequentially (or with beam
search) from the first word to the last. In this paper, we propose
CGMH, a novel approach using Metropolis-Hastings
sampling for constrained sentence generation. CGMH allows
complicated constraints such as the occurrence of multiple
keywords in the target sentences, which cannot be handled in
traditional RNN-based approaches. Moreover, CGMH works
in the inference stage, and does not require parallel corpora
for training.We evaluate our method on a variety of tasks, including
keywords-to-sentence generation, unsupervised sentence
paraphrasing, and unsupervised sentence error correction.
CGMH achieves high performance compared with previous
supervised methods for sentence generation. Our code
is released at https://github.com/NingMiao/CGMH},
  addendum  = {Oral},
  code      = {https://github.com/NingMiao/CGMH},
  eprint    = {http://arxiv.org/abs/1811.10996},
  slides    = {pubs/miao2019cgmh-ppt.pdf},
}

@InProceedings{cao2018brits,
  author    = {Cao, Wei and Wang, Dong and Li, Jian and Zhou, Hao and Li, Yitan and Li, Lei},
  booktitle = {the 32nd Conference on Neural Information Processing Systems (NeurIPS)},
  title     = {{BRITS}: Bidirectional Recurrent Imputation for Time Series},
  year      = {2018},
  month     = dec,
  abstract  = {Time series are widely used as signals in many classification/regression tasks. It is ubiquitous that time series contains many missing values. Given multiple correlated time series data, how to fill in missing values and to predict their class labels? Existing imputation methods often impose strong assumptions of the underlying data generating process, such as linear dynamics in the state space. In this paper, we propose BRITS, a novel method based on recurrent neural networks for missing value imputation in time series data. Our proposed method directly learns the missing values in a bidirectional recurrent dynamical system, without any specific assumption. The imputed values are treated as variables of RNN graph and can be effectively updated during the backpropagation.BRITS has three advantages: (a) it can handle multiple correlated missing values in time series; (b) it generalizes to time series with nonlinear dynamics underlying; (c) it provides a data-driven imputation procedure and applies to general settings with missing data.We evaluate our model on three real-world datasets, including an air quality dataset, a health-care data, and a localization data for human activity. Experiments show that our model outperforms the state-of-the-art methods in both imputation and classification/regression accuracies.},
  code      = {https://github.com/caow13/BRITS},
  eprint    = {https://arxiv.org/abs/1805.10572},
}

@InProceedings{shi2018tree,
  author    = {Shi, Haoyue and Zhou, Hao and Chen, Jiaze and Li, Lei},
  booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  title     = {On Tree-Based Neural Sentence Modeling},
  year      = {2018},
  month     = oct,
  abstract  = {Neural networks with tree-based sentence encoders have shown better results on many downstream tasks. Most of existing tree-based encoders adopt syntactic parsing trees as the explicit structure prior. To study the effectiveness of different tree structures, we replace the parsing trees with trivial trees (i.e., binary balanced tree, left-branching tree and right-branching tree) in the encoders. Though trivial trees contain no syntactic information, those encoders get competitive or even better results on all of the ten downstream tasks we investigated. This surprising result indicates that explicit syntax guidance may not be the main contributor to the superior performances of tree-based neural sentence modeling. Further analysis show that tree modeling gives better results when crucial words are closer to the final representation. Additional experiments give more clues on how to design an effective tree-based encoder.},
  code      = {https://github.com/ExplorerFreda/TreeEnc},
  eprint    = {https://arxiv.org/abs/1808.09644},
  owner     = {lilei.02},
}

@InProceedings{li2018overview,
  author    = {Lei Li and Xiaojun Wan},
  booktitle = {Proc. of NLPCC},
  title     = {Overview of the NLPCC 2018 shared task: Single document summarization},
  year      = {2018},
  month     = oct,
}

@InProceedings{li2018jersey,
  author       = {Li, Gen and Xu, Shikun and Liu, Xiang and Li, Lei and Wang, Changhu},
  booktitle    = {IEEE Conference on Computer Vision and Pattern Recognition workshops, Computer Vision in Sports},
  title        = {Jersey Number Recognition with Semi-Supervised Spatial Transformer Network},
  year         = {2018},
  month        = jun,
  pages        = {1864 --1871},
  abstract     = {It is still a challenging task to recognize the jersey number 
of players on the court in soccer match videos, as the
jersey numbers are very small in the object detection task
and annotated data are not easy to collect. Based on the
object detection results of all the players on the court, a
CNN model is first introduced to classify these numbers on
the deteced players’ images. To localize the jersey number
more precisely without involving another digit detector and
extra consumption, we then improve the former network to
an end-to-end framework by fusing with the spatial transformer
network (STN). To further improve the accuracy, we
bring extra supervision to STN and upgrade the model to
a semi-supervised multi-task learning system, by labeling a
small portion of the number areas in the dataset by quadrangle.
Extensive experiments illustrate the effectiveness of
the proposed framework.},
  entrysubtype = {workshop},
}

@InProceedings{wu2018reinforced,
  author    = {Wu, Jiawei and Li, Lei and Wang, William Yang},
  booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (NAACL-HLT)},
  title     = {Reinforced Co-Training},
  year      = {2018},
  month     = jun,
  pages     = {1252--1262},
  publisher = {Association for Computational Linguistics},
  abstract  = {Co-training is a popular semi-supervised learning framework to utilize a
large amount of unlabeled data in addition to a small labeled set. Co-training methods exploit predicted labels on the unlabeled data and select samples based on prediction confidence to augment the training. However, the selection of samples in existing co-training methods is based on a predetermined policy, which ignores the sampling bias between the unlabeled and the labeled subsets, and fails to explore the data space. In this paper, we propose a novel method, Reinforced Co-Training, to select high-quality unlabeled samples to better co-train on. More specifically, our approach uses Q-learning to learn a data selection policy with a small labeled dataset, and then exploits this policy to train the co-training classifiers automatically. Experimental results on
clickbait detection and generic text classification tasks demonstrate that our proposed method can obtain more accurate text classification results.},
  eprint    = {http://arxiv.org/abs/1804.06035},
  location  = {New Orleans, Louisiana},
  timestamp = {2018-05-07T10:09:16.000+0200},
}

@InProceedings{hua2017overview,
  author    = {Lifeng Hua and Xiaojun Wan and Lei Li},
  booktitle = {Proc. of NLPCC},
  title     = {Overview of the NLPCC 2017 shared task: Single document summarization},
  year      = {2017},
  month     = oct,
}

@InProceedings{erol2017nearly,
  author    = {Erol, Yusuf and Wu, Yi and Li, Lei and Russell, Stuart},
  booktitle = {the 31st AAAI Conference on Artificial Intelligence (AAAI)},
  title     = {A Nearly-Black-Box Online Algorithm for Joint Parameter and State Estimation in Temporal Models},
  year      = {2017},
  month     = jul,
  abstract  = {Online joint parameter and state estimation is a core problem for temporal models. Most existing methods are either restricted to a particular class of models (e.g., the Storvik filter) or computationally expensive (e.g., particle MCMC). We propose a novel nearly-black-box algorithm, the Assumed Parameter Filter (APF), a hybrid of particle filtering for state variables and assumed density filtering for parameter variables. It has the following advantages: (a) it is online and computationally efficient; (b) it is applicable to both discrete and continuous parameter spaces with arbitrary transition dynamics. On a variety of toy and real models, APF generates more accurate results within a fixed computation budget compared to several standard algorithms from the literature.},
  comment   = {The earlier version appeared in NIPS 2016 workshop on Advances in Approximate Bayesian Inference.},
  file      = {Appendix:pubs/erol-aaai2017-apf-appendix.pdf:PDF},
  owner     = {lilei.02},
  timestamp = {2017-01-20},
  url       = {pubs/erol-aaai2017-apf-appendix.pdf},
}

@Article{matsubara2017non,
  author   = {Matsubara, Yasuko and Sakurai, Yasushi and Prakash, B. Aditya and Li, Lei and Faloutsos, Christos},
  journal  = {ACM Transactions on the Web},
  title    = {Non-linear Dynamics of Information Diffusion in Social Networks},
  year     = {2017},
  month    = feb,
  number   = {1},
  volume   = {11},
  abstract = {The recent explosion in the adoption of search engines and new media such as blogs and Twitter have facilitated the faster propagation of news and rumors. How quickly does a piece of news spread over these media? How does its popularity diminish over time? Does the rising and falling pattern follow a simple universal law? In this paper, we propose SPIKEM, a concise yet flexible analytical model of the rise and fall patterns of information diffusion. Our model has the following advantages: (a) unification power: it explains earlier empirical observations and generalizes theoretical models including the SI and SIR models. We provide the threshold of the take-off vs. die-out conditions for SPIKEM, and discuss the generality of our model, by applying it to an arbitrary graph topology; (b) practicality: it matches the observed behavior of diverse sets of real data; (c) parsimony: it requires only a handful of parameters; and (d) usefulness: it makes it possible to perform analytic tasks such as forecasting, spotting anomalies, and interpretation by reverse engineering the system parameters of interest (e.g. quality of news, number of interested bloggers, etc.). We also introduce an efficient and effective algorithm for the real-time monitoring of information diffusion, namely, SPIKESTREAM, which identifies multiple diffusion patterns in a large collection of online event streams. Extensive experiments on real datasets demonstrate that SPIKEM accurately and succinctly describes all the patterns of the rise-and-fall spikes in social networks.},
  comment  = {The earlier version of the paper appeared in KDD'12. This version includes significant extension.},
}

@InProceedings{wu2016swift,
  author    = {Wu, Yi and Li, Lei and Russell, Stuart and Bodik, Rastislav},
  booktitle = {25th International Joint Conference on Artificial Intelligence (IJCAI)},
  title     = {Swift: Compiled Inference for Probabilistic Programming Languages},
  year      = {2016},
  month     = jul,
  abstract  = {A probabilistic program defines a probability measure over its semantic structures. One common goal of probabilistic programming languages (PPLs) is to compute posterior probabilities for arbitrary models and queries, given observed evidence, using a generic inference engine. Most PPL inference engines—even the compiled ones—incur significant runtime interpretation overhead, especially for contingent and open-universe models. This paper describes Swift, a compiler for the BLOG PPL. Swift-generated code incorporates optimizations that eliminate interpretation overhead, maintain dynamic dependencies efficiently, and handle memory management for possible worlds of varying sizes. Experiments comparing Swift with other PPL engines on a variety of inference problems demonstrate speedups ranging from 12x to 326x.},
  code      = {https:/github.com/lileicc/swift},
  eprint    = {http://arxiv.org/abs/1606.09242},
  owner     = {leili},
  timestamp = {2016.07.04},
}

@InProceedings{dai2016cfo,
  author    = {Dai, Zihang and Li, Lei and Xu, Wei},
  booktitle = {the 54th Annual Meeting of the Association for Computational Linguistics (ACL)},
  title     = {{CFO}: Conditional Focused Neural Question Answering with Large-scale Knowledge Bases},
  year      = {2016},
  month     = jul,
  abstract  = {How can we enable computers to automatically answer questions like ``Who created the character Harry Potter''? Carefully built knowledge bases provide rich sources of facts. However, it remains a challenge to answer factoid questions raised in natural language due to numerous expressions of one question. In particular, we focus on the most common questions --- ones that can be answered with a single fact in the knowledge base. We propose CFO, a Conditional Focused neural-network-based approach to answering factoid questions with knowledge bases. Our approach first zooms in a question to find more probable candidate subject mentions, and infers the final answers with a unified conditional probabilistic framework. Powered by deep recurrent neural networks and neural embeddings, our proposed CFO achieves an accuracy of 75.7\% on a dataset of 108k questions - the largest public one to date. It outperforms the current state of the art by an absolute margin of 11.8\%.},
  code      = {https:/github.com/zihangdai/CFO},
  eprint    = {https:/arxiv.org/abs/1606.01994},
  owner     = {leili},
  timestamp = {2016.07.04},
}

@Misc{lu2015twisted,
  author    = {Lu, Zefu and Li, Lei and Xu, Wei},
  title     = {Twisted Recurrent Network for Named Entity Recognition},
  year      = {2015},
  booktitle = {Bay Area Machine Learning Symposium},
  owner     = {leili},
  timestamp = {2015.11.03},
}

@Misc{pham2015optimization,
  author    = {Pham, Hieu and Dai, Zihang and Li, Lei},
  title     = {On Optimization Algorithms for Recurrent Networks with Long Short-Term Memory},
  year      = {2015},
  booktitle = {Bay Area Machine Learning Symposium},
  owner     = {leili},
  timestamp = {2015.11.03},
}

@InProceedings{wu2014bfit,
  author       = {Wu, Yi and Li, Lei and Russell, Stuart},
  booktitle    = {Neural Information Processing Systems, Probabilistic Programming workshop},
  title        = {{BFiT}: From Possible-World Semantics to Random-Evaluation Semantics in Open Universe},
  year         = {2014},
  abstract     = {In recent years, several probabilistic programming languages (PPLs) have emerged, such as Bayesian Logic (BLOG), Church, and Figaro. These languages can be classified into two categories: PPLs interpreted using possible-world se- mantics and ones using random-evaluation semantics. In this paper, we explic- itly analyze the equivalence between these two semantics in the context of open- universe probability models (OUPMs). We propose a novel dynamic memoization technique to construct OUPMs using procedural instructions in random-evaluation based PPLs. We implemented a translator named BFiT, which converts code in BLOG (possible-world based) to Figaro (random-evaluation based). The trans- lated program in Figaro exhibits a merely constant blowup factor in program size while yielding the same inference results as the original model in BLOG.},
  entrysubtype = {workshop},
  file         = {:pubs/wu2014bfit - BFiT_ From Possible-World Semantics to Random-Evaluation Semantics in Open Universe.pdf:PDF},
  owner        = {leili},
  slides       = {pubs/wu-2014-bfit-poster.pdf},
  timestamp    = {2014.12.16},
}

@InProceedings{du2014maxios,
  author       = {Du, Simon Shaolei and Liu, Yilin and Chen, Boyi and Li, Lei},
  booktitle    = {Neural Information Processing Systems, workshop on Distributed Machine Learning and Matrix Computations},
  title        = {Maxios: Large Scale Nonnegative Matrix Factorization for Collaborative Filtering},
  year         = {2014},
  abstract     = {Nonnegative matrix factorization proved useful in many applications, including collaborative filtering – from existing ratings data one would like to predict new product ratings by users. However, factorizing a user-product score matrix is computation and memory intensive. We propose Maxios, a novel approach to fill missing values for large scale and highly sparse matrices efficiently and ac- curately. We formulate the matrix-completion problem as weighted nonnegative matrix factorization. In addition, we develop distributed update rules using alter- nating direction method of multipliers. We have implemented the Maxios system on top of Spark, a distributed in-memory computation framework. Experiments on commercial clusters show that Maxios is competitive in terms of scalability and accuracy against the existing solutions on a variety of datasets.},
  entrysubtype = {workshop},
  file         = {:pubs/du2014maxios - Maxios_ Large Scale Nonnegative Matrix Factorization for Collaborative Filtering.pdf:PDF},
  owner        = {leili},
  slides       = {pubs/du-2014-maxios-poster.pdf},
  timestamp    = {2014.12.16},
}

@InProceedings{juan2014poisson,
  author    = {Juan, Da-Cheng and Li, Lei and Peng, Huan-Kai and Marculescu, Diana and Faloutsos, Christos},
  booktitle = {The Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
  title     = {Beyond {Poisson}: Modeling Inter-Arrival Times of Requests in a Datacenter},
  year      = {2014},
  file      = {:pubs/juan2014poisson - Beyond Poisson_ Modeling Inter-Arrival Times of Requests in a Datacenter.pdf:PDF},
  owner     = {leili},
  timestamp = {2014.01.07},
}

@TechReport{li2013blog,
  author      = {Li, Lei and Russell, Stuart},
  institution = {EECS Department, University of California, Berkeley},
  title       = {The BLOG Language Reference},
  year        = {2013},
  month       = may,
  number      = {UCB/EECS-2013-51},
  abstract    = {This document introduces the syntax of BLOG, a probabilistic programming language, for describing random variables and their probabilistic dependencies. BLOG defines probabilistic generative models over first-order structures. For example, all Bayesian networks can be easily described by BLOG. BLOG has the following features: (a) it employs open-universe semantics; (b) it can describe relational uncertainty; (c) it can handle identity uncertainty; and (d) it is empowered by first-order logic. The syntax as described in this document corresponds to BLOG version 0.6. The current version represents a significant redesign and extension to previous versions of BLOG, based on the principles of usability and implementation efficiency.},
  file        = {:pubs/li2013blog - The BLOG Language Reference.pdf:PDF;Tech report:http\:/www.eecs.berkeley.edu/Pubs/TechRpts/2013/EECS-2013-51.html:URL},
}

@InProceedings{li2013dynamic,
  author    = {Li, Lei and Ramsundar, Bharath and Russell, Stuart},
  booktitle = {16th International Conference on Artificial Intelligence and Statistics (AISTATS)},
  title     = {Dynamic Scaled Sampling for Deterministic Constraints},
  year      = {2013},
  abstract  = {Deterministic and near-deterministic relationships among subsets of random variables in multivariate systems are known to cause serious problems for Monte Carlo algorithms. We examine the case in which the relationship Z = f(X1,...,Xk) holds, where each Xi has a continuous prior pdf and we wish to obtain samples from the conditional distribution P(X1,...,Xk | Z = s). When f is addition, the problem is NP-hard even when the Xi are independent. In more restricted cases—for example, i.i.d. Boolean or categorical Xi—efficient exact samplers have been obtained previously. For the general continuous case, we propose a dynamic scaling algorithm (DYSC), and prove that it has O(k) expected running time and finite variance. We discuss generalizations of DYSC to functions f described by binary operation trees. We evaluate the algorithm on several examples.},
  file      = {:pubs/li2013dynamic - Dynamic Scaled Sampling for Deterministic Constraints.pdf:PDF},
  owner     = {leili},
  timestamp = {2013.03.05},
}

@InProceedings{vikram2013handwriting,
  author    = {Vikram, Sharad and Li, Lei and Russell, Stuart},
  title     = {Handwriting and Gestures in the Air, Recognizing on the Fly},
  booktitle = {ACM Conference on Human Factors in Computing Systems (CHI) Extended Abstracts},
  year      = {2013},
  abstract  = {Recent technologies in vision sensors are capable of capturing 3D finger positions and movements. We propose a novel way to control and interact with computers by moving fingers in the air. The positions of fingers are precisely captured by a computer vision device. By tracking the moving patterns of fingers, we can then recognize users’ intended control commands or input information. We demonstrate this human input approach through an example application of handwriting recognition. By treating the input as a time series of 3D positions, we propose a fast algorithm using dynamic time warping to recognize characters in online fashion. We employ various optimization techniques to recognize in real time as one writes. Experiments show promising recognition performance and speed.},
  file      = {:pubs/vikram2013handwriting - Handwriting and Gestures in the Air, Recognizing on the Fly.pdf:PDF},
  owner     = {leili},
  timestamp = {2013.03.05},
}

@InProceedings{liu2013hibernating,
  author    = {Liu, Siyuan and Li, Lei and Krishnan, Ramayya},
  booktitle = {IEEE International Conference on Data Mining (ICDM)},
  title     = {Hibernating Process: Modelling Mobile Calls at Multiple Scales},
  year      = {2013},
  file      = {:pubs/liu2013hibernating - Hibernating Process_ Modelling Mobile Calls at Multiple Scales.pdf:PDF},
  owner     = {leili},
  timestamp = {2013.11.26},
}

@InProceedings{erol2013extended,
  author    = {Erol, Yusuf and Li, Lei and Ramsundar, Bharath and Russell, Stuart},
  booktitle = {Proceedings of the 30th International Conference on Machine learning (ICML)},
  title     = {The Extended Parameter Filter},
  year      = {2013},
  abstract  = {The parameters of temporal models, such as dynamic Bayesian networks, may be modelled in a Bayesian context as static or atemporal variables that influence transition probabilities at every time step. Particle filters fail for models that include such variables, while methods that use Gibbs sampling of parameter variables may incur a per-sample cost that grows linearly with the length of the observation sequence. Storvik devised a method for incremental computation of exact sufficient statistics that, for some cases, reduces the per-sample cost to a constant. In this paper, we demonstrate a connection between Storvik's filter and a Kalman filter in parameter space and establish more general conditions under which Storvik's filter works. Drawing on an analogy to the extended Kalman filter, we develop and analyze, both theoretically and experimentally, a Taylor approximation to the parameter posterior that allows Storvik's method to be applied to a broader class of models. Our experiments on both synthetic examples and real applications show improvement over existing methods.},
  comment   = {The full version appeared as Tech. Rep. UCB/EECS-2013-48.},
  owner     = {leili},
  timestamp = {2013.05.08},
}

@InProceedings{fu2013why,
  author    = {Fu, Bin and Lin, Jialiu and Li, Lei and Faloutsos, Christos and Hong, Jason and Sadeh, Norman},
  booktitle = {the 19th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {Why People Hate Your App - Making Sense of User Feedback in a Mobile App Store},
  year      = {2013},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {User review is a crucial component of open mobile app mar- kets such as the Google Play Store. How do we automatically summarize millions of user reviews and make sense out of them? Unfortunately, beyond simple summaries such as histograms of user ratings, there are few analytic tools that can provide insights into user reviews. In this paper, we propose WisCom, a system that can analyze tens of millions user ratings and comments in mobile app markets at three different levels of detail. Our system is able to (a) discover inconsistencies in reviews; (b) identify reasons why users like or dislike a given app, and provide an interactive, zoomable view of how users’ reviews evolve over time; and (c) provide valuable insights into the entire app market, identifying users’ major concerns and preferences of different types of apps. Results using our techniques are reported on a 32GB dataset consisting of over 13 million user reviews of 171,493 Android apps in the Google Play Store. We discuss how the techniques presented herein can be deployed to help a mobile app market operator such as Google as well as individual app developers and end-users.},
  owner     = {leili},
  timestamp = {2013.06.18},
}

@InProceedings{rogers2013multilinear,
  author    = {Rogers, Mark and Li, Lei and Russell, Stuart},
  booktitle = {the 27th Conference on Neural Information Processing Systems(NeurIPS)},
  title     = {Multilinear Dynamical Systems for Tensor Time Series},
  year      = {2013},
  abstract  = {Data in the sciences frequently occur as sequences of multidimensional arrays called tensors. How can hidden, evolving trends in such data be extracted while preserving the tensor structure? The model that is traditionally used is the linear dynamical system (LDS) with Gaussian noise, which treats the latent state and observation at each time slice as a vector. We present the multilinear dynamical system (MLDS) for modeling tensor time series and an expectation–maximization (EM) algorithm to estimate the parameters. The MLDS models each tensor observation in the time series as the multilinear projection of the corresponding member of a sequence of latent tensors. The latent tensors are again evolving with respect to a multilinear projection. Compared to the LDS with an equal number of parameters, the MLDS achieves higher prediction accuracy and marginal likelihood for both artificial and real datasets.},
  code      = {https://github.com/lileicc/mlds},
  owner     = {leili},
  timestamp = {2013.09.11},
  url       = {mlds/},
}

@InProceedings{matsubara2013f,
  author    = {Matsubara, Yasuko and Li, Lei and Papalexakis, Evangelos E. and Lo, David and Sakurai, Yasushi and Faloutsos, Christos},
  booktitle = {The Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)},
  title     = {{F-Trail}: Finding Patterns in Taxi Trajectories},
  year      = {2013},
  pages     = {86--98},
  file      = {:pubs/matsubara-The Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD)13-ftrail.pdf:PDF},
}

@InProceedings{henderson2012rolx,
  author    = {Henderson, Keith and Gallagher, Brian and Eliassi-Rad, Tina and Tong, Hanghang and Basu, Sugato and Akoglu, Leman and Koutra, Danai and Faloutsos, Christos and Li, Lei},
  booktitle = {Proceeding of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {{RolX}: Structural Role Extraction and Mining in Large Graphs},
  year      = {2012},
  address   = {New York, NY, USA},
  publisher = {ACM},
  owner     = {leili},
  timestamp = {2012.07.18},
}

@InProceedings{matsubara2012rise,
  author    = {Matsubara, Yasuko and Sakurai, Yasushi and Prakash, B. Aditya and Li, Lei and Faloutsos, Christos},
  booktitle = {Proceeding of the 18th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {Rise and Fall Patterns of Information Diffusion: Model and Implications},
  year      = {2012},
  address   = {New York, NY, USA},
  publisher = {ACM},
  owner     = {leili},
  timestamp = {2012.07.18},
}

@PhdThesis{li2011fast,
  author       = {Li, Lei},
  title        = {Fast algorithms for mining co-evolving time series},
  school       = {Carnegie Mellon University},
  year         = {2011},
  howpublished = {Available as technical report CMU-CS-11-127},
  owner        = {leili},
  slides       = {pubs/leili-talk2011-defense.pdf},
  timestamp    = {2011.09.01},
}

@InProceedings{liu2011mobile,
  author       = {Liu, Siyuan and Li, Lei and Faloutsos, Christos and Ni, Lionel},
  booktitle    = {IEEE International Conference on Data Mining, workshop on Data Mining Technologies for Computational Collective Intelligence},
  title        = {Mobile Phone Graph Evolution: Findings, Model and Interpretation},
  year         = {2011},
  entrysubtype = {workshop},
  owner        = {leili},
  timestamp    = {2011.11.30},
}

@InProceedings{henderson2011its,
  author    = {Henderson, Keith and Gallagher, Brian and Li, Lei and Akoglu, Leman and Eliassi-Rad, Tina and Tong, Hanghang and Faloutsos, Christos},
  booktitle = {Proceeding of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {It's Who You Know: Graph Mining Using Recursive Structural Features},
  year      = {2011},
  address   = {New York, NY, USA},
  publisher = {ACM},
  isbn      = {978-1-4503-0813-7/11/08},
  location  = {San Diego, California},
  owner     = {leili},
  timestamp = {2011.07.05},
}

@InProceedings{li2011thermocast,
  author    = {Li, Lei and Liang, Chieh-Jan Mike and Liu, Jie and Nath, Suman and Terzis, Andreas and Faloutsos, Christos},
  booktitle = {Proceeding of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)},
  title     = {{ThermoCast}: A Cyber-Physical Forecasting Model for Data Centers},
  year      = {2011},
  address   = {New York, NY, USA},
  publisher = {ACM},
  abstract  = {Efficient thermal management is important in modern data centers as cooling consumes up to 50% of the total energy. Unlike previous work, we consider proactive thermal management, whereby servers can predict potential overheating events due to dynamics in data center configuration and workload, giving operators enough time to react. However, such forecasting is very challenging due to data center scales and complexity. Moreover, such a physical system is influenced by cyber effects, including workload scheduling in servers. We propose ThermoCast, a novel thermal forecasting model to predict the temperatures surrounding the servers in a data center, based on continuous streams of temperature and airflow measurements. Our approach is (a) capable of capturing cyber- physical interactions and automatically learning them from data; (b) computationally and physically scalable to data center scales; (c) able to provide online prediction with real-time sensor mea- surements. The paper’s main contributions are: (i) We provide a systematic approach to integrate physical laws and sensor observa- tions in a data center; (ii) We provide an algorithm that uses sensor data to learn the parameters of a data center’s cyber-physical sys- tem. In turn, this ability enables us to reduce model complexity compared to full-fledged fluid dynamics models, while maintain- ing forecast accuracy; (iii) Unlike previous simulation-based stud- ies, we perform experiments in a production data center. Using real data traces, we show that ThermoCast forecasts temperature 2× better than a machine learning approach solely driven by data, and can successfully predict thermal alarms 4.2 minutes ahead of time.},
  isbn      = {978-1-4503-0813-7/11/08},
  location  = {San Diego, California},
  owner     = {leili},
  slides    = {pubs/li2011thermocast-poster.pdf},
  timestamp = {2011.07.05},
}

@InProceedings{li2011time,
  author    = {Li, Lei and Prakash, B. Aditya},
  booktitle = {Proceedings of the 28th International Conference on Machine Learning (ICML)},
  title     = {Time Series Clustering: Complex is Simpler!},
  year      = {2011},
  code      = {https://github.com/lileicc/dynammo/tree/master/clds},
  comment   = {Please see for updated and additional experiments in Chap 5 of the thesis "Fast algorithms for mining co-evolving time series".},
  location  = {Bellevue, Washington},
  numpages  = {8},
  owner     = {leili},
  slides    = {pubs/li2011time-slides.pdf},
  software  = {software/clds-r347.zip},
  timestamp = {2011.07.05},
}

@InProceedings{sakurai2011windmine,
  author    = {Sakurai, Yasushi and Li, Lei and Matsubara, Yasuko and Faloutsos, Christos},
  booktitle = {SIAM International Conference on Data Mining (SDM)},
  title     = {{WindMine}: Fast and Effective Mining of Web-click Sequences},
  year      = {2011},
  owner     = {leili},
  timestamp = {2011.03.14},
}

@InProceedings{li2010parsimonious,
  author     = {Li, Lei and Prakash, B. Aditya and Faloutsos, Christos},
  booktitle  = {the Proceedings of the Very Large Data Bases Endowment (VLDB)},
  title      = {Parsimonious linear fingerprinting for time series},
  year       = {2010},
  month      = sep,
  pages      = {385--396},
  publisher  = {VLDB Endowment},
  volume     = {3},
  acmid      = {1920893},
  file       = {:pubs/li2010parsimonious - Parsimonious linear fingerprinting for time series.pdf:PDF},
  issn       = {2150-8097},
  issue      = {1-2},
  issue_date = {September 2010},
  numpages   = {12},
  owner      = {leili},
  slides     = {pubs/li-vldb10-plif-slides.pdf},
  software   = {software/plif-r345.zip},
  timestamp  = {2011.03.05},
}

@Article{li2010efficient,
  author    = {Li, Lei and Fu, Bin and Faloutsos, Christos},
  journal   = {IEICE Transactions on Information and Systems},
  title     = {Efficient Parallel Learning of Hidden {Markov} chain Models on {SMP}s},
  year      = {2010},
  number    = {6},
  pages     = {1330--1342},
  volume    = {E93.D},
  abstract  = {Quad-core cpus have been a common desktop configuration for today’s office. The increasing number of processors on a single chip opens new opportunity for parallel computing. Our goal is to make use of the multi-core as well as multi-processor architectures to speed up large-scale data mining algorithms. In this paper, we present a general par- allel learning framework, Cut-And-Stitch, for training hidden Markov chain models. Particularly, we propose two model-specific variants, CAS-LDS for learning linear dynamical systems (LDS) and CAS-HMM for learning hidden Markov models (HMM). Our main contribution is a novel method to handle the data dependencies due to the chain structure of hidden variables, so as to parallelize the EM-based parameter learning algorithm. We imple- ment CAS-LDS and CAS-HMM using OpenMP on two supercomputers and a quad-core commercial desktop. The experimental results show that parallel algorithms using Cut-And-Stitch achieve comparable accuracy and almost linear speedups over the traditional serial version.},
  comment   = {This one is applying the idea from Cut-And-Stitch paper from linear dynamical system to hidden markov models. The extended version can be found in my thesis Chapter 6 and 7.},
  file      = {:pubs/li2010efficient - Efficient Parallel Learning of Hidden Markov Chain Models on SMPs.pdf:PDF},
  owner     = {leili},
  timestamp = {2011.03.05},
}

@InProceedings{li2010fast,
  author       = {Li, Lei},
  booktitle    = {26th IEEE International Conference on Data Engineering, PHD Workshop},
  title        = {Fast Algorithms for Time Series Mining},
  year         = {2010},
  pages        = {341--344},
  entrysubtype = {workshop},
  owner        = {leili},
  slides       = {pubs/li-icde10-slides.pdf},
  timestamp    = {2010.03.21},
}

@InProceedings{henderson2010metric,
  author    = {Henderson, Keith and Eliassi-Rad, Tina and Faloutsos, Christos and Akoglu, Leman and Li, Lei and Maruhashi, Koji and Prakash, B. Aditya and Tong, Hanghang},
  booktitle = {Proceedings of the 16th ACM SIGKDD international conference on Knowledge discovery and data mining (KDD)},
  title     = {Metric forensics: a multi-level approach for mining volatile graphs},
  year      = {2010},
  address   = {New York, NY, USA},
  pages     = {163--172},
  publisher = {ACM},
  series    = {KDD '10},
  doi       = {http://doi.acm.org/10.1145/1835804.1835828},
  isbn      = {978-1-4503-0055-1},
  keywords  = {graph mining, temporal analysis, volatile graphs},
  location  = {Washington, DC, USA},
  numpages  = {10},
}

@InProceedings{li2010bolero,
  author    = {Li, Lei and McCann, James and Pollard, Nancy and Faloutsos, Christos},
  booktitle = {Proceedings of the 2010 ACM SIGGRAPH/Eurographics Symposium on Computer Animation (SCA)},
  title     = {{BoLeRO}: a principled technique for including bone length constraints in motion capture occlusion filling},
  year      = {2010},
  address   = {Aire-la-Ville, Switzerland, Switzerland},
  pages     = {179--188},
  publisher = {Eurographics Association},
  series    = {SCA '10},
  acmid     = {1921454},
  file      = {:pubs/li2010bolero - BoLeRO_ a principled technique for including bone length constraints in motion capture occlusion filling.pdf:PDF;demo:pubs/BoLeRO-final-v1_xvid.avi:URL},
  location  = {Madrid, Spain},
  numpages  = {10},
  owner     = {leili},
  software  = {software/bolero-r349.zip},
  timestamp = {2011.03.05},
}

@Patent{chen2009adaptive,
  nationality = {US},
  number      = {7,634,471},
  year        = {2009},
  yearfiled   = {2006},
  assignee    = {Microsoft},
  author      = {Chen, Zheng and Li, Lei and Lin, Chenxi and Liu, Qiaoling and Wang, Jian and Zhang, Benyu},
  day         = {14},
  dayfiled    = {29},
  month       = dec,
  monthfiled  = {#mar#},
  title       = {Adaptive grouping in a file network},
  owner       = {lilei.02},
}

@Patent{chen2009system,
  nationality = {US},
  number      = {7,624,130},
  year        = {2009},
  yearfiled   = {2006},
  assignee    = {Microsoft},
  author      = {Chen, Zheng and Li, Lei and Lin, Chenxi and Liu, Qiaoling and Wang, Jian and Zhang, Benyu},
  day         = {24},
  dayfiled    = {30},
  month       = nov,
  monthfiled  = {#mar#},
  title       = {System and method for exploring a semantic file network},
  owner       = {lilei.02},
}

@Patent{chen2009extracting,
  nationality = {US},
  number      = {7,502,785},
  year        = {2009},
  yearfiled   = {2006},
  assignee    = {Microsoft},
  author      = {Chen, Zheng and Li, Lei and Lin, Chenxi and Liu, Qiaoling and Wang, Jian and Zhang, Benyu},
  day         = {10},
  dayfiled    = {30},
  month       = mar,
  monthfiled  = {#mar#},
  title       = {Extracting Semantic Attributes},
  owner       = {lilei.02},
}

@InProceedings{li2009dynammo,
  author    = {Li, Lei and McCann, James and Pollard, Nancy and Faloutsos, Christos},
  booktitle = {Proceeding of the 15th ACM SIGKDD international conference on Knowledge discovery and data mining (KDD)},
  title     = {{DynaMMo}: Mining and Summarization of Coevolving Sequences with Missing Values},
  year      = {2009},
  address   = {New York, NY, USA},
  publisher = {ACM},
  code      = {https://github.com/lileicc/dynammo},
  isbn      = {978-1-60558-193-4},
  location  = {Paris, France},
  owner     = {leili},
  slides    = {pubs/li-kdd09-dynammo-slides.pdf},
  software  = {software/dynammo-r346.zip},
  timestamp = {2009.05.01},
}

@InProceedings{guo2009tailoring,
  author       = {Guo, Fan and Li, Lei and Faloutsos, Christos},
  booktitle    = {Proceedings of the 2009 workshop on Web Search Click Data},
  title        = {Tailoring click models to user goals},
  year         = {2009},
  address      = {New York, NY, USA},
  pages        = {88--92},
  publisher    = {ACM},
  series       = {WSCD '09},
  acmid        = {1507523},
  doi          = {http://doi.acm.org/10.1145/1507509.1507523},
  entrysubtype = {workshop},
  isbn         = {978-1-60558-434-8},
  keywords     = {click model, user behavior, web search},
  location     = {Barcelona, Spain},
  numpages     = {5},
}

@InProceedings{guo2008c,
  author    = {Guo, Fan and Li, Lei and Faloutsos, Christos and Xing, Eric P.},
  booktitle = {the Proceedings of the Very Large Data Bases Endowment (VLDB)},
  title     = {{C-DEM}: a Multi-modal Query System for Drosophila Embryo Databases},
  year      = {2008},
  month     = aug,
  pages     = {1508--1511},
  publisher = {VLDB Endowment},
  volume    = {1},
  acmid     = {1454214},
  file      = {:pubs/guo2008c - C-DEM_ a multi-modal query system for Drosophila Embryo databases.pdf:PDF},
  issn      = {2150-8097},
  issue     = {2},
  numpages  = {4},
  owner     = {leili},
  timestamp = {2009.11.20},
}

@InProceedings{xu2008inferring,
  author       = {Xu, Wanhong and Zhou, Xi and Li, Lei},
  booktitle    = {IEEE 24th International Conference on Data Engineering workshops},
  title        = {Inferring privacy information via social relations},
  year         = {2008},
  pages        = {525--530},
  doi          = {10.1109/ICDEW.2008.4498373},
  entrysubtype = {workshop},
}

@InProceedings{li2008cut,
  author    = {Li, Lei and Fu, Wenjie and Guo, Fan and Mowry, Todd C. and Faloutsos, Christos},
  booktitle = {Proceeding of the 14th ACM SIGKDD international conference on Knowledge discovery and data mining (KDD)},
  title     = {{Cut-and-Stitch}: efficient parallel learning of linear dynamical systems on smps},
  year      = {2008},
  address   = {New York, NY, USA},
  pages     = {471--479},
  publisher = {ACM},
  abstract  = {Multi-core processors with ever increasing number of cores per chip are becoming prevalent in modern parallel computing. Our goal is to make use of the multi-core as well as multi-processor architectures to speed up data mining algorithms. Specifically, we present a parallel algorithm for approximate learning of Linear Dynamical Systems (LDS), also known as Kalman Filters (KF). LDSs are widely used in time series analysis such as motion capture modeling and visual tracking etc. We propose Cut-And-Stitch (CAS), a novel method to handle the data dependencies due to the chain structure of hidden variables in LDS, so as to parallelize the EM- based parameter learning algorithm. We implement the algorithm using OpenMP on both a supercomputer and a quad-core commercial desktop. The experimental results show that parallel algorithms using Cut-And-Stitch achieve comparable accuracy and almost linear speedups over the serial version. In addition, Cut-And-Stitch can be generalized to other models with similar linear structures such as Hidden Markov Models (HMM) and Switching Kalman Filters (SKF).},
  isbn      = {978-1-60558-193-4},
  location  = {Las Vegas, Nevada, USA},
  owner     = {leili},
  slides    = {pub/li-2008-cut-slides.pdf},
  software  = {software/paralearn.0.1.zip},
  timestamp = {2009.03.04},
  url       = {paralearn/},
}

@InProceedings{sakurai2008efficient,
  author    = {Sakurai, Yasushi and Chong, Rosalynn and Li, Lei and Faloutsos, Christos},
  booktitle = {SIAM International Conference on Data Mining (SDM)},
  title     = {Efficient Distribution Mining and Classification},
  year      = {2008},
  pages     = {632--643},
  ee        = {http://www.siam.org/proceedings/datamining/2008/dm08_58_sakurai.pdf},
  file      = {:http\:/www.siam.org/proceedings/datamining/2008/dm08_58_sakurai.pdf:URL},
}

@InProceedings{li2008laziness,
  author    = {Li, Lei and McCann, James and Faloutsos, Christos and Pollard, Nancy},
  booktitle = {The 29th Annual Conference of the European Association for Computer Graphics (EG), Short Paper Proceedings},
  title     = {Laziness is a virtue: Motion stitching using effort minimization},
  year      = {2008},
  file      = {:pubs/li2008laziness - Laziness is a virtue_ Motion stitching using effort minimization.pdf:PDF},
  owner     = {leili},
  timestamp = {2009.03.04},
}

@InProceedings{li2006providing,
  author    = {Li, Lei and Liu, Qiaoling and Tao, Yunfeng and Zhang, Lei and Zhou, Jian and Yu, Yong},
  booktitle = {Asia-Pacific Web Conference},
  title     = {Providing an Uncertainty Reasoning Service for Semantic Web Application},
  year      = {2006},
  pages     = {628--639},
}

@Unpublished{baoPreprintpnat,
  author = {Yu Bao and Hao Zhou and Jiangtao Feng and Mingxuan Wang and Shujian Huang and Jiajun Chen and Lei Li},
  note   = {in submission},
  title  = {{PNAT}: Non-autoregressive Transformer by Position Learning},
  year   = {Preprint},
  eprint = {https://arxiv.org/abs/1911.10677},
}

@Unpublished{dongPreprintsurvey,
  author   = {Qingxiu Dong and Lei Li and Damai Dai and Ce Zheng and Zhiyong Wu and Baobao Chang and Xu Sun and Jingjing Xu and Lei Li and Zhifang Sui},
  title    = {A Survey on In-context Learning},
  year     = {Preprint},
  abstract = {With the increasing ability of large language models (LLMs), in-context learning (ICL) has become a new paradigm for natural language processing (NLP), where LLMs make predictions only based on contexts augmented with a few examples. It has been a new trend to explore ICL to evaluate and extrapolate the ability of LLMs. In this paper, we aim to survey and summarize the progress and challenges of ICL. We first present a formal definition of ICL and clarify its correlation to related studies. Then, we organize and discuss advanced techniques, including training strategies, demonstration designing strategies, as well as related analysis. Finally, we discuss the challenges of ICL and provide potential directions for further research. We hope that our work can encourage more research on uncovering how ICL works and improving ICL.},
  eprint   = {https://arxiv.org/abs/2301.00234},
}

@Noshow{liPreprintbidirectional,
  author = {Mingwei Li and Qingyuan Jiang and Yi He and Lei Li and Wujun Li},
  title  = {Bidirectional Attentive Convolutional Neural Network for Near-Duplicate Video Retrieval},
  year   = {Preprint},
}

@Unpublished{songPreprintfunctional,
  author   = {Zhenqiao Song and Yunlong Zhao and Wenxian Shi and Yang Yang and Lei Li},
  title    = {Functional Geometry Guided Protein Sequence and Backbone Structure Co-Design},
  year     = {Preprint},
  abstract = {Proteins are macromolecules responsible for essential functions in almost all living organisms. Designing reasonable proteins with desired functions is crucial. A protein's sequence and structure are strongly correlated and they together determine its function. In this paper, we propose NAEPro, a model to jointly design Protein sequence and structure based on automatically detected functional sites. NAEPro is powered by an interleaving network of attention and equivariant layers, which can capture global correlation in a whole sequence and local influence from nearest amino acids in three dimensional (3D) space. Such an architecture facilitates effective yet economic message passing at two levels. We evaluate our model and several strong baselines on two protein datasets, β-lactamase and myoglobin. Experimental results show that our model consistently achieves the highest amino acid recovery rate, TM-score, and the lowest RMSD among all competitors. These findings prove the capability of our model to design protein sequences and structures that closely resemble their natural counterparts. Furthermore, in-depth analysis further confirms our model's ability to generate highly effective proteins capable of binding to their target metallocofactors.},
  eprint   = {https://arxiv.org/abs/2310.04343},
}

@Unpublished{songPreprintjoint,
  author   = {Zhenqiao Song and Yunlong Zhao and Yufei Song and Wenxian Shi and Yang Yang and Lei Li},
  title    = {Joint Design of Protein Sequence and Structure based on Motifs},
  year     = {Preprint},
  abstract = {Designing novel proteins with desired functions is crucial in biology and chemistry. However, most existing work focus on protein sequence design, leaving protein sequence and structure co-design underexplored. In this paper, we propose GeoPro, a method to design protein backbone structure and sequence jointly. Our motivation is that protein sequence and its backbone structure constrain each other, and thus joint design of both can not only avoid nonfolding and misfolding but also produce more diverse candidates with desired functions. To this end, GeoPro is powered by an equivariant encoder for three-dimensional (3D) backbone structure and a protein sequence decoder guided by 3D geometry. Experimental results on two biologically significant metalloprotein datasets, including β-lactamases and myoglobins, show that our proposed GeoPro outperforms several strong baselines on most metrics. Remarkably, our method discovers novel β-lactamases and myoglobins which are not present in protein data bank (PDB) and UniProt. These proteins exhibit stable folding and active site environments reminiscent of those of natural proteins, demonstrating their excellent potential to be biologically functional.},
  eprint   = {https://arxiv.org/abs/2310.02546},
}

@Noshow{tianPreprintconversational,
  author = {Youzhi Tian and Zhou Yu and Cheng Yang and Hang Li and Lei Li},
  title  = {Conversational Contextualized Multimodal Representation Learning},
  year   = {Preprint},
}

@Unpublished{wangPreprintlearning,
  author   = {Danqing Wang and Kevin Yang and Hanlin Zhu and Xiaomeng Yang and Andrew Cohen and Lei Li and Yuandong Tian},
  title    = {Learning Personalized Story Evaluation},
  year     = {Preprint},
  abstract = {While large language models (LLMs) have shown impressive results for more objective tasks such as QA and retrieval, it remains nontrivial to evaluate their performance on open-ended text generation for reasons including (1) data contamination; (2) multi-dimensional evaluation criteria; and (3) subjectiveness stemming from reviewers' personal preferences. To address such issues, we propose to model personalization in an uncontaminated open-ended generation assessment. We create two new datasets Per-MPST and Per-DOC for personalized story evaluation, by re-purposing existing datasets with proper anonymization and new personalized labels. We further develop a personalized story evaluation model PERSE to infer reviewer preferences and provide a personalized evaluation. Specifically, given a few exemplary reviews from a particular reviewer, PERSE predicts either a detailed review or fine-grained comparison in several aspects (such as interestingness and surprise) for that reviewer on a new text input. Experimental results show that PERSE outperforms GPT-4 by 15.8% on Kendall correlation of story ratings, and by 13.7% on pairwise preference prediction accuracy. Both datasets and code will be released.},
  eprint   = {https://arxiv.org/abs/2310.03304},
}

@Unpublished{xuPreprintreciprocal,
  author = {Minkai Xu and Mingxuan Wang and Zhouhan Lin and Hao Zhou and Weinan Zhang and Lei Li},
  title  = {Reciprocal Supervised Learning Improves Neural Machine Translation},
  year   = {Preprint},
  eprint = {https://arxiv.org/abs/2012.02975},
}

@Unpublished{xuPreprintsurvey,
  author = {Jingjing Xu and Wangchunshu Zhou and Zhiyi Fu and Hao Zhou and Lei Li},
  title  = {A Survey on Green Deep Learning},
  year   = {Preprint},
  eprint = {https://arxiv.org/abs/2111.05193},
}

@Unpublished{yangPreprintstructure,
  author   = {Yuwei Yang and Siqi Ouyang and Xueyu Hu and Mingyue Zheng and Hao Zhou and Lei Li},
  title    = {Structure-Based Drug Design via 3D Molecular Generative Pre-training and Sampling},
  year     = {Preprint},
  abstract = {Structure-based drug design aims at generating high affinity ligands with prior knowledge of 3D target structures. Existing methods either use conditional generative model to learn the distribution of 3D ligands given target binding sites, or iteratively modify molecules to optimize a structure-based activity estimator. The former is highly constrained by data quantity and quality, which leaves optimization-based approaches more promising in practical scenario. However, existing optimization-based approaches choose to edit molecules in 2D space, and use molecular docking to estimate the activity using docking predicted 3D target-ligand complexes. The misalignment between the action space and the objective hinders the performance of these models, especially for those employ deep learning for acceleration. In this work, we propose MolEdit3D to combine 3D molecular generation with optimization frameworks. We develop a novel 3D graph editing model to generate molecules using fragments, and pre-train this model on abundant 3D ligands for learning target-independent properties. Then we employ a target-guided self-learning strategy to improve target-related properties using self-sampled molecules. MolEdit3D achieves state-of-the-art performance on majority of the evaluation metrics, and demonstrate strong capability of capturing both target-dependent and independent properties.},
  eprint   = {https://arxiv.org/abs/2402.14315},
}

@Unpublished{yanPreprintcross,
  author = {An Yan and Xin Wang and Jiangtao Feng and Lei Li and William Yang Wang},
  title  = {Cross-Lingual Vision-Language Navigation},
  year   = {Preprint},
  eprint = {https://arxiv.org/abs/1910.11301},
}

@Unpublished{yuanPreprintks,
  author   = {Fei Yuan and Chang Ma and Shuai Yuan and Qiushi Sun and Lei Li},
  title    = {KS-Lottery: Finding Certified Lottery Tickets for Multilingual Language Models},
  year     = {Preprint},
  abstract = {The lottery ticket hypothesis posits the existence of ``winning tickets'' within a randomly initialized neural network. Do winning tickets exist for LLMs in fine-tuning scenarios? How can we find such winning tickets? In this paper, we propose KS-Lottery, a method to identify a small subset of LLM parameters highly effective in multilingual fine-tuning. Our key idea is to use Kolmogorov-Smirnov Test to analyze the distribution shift of parameters before and after fine-tuning. We further theoretically prove that KS-Lottery can find the certified winning tickets in the embedding layer, fine-tuning on the found parameters is guaranteed to perform as well as full fine-tuning. Comparing KS-Lottery with other parameter-efficient tuning algorithms on translation tasks, the experimental results show that KS-Lottery finds a much smaller set of parameters for fine-tuning while achieving the comparable performance as full fine-tuning LLM. Surprisingly, we find that fine-tuning 18 tokens' embedding of LLaMA suffices to reach the fine-tuning translation performance.},
  eprint   = {https://arxiv.org/abs/2402.02801},
}

@Unpublished{zhaoPreprintpermute,
  author   = {Xuandong Zhao and Lei Li and Yu-Xiang Wang},
  title    = {Permute-and-Flip: An optimally robust and watermarkable decoder for LLMs},
  year     = {Preprint},
  abstract = {In this paper, we propose a new decoding method called Permute-and-Flip (PF) decoder. It enjoys robustness properties similar to the standard sampling decoder, but is provably up to 2x better in its quality-robustness tradeoff than sampling and never worse than any other decoder. We also design a cryptographic watermarking scheme analogous to Aaronson's Gumbel watermark, but naturally tailored for PF decoder. The watermarking scheme does not change the distribution to sample, while allowing arbitrarily low false positive rate and high recall whenever the generated text has high entropy. Our experiments show that the PF decoder (and its watermarked counterpart) significantly outperform(s) naive sampling (and it's Gumbel watermarked counterpart) in terms of perplexity, while retaining the same robustness (and detectability), hence making it a promising new approach for LLM decoding.},
  code     = {https://github.com/XuandongZhao/pf-decoding},
  eprint   = {https://arxiv.org/abs/2402.05864},
}

@Electronic{blog,
  language     = {Java},
  note         = {Latest version (v0.10). Brian Milch was the author for version 0.3 or earlier.},
  title        = {BLOG: A Probabilistic Modeling System for Bayesian Logic Probabilistic Programming Language},
  url          = {http://bayesianlogic.github.io/},
  entrysubtype = {software},
}

@Electronic{dynammo,
  language     = {Matlab},
  title        = {DynaMMo: a machine learning toolbox for multi-dimensional co-evolving time series},
  url          = {https://github.com/lileicc/dynammo},
  entrysubtype = {software},
}

@Electronic{lightseq,
  title        = {LightSeq: A High Performance GPU Training and Inference Library for Transformer Models},
  url          = {https://github.com/bytedance/lightseq},
  entrysubtype = {software},
}

@Electronic{swift,
  title        = {Swift: A compiler for BLOG probabilistic programming language},
  url          = {https://github.com/lileicc/swift},
  entrysubtype = {software},
}

@Electronic{volctrans,
  title        = {VolcTrans: a machine translation platform for 100+ languages},
  url          = {https://translate.volcengine.com},
  entrysubtype = {software},
}

@Comment{jabref-meta: databaseType:bibtex;}

@Comment{jabref-meta: fileDirectory:./;}

@Comment{jabref-meta: saveOrderConfig:specified;year;true;month;true;booktitle;false;}